view data/plugin/macro/pdf2img.py @ 625:f4e63b74b969

FormSubmit: adapt to werkzeug MultiDict
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Mon, 25 Mar 2013 17:47:30 +0100
parents 626e6880e69a
children 127da830be6c
line wrap: on
line source
# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - pdf2img converts pdf files to svg or png images and shows them rendered

    Features:
    * fetches pdf files from urls and converts them to svg or png
      The new image is stored and rendered from its cachefile
    * can also be used for an attached pdf file 
    * checks timestamp of content for updating cached image

    Requires a posix platform and the external tools pdf2svg and pstoimg

    @copyright: 2011 MoinMoin:ReimarBauer
    @license: GNU GPL, see COPYING for details.
"""

from MoinMoin import log
logging = log.getLogger(__name__)

import httplib
import os
import tempfile
import urllib2
from urlparse import urlparse

from MoinMoin import caching, config
from MoinMoin.util.SubProcess import exec_cmd
from MoinMoin.action import AttachFile, cache
from MoinMoin.Page import Page

# arena and scope handed to caching.CacheEntry when the file name of a cache
# entry is looked up
CACHE_ARENA = 'sendcache'
CACHE_SCOPE = 'wiki'

# command line templates for the external converters, filled in with a dict
# providing pdf_file, img_file and (svg only) pageno
SVG_CMD = 'pdf2svg "%(pdf_file)s" "%(img_file)s" %(pageno)s'
# the trailing blank is part of the original template and is kept on purpose
PS2TOIMG_CMD = ('pstoimg -quiet -crop tblr -density 200 -type png '
                '"%(pdf_file)s" -out "%(img_file)s" ')

def url_exists(url):
    """
    checks whether content can be read from the given url

    @param url: the url to probe
    @return: True if at least one byte could be read, False otherwise
             (including any error while opening or reading the url)
    """
    # httplib errors (e.g. a broken status line) are not IOErrors, so they
    # are listed separately to be reported as "does not exist" as well
    try:
        item = urllib2.urlopen(url)
    except (IOError, urllib2.HTTPError, ValueError, httplib.HTTPException):
        return False
    try:
        try:
            # size is passed positionally, not every file-like object returned
            # by urlopen accepts it as keyword argument
            content = item.read(1)
        except (IOError, ValueError, httplib.HTTPException):
            return False
    finally:
        # always release the connection, also if reading failed
        item.close()
    return bool(content)

def last_modified(request=None, pagename=None, attachment=None, url=""):
    """
    returns an identifier describing when the pdf content was last changed

    Without url the modification time of the attachment file is used,
    otherwise the last-modified header sent by the remote server.

    @param request: request object (only used for attachments)
    @param pagename: name of the page the attachment belongs to
    @param attachment: file name of the attachment
    @param url: url of a remote pdf file, takes precedence if given
    @return: formatted mtime of the attachment or the value of the
             last-modified header (None if the server does not send one)
    """
    if not url:
        pdf_file = os.path.join(AttachFile.getAttachDir(request, pagename), attachment).encode(config.charset)
        st = os.stat(pdf_file)
        return request.user.getFormattedDateTime(st.st_mtime)

    parse_result = urlparse(url)
    if parse_result.scheme == 'https':
        conn = httplib.HTTPSConnection(parse_result.netloc)
    else:
        conn = httplib.HTTPConnection(parse_result.netloc)

    # rebuild the request target: an empty path is not a valid request line
    # and params / query string are part of the resource identity
    target = parse_result.path or "/"
    if parse_result.params:
        target = "%s;%s" % (target, parse_result.params)
    if parse_result.query:
        target = "%s?%s" % (target, parse_result.query)

    try:
        # only the headers are needed, the body is never read
        conn.request("GET", target)
        response = conn.getresponse()
        return response.getheader('last-modified')
    finally:
        # do not leave the socket open until garbage collection
        conn.close()

def get_img_key(request, url, pageno, identifier, format):
    """
    computes the cache key of the image generated for a pdf page

    The key is derived from url, pageno, identifier and format, so a
    change of any of them leads to a different cache entry.
    """
    key_content = "%s.%s.%s.%s" % (url, pageno, identifier, format)
    return cache.key(request, itemname=format, content=key_content)

def prepare_img_cache(request, url, pageno, identifier, format):
    """
    creates a placeholder cache entry for the image and returns the file
    name of its data file
    """
    img_key = get_img_key(request, url, pageno, identifier, format)
    # a single blank is stored as placeholder content
    cache.put(request, img_key, " ", content_type=format)
    entry = caching.CacheEntry(request, CACHE_ARENA, img_key + '.data',
                               CACHE_SCOPE, do_locking=False)
    return entry._fname

def fetch_pdf_item(request, url):
    """
    fetches the pdf item and stores it as cache file

    @param request: request object
    @param url: url of the pdf file to download
    @return: file name of the cache data file holding the pdf,
             "" if the url could not be opened or read
    """
    import sys

    key = cache.key(request, itemname="PDF", content=url)
    try:
        item = urllib2.urlopen(url)
        try:
            pdf_data = item.read()
        finally:
            # release the connection also if the download breaks half way
            item.close()
    except (IOError, urllib2.HTTPError, ValueError):
        # sys.exc_info() is used instead of binding the exception in the
        # except clause, as that works on every python version
        err = sys.exc_info()[1]
        logging.info(url)
        logging.debug("%s: %s" % (url, err))
        return ""

    cache.put(request, key, pdf_data,
              content_type="application/pdf")
    data_cache = caching.CacheEntry(request, CACHE_ARENA, key+'.data', CACHE_SCOPE, do_locking=False)
    return data_cache._fname

def _shell_dq_escape(value):
    """
    escapes a string for use inside a double quoted word of a posix shell command
    """
    # the backslash has to be handled first, otherwise the backslashes added
    # for the other characters would be escaped again
    for char in ('\\', '"', '$', '`'):
        value = value.replace(char, '\\' + char)
    return value


def _convert_pdf(request, key, pdf_file, img_file, pageno, format, content_type):
    """
    runs the external converter and stores its result in the cache

    @return: True if an image was created and stored under key, else False
    """
    # the templates put the file names into double quotes, so the names must
    # not be able to end the quoting or to trigger shell expansions
    # (attachment names are chosen by wiki users)
    cmd_args = {
        "pdf_file": _shell_dq_escape(pdf_file),
        "img_file": _shell_dq_escape(img_file),
        "pageno": int(pageno),
    }
    if format == "svg":
        cmd = SVG_CMD % cmd_args
    else:
        #without -multipage we get only the first page
        cmd = PS2TOIMG_CMD % cmd_args

    data, errors, rc = exec_cmd(cmd, timeout=300)
    logging.debug("Command '%s', rc: %d, stdout: %d bytes, stderr: %s" % (cmd, rc, len(data), errors))
    if errors:
        return False

    image = open(img_file, 'rb')
    try:
        image_data = image.read()
    finally:
        image.close()
    # img_file was created with a single blank as placeholder, if nothing
    # else is in there the converter did not write an image
    if not image_data.strip():
        return False
    cache.put(request, key, image_data,
              content_type=content_type)
    return True


def macro_pdf2img(macro, attachment='', url='', width=1200, height=800, pageno=1, format=("svg", "png")):
    """
    converts pdf from urls to image files using MoinMoins cache and renders from there

    @param macro: macro instance, its request is used
    @param attachment: name of an attached pdf file, takes precedence over url
    @param url: url of a remote pdf file
    @param width: width used for rendering the image
    @param height: height used for the embedded svg object
    @param pageno: number of the pdf page to convert (used for svg only)
    @param format: "svg" or "png" (the tuple default is presumably reduced to
                   one of its items by the macro argument parser - to be confirmed)
    @return: html showing the image, a message if the source does not
             exist, "" if nothing could be rendered
    """
    from MoinMoin import wikiutil

    request = macro.request
    pagename = request.page.page_name

    content_types = {"svg": 'image/svg+xml', "png": 'image/png'}
    if format not in content_types:
        return "format: %s is not supported" % wikiutil.escape("%s" % (format, ))
    content_type = content_types[format]

    # only for posix implemented
    if os.name != 'posix':
        return ""

    # attachment and url are given by the author of the wiki page, so they
    # are escaped wherever they become part of the html output
    if attachment:
        page_name, filename = AttachFile.absoluteName(attachment, pagename)
        if not AttachFile.exists(request, page_name, filename):
            return "attachment: %s does not exists" % wikiutil.escape(attachment)
        url = AttachFile.getAttachUrl(page_name, filename, request)
        identifier = last_modified(request, page_name, filename)
    else:
        if not url_exists(url):
            return "url: %s does not exists" % wikiutil.escape(url)
        identifier = last_modified(url=url)
    logging.debug("%s: %s" % (url, identifier))

    key = get_img_key(request, url, pageno, identifier, format)
    if not cache.exists(request, key):
        pdf_file = None
        if attachment:
            if filename and filename.lower().endswith('.pdf'):
                pdf_file = os.path.join(AttachFile.getAttachDir(request, page_name), filename).encode(config.charset)
        elif url:
            #TODO: a mimetype check is needed beforehand
            pdf_file = fetch_pdf_item(request, url)
        if pdf_file:
            img_file = prepare_img_cache(request, url, pageno, identifier, format)
            if not _convert_pdf(request, key, pdf_file, img_file, pageno, format, content_type):
                # drop the placeholder entry, otherwise the failed conversion
                # would be served as image and never be tried again
                cache.remove(request, key)

    if not cache.exists(request, key):
        return ""

    if format == "svg":
        from MoinMoin import macro as _macro
        from MoinMoin.parser.text import Parser
        cache_url = "%s%s%s" % (request.getQualifiedURL(), Page(request, pagename).url(request), cache.url(request, key))
        # NOTE(review): this sets an attribute on the MoinMoin.macro module
        # itself - kept as it was, to be confirmed that this is intended
        _macro.formatter = request.html_formatter
        parser = Parser("##\n", request)
        m = _macro.Macro(parser)
        html_object = m.execute('EmbedObject', u'width=%s, height=%s, target=%s, url_mimetype=%s' % (width, height, cache_url, content_type))
    else:
        html_object = request.formatter.image(src=cache.url(request, key), alt="", width=width)

    if attachment:
        link_url = AttachFile.getAttachUrl(page_name, filename, request, addts=0, do='get')
        link_text = attachment
    else:
        link_url = link_text = url
    return '<B>%s Image from:</B> "<a href="%s">%s</a>"<hr>%s<br>' % (
        format.upper(), wikiutil.escape(link_url, 1), wikiutil.escape(link_text), html_object)