changeset 393:1ed44f76458c

merged
author Michael Mayorov <marchael@kb.csu.ru>
date Wed, 03 Aug 2011 09:48:52 +0000
parents 42f7af473347 (current diff) 41a22e9dae75 (diff)
children 0ab852761ff2
files setup.py
diffstat 4 files changed, 135 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/converter/opendocument_in.py	Wed Aug 03 09:38:01 2011 +0000
+++ b/MoinMoin/converter/opendocument_in.py	Wed Aug 03 09:48:52 2011 +0000
@@ -8,12 +8,14 @@
 """
 
 
-import re, zipfile
+from __future__ import absolute_import, division
+
+import zipfile
 
 from MoinMoin import log
 logging = log.getLogger(__name__)
 
-rx_stripxml = re.compile("<[^>]*?>", re.DOTALL|re.MULTILINE)
+from .xml_in import strip_xml
 
 
 class OpenDocumentIndexingConverter(object):
@@ -25,8 +27,9 @@
         zf = zipfile.ZipFile(rev, "r")  # rev is file-like
         try:
             data = zf.read("content.xml")
-            data = ' '.join(rx_stripxml.sub(" ", data).split())
-            return data.decode('utf-8')
+            text = data.decode('utf-8')
+            text = strip_xml(text)
+            return text
         finally:
             zf.close()
 
@@ -53,3 +56,18 @@
 for t in opendocument_types:
     default_registry.register(OpenDocumentIndexingConverter._factory, Type(t), type_text_plain)
 
+
+# use the same converter for the old *.sx? (pre-opendocument) openoffice documents:
+OpenOfficeIndexingConverter = OpenDocumentIndexingConverter
+
+openoffice_types = """\
+application/vnd.sun.xml.calc
+application/vnd.sun.xml.draw
+application/vnd.sun.xml.impress
+application/vnd.sun.xml.math
+application/vnd.sun.xml.writer
+application/vnd.sun.xml.writer.global""".split()
+
+for t in openoffice_types:
+    default_registry.register(OpenOfficeIndexingConverter._factory, Type(t), type_text_plain)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/converter/pdf_in.py	Wed Aug 03 09:48:52 2011 +0000
@@ -0,0 +1,69 @@
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - PDF input converter
+"""
+
+
+from __future__ import absolute_import, division
+
+from MoinMoin import log
+logging = log.getLogger(__name__)
+
+from pdfminer.pdfparser import PDFDocument, PDFParser
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
+from pdfminer.pdfdevice import PDFDevice
+from pdfminer.converter import TextConverter
+from pdfminer.cmapdb import CMapDB
+from pdfminer.layout import LAParams
+
+
+LAPARAMS = LAParams(
+    # each value is specified not as an absolute length, but in proportion to the size of the character in question.
+    # two text chunks whose distance is smaller than the char_margin are considered
+    # continuous and get grouped into one.
+    char_margin=0.3,
+    # it may be required to insert blank characters (spaces) as necessary if the distance
+    # between two words is greater than the word_margin, as a blank between words might
+    # not be represented as a space, but indicated by the positioning of each word.
+    word_margin=0.2,
+    # two lines whose distance is smaller than the line_margin are grouped into a text box,
+    # which is a rectangular area that contains a "cluster" of text portions.
+    line_margin=0.3,
+)
+
+
+class UnicodeConverter(TextConverter):
+    # as the result, we want a unicode object
+    # TextConverter only provides encoded output into a file-like object
+    def __init__(self, rsrcmgr, pageno=1, laparams=None, showpageno=False):
+        TextConverter.__init__(self, rsrcmgr, None, codec=None, pageno=pageno, laparams=laparams,
+                               showpageno=showpageno)
+        self.__text = []
+
+    def write_text(self, text):
+        self.__text.append(text)
+
+    def read_result(self):
+        return u''.join(self.__text)
+
+
+class PDFIndexingConverter(object):
+    @classmethod
+    def _factory(cls, input, output, **kw):
+        return cls()
+
+    def __call__(self, rev, contenttype=None, arguments=None):
+        rsrcmgr = PDFResourceManager()
+        device = UnicodeConverter(rsrcmgr, laparams=LAPARAMS)
+        try:
+            process_pdf(rsrcmgr, device, rev)
+            return device.read_result()
+        finally:
+            device.close()
+
+
+from . import default_registry
+from MoinMoin.util.mime import Type, type_text_plain
+default_registry.register(PDFIndexingConverter._factory, Type('application/pdf'), type_text_plain)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/converter/xml_in.py	Wed Aug 03 09:48:52 2011 +0000
@@ -0,0 +1,43 @@
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - Generic XML input converter
+"""
+
+from __future__ import absolute_import, division
+
+import re
+
+from MoinMoin import log
+logging = log.getLogger(__name__)
+
+from ._util import decode_data
+
+RX_STRIPXML = re.compile(u"<[^>]*?>", re.U|re.DOTALL|re.MULTILINE)
+
+def strip_xml(text):
+    text = RX_STRIPXML.sub(u" ", text)
+    text = ' '.join(text.split())
+    return text
+
+
+class XMLIndexingConverter(object):
+    """
+    We try to generically extract contents from XML documents by just throwing
+    away all XML tags. This is for indexing, so this might be good enough.
+    """
+    @classmethod
+    def _factory(cls, input, output, **kw):
+        return cls()
+
+    def __call__(self, rev, contenttype=None, arguments=None):
+        text = decode_data(rev, contenttype)
+        text = strip_xml(text)
+        return text
+
+
+from . import default_registry
+from MoinMoin.util.mime import Type, type_text_plain
+default_registry.register(XMLIndexingConverter._factory, Type('text/xml'), type_text_plain)
+
--- a/setup.py	Wed Aug 03 09:38:01 2011 +0000
+++ b/setup.py	Wed Aug 03 09:48:52 2011 +0000
@@ -90,6 +90,7 @@
         'py==1.3.4', # py.test 1.3.4 is needed by unit tests
         'whoosh>=2.0.0', # needed for indexed search
         'sphinx', # needed to build the docs
+        'pdfminer', # pdf -> text/plain conversion
         'XStatic>=0.0.2',
         'XStatic-CKEditor>=3.6.1.2',
         'XStatic-jQuery>=1.6.1.4',