changeset 438:8bb545d58e6f

lupy: filter plugins for attachment search imported from: moin--main--1.5--patch-442
author Thomas Waldmann <tw@waldmann-edv.de>
date Mon, 13 Feb 2006 12:09:21 +0000
parents f96c2c8d7c91
children 05175e00f75a
files ChangeLog MoinMoin/filter/__init__.py MoinMoin/filter/binary.py MoinMoin/filter/sxw.py MoinMoin/filter/txt.py MoinMoin/lupy.py docs/CHANGES
diffstat 7 files changed, 153 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Feb 12 20:27:04 2006 +0000
+++ b/ChangeLog	Mon Feb 13 12:09:21 2006 +0000
@@ -2,6 +2,32 @@
 # arch-tag: automatic-ChangeLog--arch@arch.thinkmo.de--2003-archives/moin--main--1.5
 #
 
+2006-02-13 13:09:21 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-442
+
+    Summary:
+      lupy: filter plugins for attachment search
+    Revision:
+      moin--main--1.5--patch-442
+
+    lupy: filter plugins for attachment search
+    
+
+    new files:
+     MoinMoin/filter/.arch-ids/=id
+     MoinMoin/filter/.arch-ids/__init__.py.id
+     MoinMoin/filter/.arch-ids/binary.py.id
+     MoinMoin/filter/.arch-ids/sxw.py.id
+     MoinMoin/filter/.arch-ids/txt.py.id
+     MoinMoin/filter/__init__.py MoinMoin/filter/binary.py
+     MoinMoin/filter/sxw.py MoinMoin/filter/txt.py
+
+    modified files:
+     ChangeLog MoinMoin/lupy.py docs/CHANGES
+
+    new directories:
+     MoinMoin/filter MoinMoin/filter/.arch-ids
+
+
 2006-02-12 21:27:04 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-441
 
     Summary:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/__init__.py	Mon Feb 13 12:09:21 2006 +0000
@@ -0,0 +1,13 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - Filter Package
+
+    @copyright: 2006 by Thomas Waldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.util import pysupport
+
+filters = pysupport.getPackageModules(__file__) # presumably the filter modules found in this package directory - TODO confirm against pysupport
+modules = filters # second name bound to the same object; NOTE(review): presumably the name the plugin loader looks up - confirm
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/binary.py	Mon Feb 13 12:09:21 2006 +0000
@@ -0,0 +1,48 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - binary file Filter
+    
+    Processes any binary file and extracts ASCII content from it.
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import os, string
+
+# we don't want or are not able to process those (compared lowercased):
+blacklist = ('.exe', '.com', '.cab',
+             '.iso',
+             '.zip', '.gz', '.tgz', '.bz2', '.tb2', )
+
+# identity translation table (all 256 byte values in order):
+norm = string.maketrans('', '')
+# all bytes that are not ASCII letters or digits; string.letters is
+# locale-dependent and could let non-ASCII bytes through, which would make
+# the final ascii decode in execute() fail:
+non_alnum = string.translate(norm, norm, string.ascii_letters+string.digits)
+# translate table that replaces all non-alphanumeric by blanks:
+trans_nontext = string.maketrans(non_alnum, ' '*len(non_alnum))
+
+def execute(indexobj, filename):
+    """ Extract the ASCII alphanumeric words from any file.
+
+    @param indexobj: the index object calling this filter (unused here)
+    @param filename: path of the file to read
+    @rtype: unicode
+    @return: blank-separated alphanumeric runs, u'' for blacklisted
+             file extensions
+    """
+    # lowercase so that e.g. FOO.ZIP or Setup.EXE are blacklisted, too
+    fileext = os.path.splitext(filename)[1].lower()
+    if fileext in blacklist:
+        return u''
+    f = file(filename, "rb")
+    try:
+        data = f.read()
+    finally:
+        f.close() # do not leak the file handle if read() fails
+    data = data.translate(trans_nontext)
+    data = ' '.join(data.split()) # remove lots of blanks
+    return data.decode('ascii')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/sxw.py	Mon Feb 13 12:09:21 2006 +0000
@@ -0,0 +1,38 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice SXW Filter
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import re, sys
+import zipfile
+
+# matches anything in angle brackets, i.e. the XML tags to strip:
+rx_stripxml = re.compile("<[^>]*?>", re.DOTALL|re.MULTILINE)
+
+def execute(indexobj, filename):
+    """ Extract the text content of an OpenOffice SXW file.
+
+    @param indexobj: the index object calling this filter, its request is
+                     used to log errors
+    @param filename: path of the SXW file (a zip archive)
+    @rtype: unicode
+    @return: content.xml with all tags replaced by blanks and whitespace
+             collapsed, u'' if the archive could not be processed
+    """
+    try:
+        zf = zipfile.ZipFile(filename, "r")
+        try:
+            data = zf.read("content.xml")
+        finally:
+            zf.close() # do not leak the archive handle if read() fails
+        data = " ".join(rx_stripxml.sub(" ", data).split())
+    except (RuntimeError, zipfile.BadZipfile, KeyError), err:
+        # BadZipfile: not a valid zip archive, KeyError: no content.xml in
+        # the archive - log it and index nothing instead of aborting
+        indexobj.request.log(str(err))
+        data = ""
+    return data.decode('utf-8')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/txt.py	Mon Feb 13 12:09:21 2006 +0000
@@ -0,0 +1,38 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - plain text file Filter
+
+    We try to support more than ASCII here.
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import codecs
+
+def execute(indexobj, filename):
+    """ Return the content of a text file as unicode.
+
+    The raw bytes are read once and decoded with the first encoding of the
+    list below that works.
+
+    @param indexobj: the index object calling this filter (unused here)
+    @param filename: path of the text file
+    @rtype: unicode
+    @return: the decoded file content
+    """
+    f = file(filename, "rb")
+    try:
+        data = f.read()
+    finally:
+        f.close() # also closed when read() fails
+    # strict utf-8 first; iso-8859-15 defines every byte value, so the
+    # entries after it and the final fallback are only a safety net
+    for enc in ('utf-8', 'iso-8859-15', 'latin-1', ):
+        try:
+            return data.decode(enc)
+        except UnicodeError:
+            pass
+    return data.decode('ascii', 'replace')
+    
+
--- a/MoinMoin/lupy.py	Sun Feb 12 20:27:04 2006 +0000
+++ b/MoinMoin/lupy.py	Mon Feb 13 12:09:21 2006 +0000
@@ -360,12 +360,11 @@
     def _do_queued_updates(self, request, lock=None, amount=5):
         """ Assumes that the write lock is acquired """
         try:
-            self.translate_table = self.make_transtable()
-            pages = self.queue.pages() # [:amount]
+            pages = self.queue.pages()[:amount]
             for name in pages:
                 p = Page(request, name)
                 self._update_page(p)
-            self.queue.remove(pages)
+                self.queue.remove([name])
         finally:
             if lock:
                 lock.release()
@@ -380,12 +379,23 @@
             self._index_page(writer, page)
             writer.close()
    
-    def make_transtable(self):
-        import string
-        norm = string.maketrans('', '') # builds a list of all characters
-        non_alnum = string.translate(norm, norm, string.letters+string.digits) 
-        trans_nontext = string.maketrans(non_alnum, ' '*len(non_alnum))
-        return trans_nontext
+    def contentfilter(self, filename):
+        """ Run the filter plugin for filename's extension, return its result.
+        Falls back to the 'binary' filter if filename has no extension or no
+        plugin of that name exists; raises ImportError if 'binary' is missing.
+        """
+        import wikiutil
+        cfg = self.request.cfg
+        # plugin name = lowercased extension without dot; "foo." has none
+        name = os.path.splitext(filename)[1][1:].lower() or 'binary'
+        try:
+            execute = wikiutil.importPlugin(cfg, 'filter', name)
+        except wikiutil.PluginMissingError:
+            try:
+                execute = wikiutil.importPlugin(cfg, 'filter', 'binary')
+            except wikiutil.PluginMissingError:
+                raise ImportError("Cannot load filter %s" % 'binary')
+        return execute(self, filename)
    
     def _index_page(self, writer, page):
         """ Assumes that the write lock is acquired """
@@ -406,17 +416,11 @@
         writer.addDocument(d)
         
         from MoinMoin.action import AttachFile
-        def filecontent(fn):
-            f = file(fn, "rb")
-            data = f.read()
-            f.close()
-            data = data.translate(self.translate_table)
-            data = ' '.join(data.split()) # remove lots of blanks
-            return data.decode('utf-8')
-        
+
         attachments = AttachFile._get_files(request, pagename)
         for att in attachments:
-            att_content = filecontent(AttachFile.getFilename(request, pagename, att))
+            filename = AttachFile.getFilename(request, pagename, att)
+            att_content = self.contentfilter(filename)
             d = document.Document()
             d.add(document.Keyword('pagename', pagename))
             d.add(document.Keyword('attachment', att)) # this is an attachment, store its filename
@@ -444,7 +448,6 @@
             writer.mergeFactor = 50
             pages = request.rootpage.getPageList(user='', exists=1)
             request.log("indexing all (%d) pages..." % len(pages))
-            self.translate_table = self.make_transtable()
             for pagename in pages:
                 p = Page(request, pagename)
                 # code does NOT seem to assume request.page being set any more
--- a/docs/CHANGES	Sun Feb 12 20:27:04 2006 +0000
+++ b/docs/CHANGES	Mon Feb 13 12:09:21 2006 +0000
@@ -36,9 +36,11 @@
   New features:
     * attachment search using lupy (lupy_search = 1 in your config)
       Title search will also search attachment filenames.
-      Full text search will also search attachment contents (filtered through
-      some very simple ASCII-only filter when the index is built - sorry, no
-      umlauts).
+      Full text search will also search attachment contents.
+    * indexing filter plugins, see MoinMoin:FiltersForIndexing
+      There are not many filters yet, so most files are handled by the "binary"
+      filter, a very simple ASCII-only filter.
+      Feel free to contribute more filter plugins!
 
   Bugfixes:
     * cookie_lifetime didn't work comfortable for low values. The cookie was