changeset 454:e099072bba24

changed filters to use mimetypes imported from: moin--main--1.5--patch-458
author Thomas Waldmann <tw@waldmann-edv.de>
date Sat, 25 Feb 2006 08:11:57 +0000
parents 493972205fac
children e5609b8b7647
files ChangeLog MoinMoin/filter/application_octet_stream.py MoinMoin/filter/application_vnd_sun_xml_writer.py MoinMoin/filter/audio.py MoinMoin/filter/binary.py MoinMoin/filter/image.py MoinMoin/filter/sxw.py MoinMoin/filter/text.py MoinMoin/filter/txt.py MoinMoin/filter/video.py MoinMoin/lupy.py docs/CHANGES
diffstat 12 files changed, 188 insertions(+), 99 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Tue Feb 21 21:41:21 2006 +0000
+++ b/ChangeLog	Sat Feb 25 08:11:57 2006 +0000
@@ -2,6 +2,41 @@
 # arch-tag: automatic-ChangeLog--arch@arch.thinkmo.de--2003-archives/moin--main--1.5
 #
 
+2006-02-25 09:11:57 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-458
+
+    Summary:
+      changed filters to use mimetypes
+    Revision:
+      moin--main--1.5--patch-458
+
+    changed filters to use mimetypes
+    
+
+    new files:
+     MoinMoin/filter/.arch-ids/audio.py.id
+     MoinMoin/filter/.arch-ids/image.py.id
+     MoinMoin/filter/.arch-ids/video.py.id MoinMoin/filter/audio.py
+     MoinMoin/filter/image.py MoinMoin/filter/video.py
+
+    modified files:
+     ChangeLog MoinMoin/filter/text.py MoinMoin/lupy.py
+     docs/CHANGES
+
+    renamed files:
+     MoinMoin/filter/.arch-ids/binary.py.id
+       ==> MoinMoin/filter/.arch-ids/application_octet_stream.py.id
+     MoinMoin/filter/.arch-ids/sxw.py.id
+       ==> MoinMoin/filter/.arch-ids/application_vnd_sun_xml_writer.py.id
+     MoinMoin/filter/.arch-ids/txt.py.id
+       ==> MoinMoin/filter/.arch-ids/text.py.id
+     MoinMoin/filter/binary.py
+       ==> MoinMoin/filter/application_octet_stream.py
+     MoinMoin/filter/sxw.py
+       ==> MoinMoin/filter/application_vnd_sun_xml_writer.py
+     MoinMoin/filter/txt.py
+       ==> MoinMoin/filter/text.py
+
+
 2006-02-21 22:41:21 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-457
 
     Summary:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_octet_stream.py	Sat Feb 25 08:11:57 2006 +0000
@@ -0,0 +1,35 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - binary file Filter
+    
+    Processes any binary file and extracts ASCII content from it.
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import os, string
+
+# we don't want or are not able to process those:
+blacklist = ('.exe', '.com', '.cab',
+             '.iso',
+             '.zip', '.gz', '.tgz', '.bz2', '.tb2', )
+
+# identity translate table, i.e. a string of all 256 characters:
+norm = string.maketrans('', '')
+# norm with string.letters and string.digits deleted (= the non-alphanumerics):
+non_alnum = string.translate(norm, norm, string.letters+string.digits) 
+# translate table that replaces all non-alphanumeric by blanks:
+trans_nontext = string.maketrans(non_alnum, ' '*len(non_alnum))
+
+def execute(indexobj, filename):
+    fileext = os.path.splitext(filename)[1].lower() # blacklist entries are lowercase
+    if fileext in blacklist:
+        return u'' # blacklisted extension: do not even read the file
+    f = file(filename, "rb")
+    data = f.read()
+    f.close()
+    data = data.translate(trans_nontext) # non-alphanumerics become blanks
+    data = ' '.join(data.split()) # remove lots of blanks
+    return data.decode('ascii')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_sun_xml_writer.py	Sat Feb 25 08:11:57 2006 +0000
@@ -0,0 +1,24 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice SXW Filter
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import re, sys
+import zipfile
+
+rx_stripxml = re.compile("<[^>]*?>", re.DOTALL|re.MULTILINE) # matches one <...> tag
+
+def execute(indexobj, filename):
+        try:
+            zf = zipfile.ZipFile(filename, "r")
+            data = zf.read("content.xml")
+            zf.close()
+            data = " ".join(rx_stripxml.sub(" ", data).split()) # strip tags, collapse blanks
+        except (zipfile.BadZipfile, KeyError, RuntimeError), err: # no zip / no content.xml
+            indexobj.request.log(str(err))
+            data = ""
+        return data.decode('utf-8')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/audio.py	Sat Feb 25 08:11:57 2006 +0000
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - audio/* file Filter
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+def execute(indexobj, filename):
+    """ Audio data filtering not implemented yet.
+    
+        TODO: maybe extract title, artist, etc. from mp3 and ogg
+    """
+    return u"" # placeholder: always yields empty unicode content
+
--- a/MoinMoin/filter/binary.py	Tue Feb 21 21:41:21 2006 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-"""
-    MoinMoin - binary file Filter
-    
-    Processes any binary file and extracts ASCII content from it.
-
-    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
-    @license: GNU GPL, see COPYING for details.
-"""
-
-import os, string
-
-# we don't want or are not able to process those:
-blacklist = ('.exe', '.com', '.cab',
-             '.iso',
-             '.zip', '.gz', '.tgz', '.bz2', '.tb2', )
-
-# builds a list of all characters:
-norm = string.maketrans('', '')
-# builds a list of all non-alphanumeric characters:
-non_alnum = string.translate(norm, norm, string.letters+string.digits) 
-# translate table that replaces all non-alphanumeric by blanks:
-trans_nontext = string.maketrans(non_alnum, ' '*len(non_alnum))
-
-def execute(indexobj, filename):
-    fileext = os.path.splitext(filename)[1]
-    if fileext in blacklist:
-        return u''
-    f = file(filename, "rb")
-    data = f.read()
-    f.close()
-    data = data.translate(trans_nontext)
-    data = ' '.join(data.split()) # remove lots of blanks
-    return data.decode('ascii')
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/image.py	Sat Feb 25 08:11:57 2006 +0000
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - image/* file Filter
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+def execute(indexobj, filename):
+    """ Image data filtering not implemented yet.
+    
+        TODO: maybe extract comments or time stamps from jpegs and png.
+    """
+    return u"" # placeholder: always yields empty unicode content
+
--- a/MoinMoin/filter/sxw.py	Tue Feb 21 21:41:21 2006 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-"""
-    MoinMoin - OpenOffice SXW Filter
-
-    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
-    @license: GNU GPL, see COPYING for details.
-"""
-
-import re, sys
-import zipfile
-
-rx_stripxml = re.compile("<[^>]*?>", re.DOTALL|re.MULTILINE)
-
-def execute(indexobj, filename):
-        try:
-            zf = zipfile.ZipFile(filename, "r")
-            data = zf.read("content.xml")
-            zf.close()
-            data = " ".join(rx_stripxml.sub(" ", data).split())
-        except RuntimeError, err:
-            indexobj.request.log(str(err))
-            data = ""
-        return data.decode('utf-8')
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/text.py	Sat Feb 25 08:11:57 2006 +0000
@@ -0,0 +1,28 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - text/* file Filter
+
+    We try to support more than ASCII here.
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import codecs
+
+def execute(indexobj, filename):
+    for enc in ('utf-8', 'iso-8859-15', 'iso-8859-1', ): # tried in this order
+        try:
+            f = codecs.open(filename, "r", enc)
+            data = f.read()
+            f.close()
+            return data # first encoding that decodes without error wins
+        except UnicodeError, err:
+            pass # decoding failed, try the next encoding
+    f = file(filename, "r")
+    data = f.read()
+    f.close()
+    data = data.decode('ascii', 'replace') # last resort: non-ASCII bytes get replaced
+    return data
+    
+
--- a/MoinMoin/filter/txt.py	Tue Feb 21 21:41:21 2006 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,28 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-"""
-    MoinMoin - plain text file Filter
-
-    We try to support more than ASCII here.
-
-    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
-    @license: GNU GPL, see COPYING for details.
-"""
-
-import codecs
-
-def execute(indexobj, filename):
-    for enc in ('utf-8', 'iso-8859-15', 'latin-1', ):
-        try:
-            f = codecs.open(filename, "r", enc)
-            data = f.read()
-            f.close()
-            return data
-        except UnicodeError, err:
-            pass
-    f = file(filename, "r")
-    data = f.read()
-    f.close()
-    data = data.decode('ascii', 'replace')
-    return data
-    
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/video.py	Sat Feb 25 08:11:57 2006 +0000
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - video/* file Filter
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+def execute(indexobj, filename):
+    """ Video data filtering not implemented yet.
+    
+        TODO: maybe extract some meta information from some popular video formats.
+    """
+    return u"" # placeholder: always yields empty unicode content
+
--- a/MoinMoin/lupy.py	Tue Feb 21 21:41:21 2006 +0000
+++ b/MoinMoin/lupy.py	Sat Feb 25 08:11:57 2006 +0000
@@ -381,21 +381,30 @@
    
     def contentfilter(self, filename):
         """ Get a filter for content of filename and return unicode content. """
-        import wikiutil
+        import mimetypes
+        from MoinMoin import wikiutil
         request = self.request
-        fileext = os.path.splitext(filename)[1]
-        if fileext:
-            fileext = fileext[1:].lower() # skip the leading dot
-        else:
-            fileext = 'binary'
+        mimetype, encoding = mimetypes.guess_type(filename)
+        if mimetype is None: # type could not be guessed from the filename
+            mimetype = 'application/octet-stream'
+        def mt2mn(mt): # mimetype to modulename
+            return mt.replace("/", "_").replace("-","_").replace(".", "_")
         try:
-            execute = wikiutil.importPlugin(request.cfg, 'filter', fileext)
+            _filter = mt2mn(mimetype) # 1st try: full type, e.g. text_plain
+            execute = wikiutil.importPlugin(request.cfg, 'filter', _filter)
         except wikiutil.PluginMissingError:
             try:
-                execute = wikiutil.importPlugin(request.cfg, 'filter', 'binary')
+                _filter = mt2mn(mimetype.split("/", 1)[0]) # 2nd try: major type, e.g. text
+                execute = wikiutil.importPlugin(request.cfg, 'filter', _filter)
             except wikiutil.PluginMissingError:
-                raise ImportError("Cannot load filter %s" % 'binary')
-        return execute(self, filename)
+                try:
+                    _filter = mt2mn('application/octet-stream') # last resort
+                    execute = wikiutil.importPlugin(request.cfg, 'filter', _filter)
+                except wikiutil.PluginMissingError:
+                    raise ImportError("Cannot load filter %s" % _filter)
+        data = execute(self, filename)
+        request.log("Filter %s returned %d characters for file %s" % (_filter, len(data), filename))
+        return data
    
     def _index_page(self, writer, page):
         """ Assumes that the write lock is acquired """
--- a/docs/CHANGES	Tue Feb 21 21:41:21 2006 +0000
+++ b/docs/CHANGES	Sat Feb 25 08:11:57 2006 +0000
@@ -38,8 +38,8 @@
       Title search will also search attachment filenames.
       Full text search will also search attachment contents.
     * indexing filter plugins, see MoinMoin:FiltersForIndexing
-      There are not many filters yet, so most is handled by the "binary"
-      filter, a very simple ASCII-only filter.
+      There are not many filters yet, so most is handled by the
+      "application/octet-stream" filter, a very simple ASCII-only filter.
       Feel free to contribute more filter plugins!
     * We check cfg.superuser to be a list of user names (as documented) and
       deny superuser access if it is not. This avoids security issues by