changeset 789:a1359eaee20e

xapian search: some fixes, move mimetype stuff to wikiutil, add opendocument mimetypes, add opendocument filter modules
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Mon, 01 May 2006 16:43:28 +0200
parents 4840926790f5
children 930c9f03aa13
files MoinMoin/Xapian.py MoinMoin/action/AttachFile.py MoinMoin/filter/application_vnd_oasis_opendocument.py MoinMoin/filter/application_vnd_oasis_opendocument_presentation.py MoinMoin/filter/application_vnd_oasis_opendocument_spreadsheet.py MoinMoin/filter/application_vnd_oasis_opendocument_text.py MoinMoin/script/index/build.py MoinMoin/search.py MoinMoin/support/xapwrap/index.py MoinMoin/wikiutil.py
diffstat 10 files changed, 119 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Sun Apr 30 18:12:28 2006 +0200
+++ b/MoinMoin/Xapian.py	Mon May 01 16:43:28 2006 +0200
@@ -387,9 +387,8 @@
         def mt2mn(mt): # mimetype to modulename
             return mt.replace("/", "_").replace("-","_").replace(".", "_")
 
-        import mimetypes
         request = self.request
-        mimetype, encoding = mimetypes.guess_type(filename)
+        mimetype, encoding = wikiutil.guess_type(filename)
         if mimetype is None:
             mimetype = 'application/octet-stream'
         try:
@@ -480,6 +479,7 @@
         pagename = page.page_name
         mtime = page.mtime_usecs()
         if mode == 'update':
+            # from #xapian: if you generate a special "unique id" term, you can just call database.replace_document(uid_term, doc)
             query = xapidx.RawQuery(xapdoc.makePairForWrite('title', pagename))
             docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ])
             if docs:
@@ -498,11 +498,11 @@
             pname = xapdoc.SortKey('pagename', pagename)
             attachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
             mtime = xapdoc.SortKey('mtime', mtime)
-            title = xapdoc.Keyword('title', pagename)
+            title = xapdoc.TextField('title', pagename, True) # prefixed
             links = xapdoc.Keyword('link_text', ' '.join(page.getPageLinks(request)))
             content = xapdoc.TextField('content', page.get_raw_body())
-            doc = xapdoc.Document(textFields=(content,),
-                                  keywords=(title, links,),
+            doc = xapdoc.Document(textFields=(content, title),
+                                  keywords=(links,),
                                   sortFields=(pname, attachment, mtime,),
                                  )
             #search_db_language = "english"
@@ -641,7 +641,7 @@
 def run_query(query, db):
     enquire = xapian.Enquire(db)
     parser = xapian.QueryParser()
-    query = parser.parse_query(query)
+    query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD)
     print query.get_description()
     enquire.set_query(query)
     return enquire.get_mset(0, 10)
--- a/MoinMoin/action/AttachFile.py	Sun Apr 30 18:12:28 2006 +0200
+++ b/MoinMoin/action/AttachFile.py	Mon May 01 16:43:28 2006 +0200
@@ -26,7 +26,7 @@
     @license: GNU GPL, see COPYING for details.
 """
 
-import os, mimetypes, time, zipfile
+import os, time, zipfile
 from MoinMoin import config, user, util, wikiutil, packages
 from MoinMoin.Page import Page
 from MoinMoin.util import MoinMoinNoFooter, filesys
@@ -544,15 +544,15 @@
     target = wikiutil.taintfilename(target)
 
     # set mimetype from extension, or from given mimetype
-    #type, encoding = mimetypes.guess_type(target)
+    #type, encoding = wikiutil.guess_type(target)
     #if not type:
     #    ext = None
     #    if request.form.has_key('mime'):
-    #        ext = mimetypes.guess_extension(request.form['mime'][0])
+    #        ext = wikiutil.guess_extension(request.form['mime'][0])
     #    if not ext:
-    #        type, encoding = mimetypes.guess_type(filename)
+    #        type, encoding = wikiutil.guess_type(filename)
     #        if type:
-    #            ext = mimetypes.guess_extension(type)
+    #            ext = wikiutil.guess_extension(type)
     #        else:
     #            ext = ''
     #    target = target + ext
@@ -634,7 +634,7 @@
     if not filename: return # error msg already sent in _access_file
 
     # get mimetype
-    type, enc = mimetypes.guess_type(filename)
+    type, enc = wikiutil.guess_type(filename)
     if not type:
         type = "application/octet-stream"
 
@@ -766,7 +766,7 @@
 
     request.write('<h2>' + _("Attachment '%(filename)s'") % {'filename': filename} + '</h2>')
 
-    type, enc = mimetypes.guess_type(filename)
+    type, enc = wikiutil.guess_type(filename)
     if type:
         if type[:5] == 'image':
             timestamp = htdocs_access(request) and "?%s" % time.time() or ''
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument.py	Mon May 01 16:43:28 2006 +0200
@@ -0,0 +1,25 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.0 *.od? Filter (OpenDocument)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import re, zipfile
+
+rx_stripxml = re.compile("<[^>]*?>", re.DOTALL|re.MULTILINE)
+
+def execute(indexobj, filename): # returns the document text as unicode, u"" on error
+    try:
+        zf = zipfile.ZipFile(filename, "r") # BadZipfile/IOError for broken or unreadable files
+        data = zf.read("content.xml") # KeyError if the archive has no content.xml
+        zf.close()
+        data = " ".join(rx_stripxml.sub(" ", data).split()) # drop tags, collapse whitespace
+    except (RuntimeError, zipfile.BadZipfile, IOError, KeyError), err:
+        indexobj.request.log(str(err)) # log the problem and fall back to empty text
+        data = ""
+    return data.decode('utf-8')
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument_presentation.py	Mon May 01 16:43:28 2006 +0200
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.x Presenter Filter (OpenDocument Presentation)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.filter.application_vnd_oasis_opendocument import execute as odfilter
+
+def execute(indexobj, filename): # presentations: delegate to the generic OpenDocument filter
+    return odfilter(indexobj, filename)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument_spreadsheet.py	Mon May 01 16:43:28 2006 +0200
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.x Calc Filter (OpenDocument Spreadsheet)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.filter.application_vnd_oasis_opendocument import execute as odfilter
+
+def execute(indexobj, filename): # spreadsheets: delegate to the generic OpenDocument filter
+    return odfilter(indexobj, filename)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument_text.py	Mon May 01 16:43:28 2006 +0200
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.x Writer Filter (OpenDocument Text)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.filter.application_vnd_oasis_opendocument import execute as odfilter
+
+def execute(indexobj, filename): # text documents: delegate to the generic OpenDocument filter
+    return odfilter(indexobj, filename)
+
--- a/MoinMoin/script/index/build.py	Sun Apr 30 18:12:28 2006 +0200
+++ b/MoinMoin/script/index/build.py	Mon May 01 16:43:28 2006 +0200
@@ -22,10 +22,10 @@
             "--files", metavar="FILES", dest="file_list",
             help="filename of file list, e.g. files.lst (one file per line)"
         )
-        #self.parser.add_option(
-        #    "--update", action="store_true", dest="update",
-        #    help="when given, update an existing index"
-        #)
+        self.parser.add_option(
+            "--mode", metavar="MODE", dest="mode",
+            help="either add (unconditionally add to index) or update (update an existing index)"
+        )
     
     def mainloop(self):
         self.init_request()
@@ -40,6 +40,6 @@
     """ Xapian index build script class """
 
     def command(self):
-        Index(self.request).indexPages(self.files) # , self.options.update)
+        Index(self.request).indexPages(self.files, self.options.mode)
         #Index(self.request).test(self.request)
 
--- a/MoinMoin/search.py	Sun Apr 30 18:12:28 2006 +0200
+++ b/MoinMoin/search.py	Mon May 01 16:43:28 2006 +0200
@@ -252,13 +252,13 @@
         if self.use_re:
             return '' # xapian can't do regex search
         else:
-            terms = pattern.lower().split()
+            terms = pattern.split()
             terms = [list(Xapian.tokenizer(t)) for t in terms]
             term = []
             for t in terms:
                 term.append(" AND ".join(t))
-        return "(%s OR %s)" % (self.titlesearch.xapian_term(), " AND ".join(term))
-
+            term = "(%s OR %s)" % (self.titlesearch.xapian_term(), " AND ".join(term))
+            return "%s %s" % (self.negated and "NOT" or "", term)
 
 class TitleSearch(BaseExpression):
     """ Term searches in pattern in page title only """
@@ -1169,7 +1169,8 @@
                 query = xapwrap.index.ParsedQuery(query)
                 hits = index.search(query)
                 self.request.log("xapianSearch: finds: %r" % hits)
-                pages = [(hit['values']['pagename'], hit['values']['attachment']) for hit in hits]
+                pages = [(hit['values']['pagename'].decode(config.charset),
+                          hit['values']['attachment'].decode(config.charset)) for hit in hits]
                 self.request.log("xapianSearch: finds pages: %r" % pages)
             except index.LockedException:
                 pass
--- a/MoinMoin/support/xapwrap/index.py	Sun Apr 30 18:12:28 2006 +0200
+++ b/MoinMoin/support/xapwrap/index.py	Mon May 01 16:43:28 2006 +0200
@@ -525,7 +525,7 @@
                     V = v.upper()
                 else:
                     V = k.upper()
-                self.qp.set_prefix(k, V)
+                self.qp.add_prefix(k, V)
 
     def configure(self, prefixMap = None, indexValueMap = None):
         if prefixMap is not None:
--- a/MoinMoin/wikiutil.py	Sun Apr 30 18:12:28 2006 +0200
+++ b/MoinMoin/wikiutil.py	Mon May 01 16:43:28 2006 +0200
@@ -24,6 +24,31 @@
 CHILD_PREFIX_LEN = len(CHILD_PREFIX)
 
 #############################################################################
+### mimetype support
+#############################################################################
+import mimetypes
+guess_type = mimetypes.guess_type # re-exported so callers use wikiutil.guess_type
+guess_extension = mimetypes.guess_extension # re-exported, same reason
+
+_our_types = {
+ # extension -> mimetype for OpenDocument files (OpenOffice 2.x & others)
+ '.odt': 'application/vnd.oasis.opendocument.text',
+ '.ods': 'application/vnd.oasis.opendocument.spreadsheet',
+ '.odp': 'application/vnd.oasis.opendocument.presentation',
+ '.odg': 'application/vnd.oasis.opendocument.graphics',
+ '.odc': 'application/vnd.oasis.opendocument.chart',
+ '.odf': 'application/vnd.oasis.opendocument.formula',
+ '.odb': 'application/vnd.oasis.opendocument.database',
+ '.odi': 'application/vnd.oasis.opendocument.image',
+ '.odm': 'application/vnd.oasis.opendocument.text-master',
+ '.ott': 'application/vnd.oasis.opendocument.text-template',
+ '.ots': 'application/vnd.oasis.opendocument.spreadsheet-template',
+ '.otp': 'application/vnd.oasis.opendocument.presentation-template',
+ '.otg': 'application/vnd.oasis.opendocument.graphics-template',
+}
+[mimetypes.add_type(mimetype, ext, True) for ext, mimetype in _our_types.items()] # strict=True
+
+#############################################################################
 ### Getting data from user/Sending data to user
 #############################################################################