changeset 823:17d66aec432c

add Xapian.UnicodeQuery, small cleanups
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Sat, 10 Jun 2006 15:06:08 +0200
parents 47a674c70966
children 4562cd3a4a5f
files MoinMoin/Xapian.py MoinMoin/search.py MoinMoin/support/xapwrap/document.py MoinMoin/support/xapwrap/index.py
diffstat 4 files changed, 40 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Sat Jun 10 11:28:36 2006 +0200
+++ b/MoinMoin/Xapian.py	Sat Jun 10 15:06:08 2006 +0200
@@ -2,7 +2,8 @@
 """
     MoinMoin - xapian indexing search engine
 
-    @copyright: 2006 by Thomas Waldmann
+    @copyright: 2006 MoinMoin:ThomasWaldmann,
+                2006 MoinMoin:FranzPletz
     @license: GNU GPL, see COPYING for details.
 """
 debug = True
@@ -10,6 +11,7 @@
 import sys, os, re, codecs, errno, time
 from pprint import pprint
 
+import xapian
 from MoinMoin.support.xapwrap import document as xapdoc
 from MoinMoin.support.xapwrap import index as xapidx
 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
@@ -19,6 +21,19 @@
 from MoinMoin.util import filesys, lock
 
 
+class UnicodeQuery(xapian.Query):
+    """ xapian.Query that first encodes unicode args (also inside lists/tuples) using `encoding` """
+    def __init__(self, *args, **kwargs):
+        self.encoding = kwargs.pop('encoding', config.charset) # pop: our keyword, not xapian's
+        xapian.Query.__init__(self, *map(self._encode, args), **kwargs)
+    def _encode(self, arg): # return arg with every unicode string in it encoded to self.encoding
+        if isinstance(arg, unicode):
+            return arg.encode(self.encoding)
+        if isinstance(arg, (list, tuple)): # term sequences, e.g. UnicodeQuery(Query.OP_AND, terms)
+            return map(self._encode, arg)
+        return arg # operators, Query objects and byte strings pass through unchanged
+
+
 ##############################################################################
 ### Tokenizer
 ##############################################################################
--- a/MoinMoin/search.py	Sat Jun 10 11:28:36 2006 +0200
+++ b/MoinMoin/search.py	Sat Jun 10 15:06:08 2006 +0200
@@ -15,7 +15,8 @@
 from MoinMoin.Page import Page
 
 import Xapian
-import xapian
+from xapian import Query
+from Xapian import UnicodeQuery
 
 #############################################################################
 ### query objects
@@ -186,9 +187,9 @@
 
         # prepare query for not negated terms
         if len(terms) == 1:
-            t1 = xapian.Query(terms[0])
+            t1 = Query(terms[0])
         else:
-            t1 = xapian.Query(xapian.Query.OP_AND, terms)
+            t1 = Query(Query.OP_AND, terms)
 
         # negated terms?
         if not not_terms:
@@ -197,11 +198,11 @@
         
         # yes, link not negated and negated terms' query with a AND_NOT query
         if len(not_terms) == 1:
-            t2 = xapian.Query(not_terms[0])
+            t2 = Query(not_terms[0])
         else:
-            t2 = xapian.Query(xapian.Query.OP_OR, not_terms)
+            t2 = Query(Query.OP_OR, not_terms)
 
-        return xapian.Query(xapian.Query.OP_AND_NOT, t1, t2)
+        return Query(Query.OP_AND_NOT, t1, t2)
 
 
 class OrExpression(AndExpression):
@@ -225,7 +226,7 @@
 
     def xapian_term(self):
         # XXX: negated terms managed by _moinSearch?
-        return xapian.Query(xapian.Query.OP_OR, [term.xapian_term() for term in self._subterms])
+        return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms])
 
 
 class TextSearch(BaseExpression):
@@ -297,16 +298,14 @@
             for t in terms:
                 t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))]
                 if len(t) < 2:
-                    queries.append(xapian.Query(t[0]))
+                    queries.append(UnicodeQuery(t[0]))
                 else:
-                    queries.append(xapian.Query(xapian.Query.OP_AND, t))
+                    queries.append(UnicodeQuery(Query.OP_AND, t))
 
             # titlesearch OR parsed wikiwords
-            term = xapian.Query(xapian.Query.OP_OR,
+            return Query(Query.OP_OR,
                     (self.titlesearch.xapian_term(),
-                        xapian.Query(xapian.Query.OP_AND, queries)))
-
-            return term
+                        Query(Query.OP_AND, queries)))
 
 
 class TitleSearch(BaseExpression):
@@ -374,16 +373,14 @@
             # all parsed wikiwords, AND'ed
             queries = []
             for t in terms:
-                t = ['%s%s' % (Xapian.Index.prefixMap['title'],
-                    i.encode(config.charset)) for i in list(analyzer.tokenize(t))]
+                t = ['%s%s' % (Xapian.Index.prefixMap['title'], i)
+                        for i in list(analyzer.tokenize(t))]
                 if len(t) < 2:
-                    queries.append(xapian.Query(t[0]))
+                    queries.append(UnicodeQuery(t[0]))
                 else:
-                    queries.append(xapian.Query(xapian.Query.OP_AND, t))
+                    queries.append(UnicodeQuery(Query.OP_AND, t))
 
-            term = xapian.Query(xapian.Query.OP_AND, queries)
-
-            return term
+            return Query(Query.OP_AND, queries)
 
 
 class LinkSearch(BaseExpression):
@@ -468,11 +465,8 @@
         if self.use_re:
             return None # xapian doesnt support regex search
         else:
-            term = xapian.Query(('%s%s%s' %
-                (Xapian.Index.prefixMap['linkto'],
-                    pattern[0] in string.uppercase and ':' or '',
-                    pattern)).encode(config.charset))
-            return term
+            return UnicodeQuery('%s:%s' %
+                    (Xapian.Index.prefixMap['linkto'], pattern))
 
 ############################################################################
 ### Results
--- a/MoinMoin/support/xapwrap/document.py	Sat Jun 10 11:28:36 2006 +0200
+++ b/MoinMoin/support/xapwrap/document.py	Sat Jun 10 15:06:08 2006 +0200
@@ -297,7 +297,7 @@
     else: # we have a map, so first translate it using the map (e.g. 'title' -> 'S')
         prefix = prefixMap.get(prefix, prefix.upper())
 
-    result = '%s%s%s' % (prefix, token[0] in string.uppercase and ':' or '', token)
+    result = '%s%s%s' % (prefix, prefix[0] == 'X' and ':' or '', token)
     # since return value is going into the db, it must be encoded as UTF-8
     result = result.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
     return checkKeyLen(result)
--- a/MoinMoin/support/xapwrap/index.py	Sat Jun 10 11:28:36 2006 +0200
+++ b/MoinMoin/support/xapwrap/index.py	Sat Jun 10 15:06:08 2006 +0200
@@ -489,16 +489,16 @@
         if self.db is None:
             self._setupDB()
 
-            self.qp = xapian.QueryParser()
+            #self.qp = xapian.QueryParser()
             # this is vital: these options specify no language for
             # stemming (""), disable stemming (False), and specify an
             # empty stop word object (None). we need this because by
             # default, xapian's query parser does english stemming
-            s = xapian.Stem(self.STEMMING_LANGUAGE)
-            self.qp.set_stemmer(s)
+            #s = xapian.Stem(self.STEMMING_LANGUAGE)
+            #self.qp.set_stemmer(s)
 
             # we want query terms to be ANDed together by default
-            self.qp.set_default_op(self.DEFAULT_QUERY_COMBINER_OP)
+            #self.qp.set_default_op(self.DEFAULT_QUERY_COMBINER_OP)
             self._configure()
 
             log("Index %s contains %s documents" %