changeset 852:0ccd65be5656

some more code and thinking on matching stemmed words
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Sun, 18 Jun 2006 01:06:50 +0200
parents 4d1bc2e51184
children 210f3adb44de
files MoinMoin/Xapian.py MoinMoin/search.py docs/CHANGES.fpletz
diffstat 3 files changed, 33 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Sat Jun 17 20:53:12 2006 +0200
+++ b/MoinMoin/Xapian.py	Sun Jun 18 01:06:50 2006 +0200
@@ -36,7 +36,7 @@
             if isinstance(term, unicode):
                 term = term.encode(self.encoding)
             elif isinstance(term, list) or isinstance(term, tuple):
-                term = map(lambda t: t.encode(self.encoding), term)
+                term = [t.encode(self.encoding) for t in term]
             nargs.append(term)
 
         xapian.Query.__init__(self, *nargs, **kwargs)
--- a/MoinMoin/search.py	Sat Jun 17 20:53:12 2006 +0200
+++ b/MoinMoin/search.py	Sun Jun 18 01:06:50 2006 +0200
@@ -76,7 +76,7 @@
         """
         return ''
 
-    def _build_re(self, pattern, use_re=False, case=False):
+    def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
         """ Make a regular expression out of a text pattern """
         flags = case and re.U or (re.I | re.U)
         if use_re:
@@ -90,8 +90,15 @@
                 self.pattern = pattern
         else:
             pattern = re.escape(pattern)
-            self.search_re = re.compile(r'%s[%s]*' % (pattern,
-                config.chars_lower), flags)
+            if stemmed:
+                # XXX: works, but pretty CPU-intensive (obviously...)
+                self.search_re = re.compile(r'(?=^|[\s]+|[^%s]+)%s[%s]*' %
+                        (config.chars_lower, case and pattern or
+                            ''.join(['[%s%s]' % (ch.upper(), ch.lower())
+                                for ch in pattern]),
+                         config.chars_lower), re.U)
+            else:
+                self.search_re = re.compile(pattern, flags)
             self.pattern = pattern
 
 
@@ -311,10 +318,9 @@
                     t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
                 queries.append(Query(Query.OP_AND, t))
 
-            # TODO: hilight and sort stemmed words correctly (also in TitleSearch)
-            #if stemmed:
-            #    self._build_re(' '.join(stemmed), use_re=False,
-            #            case=self.case)
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
 
             # titlesearch OR parsed wikiwords
             return Query(Query.OP_OR,
@@ -386,12 +392,16 @@
 
             # all parsed wikiwords, AND'ed
             queries = []
+            stemmed = []
             for t in terms:
                 if Xapian.use_stemming:
                     # stemmed OR not stemmed
-                    t = [UnicodeQuery(Query.OP_OR, ['%s%s' %
-                        (Xapian.Index.prefixMap['title'], j) for j in i])
-                            for i in analyzer.tokenize(t, flat_stemming=False)]
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' %
+                            (Xapian.Index.prefixMap['title'], j) for j in i]))
+                        stemmed.append(i[1])
+                    t = tmp
                 else:
                     # just not stemmed
                     t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i))
@@ -399,6 +409,10 @@
 
                 queries.append(Query(Query.OP_AND, t))
 
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
+
             return Query(Query.OP_AND, queries)
 
 
--- a/docs/CHANGES.fpletz	Sat Jun 17 20:53:12 2006 +0200
+++ b/docs/CHANGES.fpletz	Sun Jun 18 01:06:50 2006 +0200
@@ -2,13 +2,12 @@
 =============================
 
   Known main issues:
-    * Somethings' wrong with the matching of stemmed terms, i.e. matches
-      beyond single WikiWord borders although matching lower-case only
-      (see MoinMoin/search.py:92)
+    * _moinSearch matches all characters in words when stemming;
+      the workaround uses too much CPU
     * Matching of stemmed terms is generally unreliable because the
       matches (and consequently the count) are not obtained by Xapian
       as _moinSearch is called with the Xapian results. Use the Xapian
-      matches?
+      matches somehow?
     * Regex searching with Xapian?
 
   ToDo:
@@ -58,4 +57,9 @@
       bugfixes)
 
 2006-06-17
+    * Tackled some of the issues with matching stemmed words. Need some
+      advice on how to detect and match them reliably using the current
+      framework.
 
+2006-06-18
+