changeset 856:e69f2c2a238d

matching stemmed words now works reliably and, most importantly, fast ;)
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Mon, 19 Jun 2006 11:16:04 +0200
parents 481c72d4a181
children d93a8a6a4559
files MoinMoin/search.py docs/CHANGES.fpletz
diffstat 2 files changed, 36 insertions(+), 18 deletions(-)
--- a/MoinMoin/search.py	Mon Jun 19 09:53:52 2006 +0200
+++ b/MoinMoin/search.py	Mon Jun 19 11:16:04 2006 +0200
@@ -90,15 +90,7 @@
                 self.pattern = pattern
         else:
             pattern = re.escape(pattern)
-            if stemmed:
-                # XXX: works, but pretty CPU-intensive (obviously...)
-                self.search_re = re.compile(r'(?=^|[\s]+|[^%s]+)%s[%s]*' %
-                        (config.chars_lower, case and pattern or
-                            ''.join(['[%s%s]' % (ch.upper(), ch.lower())
-                                for ch in pattern]),
-                         config.chars_lower), re.U)
-            else:
-                self.search_re = re.compile(pattern, flags)
+            self.search_re = re.compile(pattern, flags)
             self.pattern = pattern
 
 
@@ -280,7 +272,23 @@
         # Search in page body
         body = page.get_raw_body()
         for match in self.search_re.finditer(body):
-            matches.append(TextMatch(re_match=match))
+            if Xapian.use_stemming:
+                # somewhere in regular word
+                if body[match.start()] not in config.chars_upper and \
+                        body[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in body[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TextMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TextMatch(re_match=match))
 
         # Decide what to do with the results.
         if ((self.negated and matches) or
@@ -368,7 +376,23 @@
         # Get matches in page name
         matches = []
         for match in self.search_re.finditer(page.page_name):
-            matches.append(TitleMatch(re_match=match))
+            if Xapian.use_stemming:
+                # somewhere in regular word
+                if page.page_name[match.start()] not in config.chars_upper and \
+                        page.page_name[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in page.page_name[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TitleMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TitleMatch(re_match=match))
         
         if ((self.negated and matches) or
             (not self.negated and not matches)):
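
The two hunks above replace the old stemming regex (removed in the first hunk) with a cheap post-processing pass: a hit from the plain search_re is kept only if it starts at a word boundary, and it is then widened across any trailing lowercase characters so the recorded span covers the whole word rather than just the stemmed prefix. A minimal standalone sketch of that logic follows; the helper name extend_stemmed_matches and the simple ASCII character classes are illustrative stand-ins for MoinMoin's config.chars_lower/config.chars_upper tables, and the sketch additionally guards the start == 0 case, where body[start - 1] in the patch would wrap around to the last character.

    import re
    import string

    # Illustrative stand-ins for MoinMoin's config.chars_lower/chars_upper.
    CHARS_LOWER = string.ascii_lowercase
    CHARS_UPPER = string.ascii_uppercase

    def extend_stemmed_matches(body, search_re):
        """Yield (start, end) spans, widened to cover the whole word.

        Mirrors the changed loops above: hits that begin in the middle
        of a regular word are dropped, the rest are extended over any
        trailing lowercase characters.
        """
        for match in search_re.finditer(body):
            start, end = match.start(), match.end()
            # Skip hits that start somewhere inside a regular word.
            if start > 0 and body[start] not in CHARS_UPPER \
                    and body[start - 1] in CHARS_LOWER:
                continue
            # Extend over the rest of the word, e.g. "match" -> "matching".
            while end < len(body) and body[end] in CHARS_LOWER:
                end += 1
            yield start, end

    # The stemmed term "match" now also covers "matching" and "matches":
    spans = list(extend_stemmed_matches("matching stemmed matches fast",
                                        re.compile("match")))
    # -> [(0, 8), (17, 24)]
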
--- a/docs/CHANGES.fpletz	Mon Jun 19 09:53:52 2006 +0200
+++ b/docs/CHANGES.fpletz	Mon Jun 19 11:16:04 2006 +0200
@@ -2,12 +2,6 @@
 =============================
 
   Known main issues:
-    * _moinSearch matches all characters in words when stemming,
-      workaround uses too much CPU
-    * Matching of stemmed terms is generally unreliable because the
-      matches (and consequently the count) are not obtained by Xapian
-      as _moinSearch is called with the Xapian results. Use the Xapian
-      matches somehow?
     * Regex searching with Xapian?
 
   ToDo:
@@ -16,7 +10,6 @@
     * Mockup the new search UI
     * Write/update documentation for all the new search stuff
     * Indexing and searching of categories (new term prefix)
-    * Finish the stemming/matching stuff
 
   New Features:
     * Faster search thanks to Xapian
@@ -83,4 +76,5 @@
 
         Possible extension: Xapian can handle multiple databases, maybe
         allow searching across defined wikis on a wikifarm?
+    * All stemming/matching issues resolved (hopefully)
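
For context, when Xapian.use_stemming is enabled the search terms handed to this matching code are stems rather than surface forms, which is presumably why the span extension above is needed at all. A minimal sketch, assuming the standard python-xapian bindings are available (the "english" stemmer choice here is illustrative):

    import xapian

    stemmer = xapian.Stem("english")
    # Both surface forms reduce to the same stem stored in the index,
    # so _moinSearch later widens the stem hit back to the full word.
    stemmer("matching")  # -> "match"
    stemmer("matches")   # -> "match"
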