changeset 843:11a9d77e92d3

stemming works.. in english
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Thu, 15 Jun 2006 13:52:26 +0200
parents 4bd5f5f8f95a
children 399041205773
files MoinMoin/Xapian.py MoinMoin/search.py docs/CHANGES.fpletz
diffstat 3 files changed, 50 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Wed Jun 14 20:33:15 2006 +0200
+++ b/MoinMoin/Xapian.py	Thu Jun 15 13:52:26 2006 +0200
@@ -20,16 +20,23 @@
 from MoinMoin import config, wikiutil
 from MoinMoin.util import filesys, lock
 
+try:
+    from Stemmer import Stemmer
+    def getStemmer(algorithm='english'):
+        return Stemmer(algorithm)
+    use_stemming = True
+except ImportError:
+    use_stemming = False
 
 class UnicodeQuery(xapian.Query):
     def __init__(self, *args, **kwargs):
         self.encoding = kwargs.get('encoding', config.charset)
 
         nargs = []
-        for i in args:
-            if isinstance(i, unicode):
-                i = i.encode(self.encoding)
-            nargs.append(i)
+        for term in args:
+            if isinstance(term, unicode):
+                term = term.encode(self.encoding)
+            nargs.append(term)
 
         xapian.Query.__init__(self, *nargs, **kwargs)
 
@@ -62,6 +69,10 @@
     # XXX limit stuff above to xapdoc.MAX_KEY_LEN
     # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
 
+    def __init__(self):
+        if use_stemming:
+            self.stemmer = getStemmer()
+
     def tokenize(self, value):
         """Yield a stream of lower cased words from a string.
            value must be an UNICODE object or a list of unicode objects
@@ -69,6 +80,8 @@
         def enc(uc):
             """ 'encode' unicode results into whatever xapian / xapwrap wants """
             lower = uc.lower()
+            if use_stemming:
+                return self.stemmer.stemWord(lower)
             return lower
             
         if isinstance(value, list): # used for page links
@@ -93,7 +106,7 @@
                         yield enc(word)
                 elif m.group("word"):
                     word = m.group("word")
-                    yield  enc(word)
+                    yield enc(word)
                     # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
                     if self.wikiword_re.match(word):
                         for sm in re.finditer(self.singleword_re, word):
@@ -539,7 +552,7 @@
                                   sortFields=(xpname, xattachment, xmtime, xwname, ),
                                  )
             doc.analyzerFactory = WikiAnalyzer
-            #search_db_language = "english"
+            #search_db_language = "english"      # XXX: hardcoded
             #stemmer = xapian.Stem(search_db_language)
             #pagetext = page.get_raw_body().lower()
             #words = re.finditer(r"\w+", pagetext)
--- a/MoinMoin/search.py	Wed Jun 14 20:33:15 2006 +0200
+++ b/MoinMoin/search.py	Thu Jun 15 13:52:26 2006 +0200
@@ -89,7 +89,8 @@
                 self.pattern = pattern
         else:
             pattern = re.escape(pattern)
-            self.search_re = re.compile(pattern, flags)
+            self.search_re = re.compile(r'%s[%s]*' % (pattern,
+                config.chars_lower), flags)
             self.pattern = pattern
 
 
@@ -247,8 +248,18 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
+        
+        if self.xapian_wanted() and Xapian.use_stemming:
+            terms = self._pattern.split(' ')
+            terms = Xapian.getStemmer().stemWords(terms)
+            self._pattern = ' '.join(terms)
+            stemmed = True
+        else:
+            stemmed = False
+        
         self._build_re(self._pattern, use_re=use_re, case=case)
-        self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case)
+        self.titlesearch = TitleSearch(self._pattern, use_re=use_re,
+                case=case, stemmed=stemmed)
         
     def costs(self):
         return 10000
@@ -292,7 +303,7 @@
         else:
             analyzer = Xapian.WikiAnalyzer()
             terms = self._pattern.split()
-            
+
             # all parsed wikiwords, AND'ed
             queries = []
             for t in terms:
@@ -311,7 +322,7 @@
 class TitleSearch(BaseExpression):
     """ Term searches in pattern in page title only """
 
-    def __init__(self, pattern, use_re=False, case=False):
+    def __init__(self, pattern, use_re=False, case=False, stemmed=False):
         """ Init a title search
 
         @param pattern: pattern to search for, ascii string or unicode
@@ -322,7 +333,13 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
-        self._build_re(unicode(pattern), use_re=use_re, case=case)
+
+        if not stemmed and self.xapian_wanted() and Xapian.use_stemming:
+            terms = self._pattern.split(' ')
+            terms = Xapian.getStemmer().stemWords(terms)
+            self._pattern = ' '.join(terms)
+
+        self._build_re(self._pattern, use_re=use_re, case=case)
         
     def costs(self):
         return 100
--- a/docs/CHANGES.fpletz	Wed Jun 14 20:33:15 2006 +0200
+++ b/docs/CHANGES.fpletz	Thu Jun 15 13:52:26 2006 +0200
@@ -2,11 +2,13 @@
 =============================
 
   Known main issues:
-    * ...
+    * Stemming in English only for now because we would have to stem every
+      word in a query for every language. Suggestions? ;-)
+    * Something's wrong with the matching of stemmed terms, i.e. matches
+      beyond single WikiWord borders although matching lower-case only
+      (see MoinMoin/search.py:92)
 
   ToDo:
-    * Manually parse prefixes (e.g. title:) in MoinMoin.Xapian.Index
-      right before searching
     * Mockup the new search UI
 
   New Features:
@@ -31,5 +33,8 @@
 tweaking to use AND_NOT because Xapian doesn't provide a pure NOT. Should
 be no issue with OrExpression as _moinSearch handles this correctly.
 
-2006-06-11
+2006-06-11 Now handling prefixes correctly (title -> S, XLINKTO always
+with ':')
 
+2006-06-15 Integrated stemming, english only for now (see issues).
+