changeset 916:d0af8dce4d0e

Xapian.use_stemming -> request.cfg.xapian_stemming and stemming lang bugfix
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Thu, 22 Jun 2006 00:40:06 +0200
parents 01750f3c867c
children 04c4f745620f
files MoinMoin/Xapian.py MoinMoin/multiconfig.py MoinMoin/search.py docs/CHANGES.fpletz
diffstat 4 files changed, 40 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Tue Jun 20 21:13:27 2006 +0200
+++ b/MoinMoin/Xapian.py	Thu Jun 22 00:40:06 2006 +0200
@@ -24,9 +24,9 @@
 try:
     # PyStemmer, snowball python bindings from http://snowball.tartarus.org/
     from Stemmer import Stemmer
-    use_stemming = True
+    stemmer_available = True
 except ImportError:
-    use_stemming = False
+    stemmer_available = False
 
 class UnicodeQuery(xapian.Query):
     def __init__(self, *args, **kwargs):
@@ -47,8 +47,8 @@
 ### Tokenizer
 ##############################################################################
 
-def getWikiAnalyzerFactory(language='en'):
-    return (lambda: WikiAnalyzer(language))
+def getWikiAnalyzerFactory(request=None, language='en'):
+    return (lambda: WikiAnalyzer(request, language))
 
 class WikiAnalyzer:
     singleword = r"[%(u)s][%(l)s]+" % {
@@ -74,8 +74,8 @@
     # XXX limit stuff above to xapdoc.MAX_KEY_LEN
     # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
 
-    def __init__(self, language=None):
-        if use_stemming and language:
+    def __init__(self, request=None, language=None):
+        if request and request.cfg.xapian_stemming and language:
             self.stemmer = Stemmer(language)
         else:
             self.stemmer = None
@@ -303,6 +303,10 @@
         ## if not self.exists():
         ##    self.indexPagesInNewThread(request)
 
+        # Check if we should and can stem words
+        if request.cfg.xapian_stemming and not stemmer_available:
+            request.cfg.xapian_stemming = False
+
     def _main_dir(self):
         if self.request.cfg.xapian_index_dir:
             return os.path.join(self.request.cfg.xapian_index_dir,
@@ -530,7 +534,7 @@
 
         lang = ''
 
-        if use_stemming:
+        if page.request.cfg.xapian_stemming:
             for line in body.split('\n'):
                 if line.startswith('#language'):
                     lang = line.split(' ')[1]
@@ -603,7 +607,8 @@
                                   keywords=xkeywords,
                                   sortFields=(xpname, xattachment, xmtime, xwname, ),
                                  )
-            doc.analyzerFactory = getWikiAnalyzerFactory()
+            doc.analyzerFactory = getWikiAnalyzerFactory(request,
+                    stem_language)
 
             if mode == 'update':
                 if debug: request.log("%s (replace %r)" % (pagename, uid))
@@ -643,14 +648,16 @@
                 xmtime = xapdoc.SortKey('mtime', mtime)
                 xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
                 xlanguage = xapdoc.Keyword('lang', language)
+                xstem_language = xapdoc.Keyword('stem_lang', stem_language)
                 mimetype, att_content = self.contentfilter(filename)
                 xmimetype = xapdoc.TextField('mimetype', mimetype, True)
                 xcontent = xapdoc.TextField('content', att_content)
                 doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                      keywords=(xatt_itemid, xtitle, xlanguage, ),
+                                      keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ),
                                       sortFields=(xpname, xattachment, xmtime, xwname, ),
                                      )
-                doc.analyzerFactory = getWikiAnalyzerFactory()
+                doc.analyzerFactory = getWikiAnalyzerFactory(request,
+                        stem_language)
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (pagename, uid))
                     doc.uid = uid
--- a/MoinMoin/multiconfig.py	Tue Jun 20 21:13:27 2006 +0200
+++ b/MoinMoin/multiconfig.py	Thu Jun 22 00:40:06 2006 +0200
@@ -277,6 +277,7 @@
 
     xapian_search = False # disabled until xapian is finished
     xapian_index_dir = None
+    xapian_stemming = True
 
     mail_login = None # or "user pwd" if you need to use SMTP AUTH
     mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail
--- a/MoinMoin/search.py	Tue Jun 20 21:13:27 2006 +0200
+++ b/MoinMoin/search.py	Thu Jun 22 00:40:06 2006 +0200
@@ -18,9 +18,8 @@
 try:
     import Xapian
     from Xapian import Query, UnicodeQuery
-    use_stemming = Xapian.use_stemming
 except ImportError:
-    use_stemming = False
+    pass
 
 #############################################################################
 ### query objects
@@ -275,7 +274,7 @@
         # Search in page body
         body = page.get_raw_body()
         for match in self.search_re.finditer(body):
-            if use_stemming:
+            if page.request.cfg.xapian_stemming:
                 # somewhere in regular word
                 if body[match.start()] not in config.chars_upper and \
                         body[match.start()-1] in config.chars_lower:
@@ -310,14 +309,15 @@
         if self.use_re:
             return None # xapian can't do regex search
         else:
-            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
+            analyzer = Xapian.WikiAnalyzer(request=request,
+                    language=request.cfg.language_default)
             terms = self._pattern.split()
 
             # all parsed wikiwords, AND'ed
             queries = []
             stemmed = []
             for t in terms:
-                if use_stemming:
+                if request.cfg.xapian_stemming:
                     # stemmed OR not stemmed
                     tmp = []
                     for i in analyzer.tokenize(t, flat_stemming=False):
@@ -379,7 +379,7 @@
         # Get matches in page name
         matches = []
         for match in self.search_re.finditer(page.page_name):
-            if use_stemming:
+            if page.request.cfg.xapian_stemming:
                 # somewhere in regular word
                 if page.page_name[match.start()] not in config.chars_upper and \
                         page.page_name[match.start()-1] in config.chars_lower:
@@ -413,7 +413,8 @@
         if self.use_re:
             return None # xapian doesn't support regex search
         else:
-            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
+            analyzer = Xapian.WikiAnalyzer(request=request,
+                    language=request.cfg.language_default)
             terms = self._pattern.split()
             terms = [list(analyzer.raw_tokenize(t)) for t in terms]
 
@@ -421,7 +422,7 @@
             queries = []
             stemmed = []
             for t in terms:
-                if use_stemming:
+                if request.cfg.xapian_stemming:
                     # stemmed OR not stemmed
                     tmp = []
                     for i in analyzer.tokenize(t, flat_stemming=False):
--- a/docs/CHANGES.fpletz	Tue Jun 20 21:13:27 2006 +0200
+++ b/docs/CHANGES.fpletz	Thu Jun 22 00:40:06 2006 +0200
@@ -3,22 +3,25 @@
 
   Known main issues:
     * Regex searching with Xapian?
+    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata)
 
   ToDo:
-    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper
-      metadata)
     * Mockup the new search UI
     * Write/update documentation for all the new search stuff
     * Indexing and searching of categories (new term prefix)
-    * MoinMoin.Xapian.use_stemming -> request.cfg.xapian_use_stemming
+    * Drop _moinSearch when using Xapian and use term positions provided
+      by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to
+      get the position of stemmed words right
 
   New Features:
     * Faster search thanks to Xapian
     * Searching for languages with new prefix lang/language, i.e. lang:de
-      Note: Only available when Xapian is activated
+      Note: Currently only available when Xapian is used
     * New config options:
         xapian_search        0      enables xapian-powered search
         xapian_index_dir     None   directory for xapian indices
+        xapian_stemming      True   Toggles use of the stemmer; falls back
+                                    to False if no stemmer is installed
   
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
     * ...
@@ -80,3 +83,9 @@
     * All stemming/matching issues resolved (hopefully)
     * Works now without xapian installed (enhance error reporting)
 
+2006-06-21
+    * Made stemming configurable (xapian_stemming), with fallback to
+      False if no stemmer is available
+    * Xapian.use_stemming -> request.cfg.xapian_stemming
+    * Fixed bug in the selection of the stemming language
+