changeset 849:02d6697b000d

basic searching using stemmed and unstemmed terms
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Thu, 15 Jun 2006 20:29:29 +0200
parents ac386d2622af
children 71875396f812
files MoinMoin/Xapian.py MoinMoin/search.py docs/CHANGES.fpletz
diffstat 3 files changed, 95 insertions(+), 75 deletions(-)
--- a/MoinMoin/Xapian.py	Thu Jun 15 16:44:16 2006 +0200
+++ b/MoinMoin/Xapian.py	Thu Jun 15 20:29:29 2006 +0200
@@ -23,8 +23,6 @@
 try:
     # PyStemmer, snowball python bindings from http://snowball.tartarus.org/
     from Stemmer import Stemmer
-    def getStemmer(algorithm='english'):
-        return Stemmer(algorithm)
     use_stemming = True
 except ImportError:
     use_stemming = False
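
A minimal sketch of the PyStemmer API relied on here, assuming the
snowball bindings are installed; the stems shown follow the English
algorithm:

    from Stemmer import Stemmer
    stemmer = Stemmer('english')
    stemmer.stemWord(u'searching')              # -> u'search'
    stemmer.stemWords([u'stemmed', u'terms'])   # -> [u'stem', u'term']
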
@@ -37,6 +35,8 @@
         for term in args:
             if isinstance(term, unicode):
                 term = term.encode(self.encoding)
+            elif isinstance(term, list) or isinstance(term, tuple):
+                term = map(lambda t: t.encode(self.encoding), term)
             nargs.append(term)
 
         xapian.Query.__init__(self, *nargs, **kwargs)
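
A sketch of what the new list/tuple branch enables: UnicodeQuery can now
take a (raw, stemmed) pair directly, as produced further down by
WikiAnalyzer.tokenize(..., flat_stemming=False), encoding each term
before handing it to xapian.Query:

    # both terms get encoded to self.encoding (config.charset)
    q = UnicodeQuery(Query.OP_OR, (u'searching', u'search'))
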
@@ -46,6 +46,9 @@
 ### Tokenizer
 ##############################################################################
 
+def getWikiAnalyzerFactory(language='en'):
+    return (lambda: WikiAnalyzer(language))
+
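
The factory exists because xapwrap apparently expects analyzerFactory to
be a zero-argument callable (previously the WikiAnalyzer class itself was
assigned); the lambda closes over the language so a per-language analyzer
can still be produced:

    factory = getWikiAnalyzerFactory('en')
    analyzer = factory()    # equivalent to WikiAnalyzer('en')
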
 class WikiAnalyzer:
     singleword = r"[%(u)s][%(l)s]+" % {
                      'u': config.chars_upper,
@@ -70,19 +73,16 @@
     # XXX limit stuff above to xapdoc.MAX_KEY_LEN
     # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
 
-    def __init__(self):
-        if use_stemming:
-            self.stemmer = getStemmer()
+    def __init__(self, language=None):
+        if use_stemming and language:
+            self.stemmer = Stemmer(language)
+        else:
+            self.stemmer = None
 
-    def tokenize(self, value):
-        """Yield a stream of lower cased words from a string.
-           value must be an UNICODE object or a list of unicode objects
-        """
+    def raw_tokenize(self, value):
         def enc(uc):
             """ 'encode' unicode results into whatever xapian / xapwrap wants """
             lower = uc.lower()
-            if use_stemming:
-                return self.stemmer.stemWord(lower)
             return lower
             
         if isinstance(value, list): # used for page links
@@ -113,6 +113,18 @@
                         for sm in re.finditer(self.singleword_re, word):
                             yield enc(sm.group())
 
+    def tokenize(self, value, flat_stemming=True):
+        """Yield a stream of lower cased raw and stemmed (optional) words from a string.
+           value must be an UNICODE object or a list of unicode objects
+        """
+        for i in self.raw_tokenize(value):
+            if flat_stemming:
+                yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i
+                if self.stemmer:
+                    yield self.stemmer.stemWord(i)
+            else:
+                yield (i, self.stemmer.stemWord(i))
+
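
A sketch of the two tokenize() modes, assuming use_stemming is True and
an English stemmer is available (outputs illustrative):

    a = WikiAnalyzer('en')
    list(a.tokenize(u'stemmed terms'))
    # flat (default): raw term followed by its stem, in one stream:
    #   [u'stemmed', u'stem', u'terms', u'term']
    list(a.tokenize(u'stemmed terms', flat_stemming=False))
    # paired: [(u'stemmed', u'stem'), (u'terms', u'term')]
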
 
 #############################################################################
 ### Indexing
@@ -254,7 +266,7 @@
                        #N   ISO couNtry code (or domaiN name)
                        #P   Pathname
                        #Q   uniQue id
-                       #R   Raw (i.e. unstemmed) term
+        'raw':  'R',   # Raw (i.e. unstemmed) term
         'title': 'S',  # Subject (or title)
         'mimetype': 'T',
         'url': 'U',    # full URL of indexed document - if the resulting term would be > 240
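
The single letters are Xapian's term-prefix convention: a prefix from
prefixMap is simply prepended to each term before indexing, as
TitleSearch does below for titles:

    '%s%s' % (Index.prefixMap['title'], u'searching')   # -> u'Ssearching'
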
@@ -495,7 +507,7 @@
                                       keywords=(xtitle, xitemid, ),
                                       sortFields=(xpname, xattachment, xmtime, xwname, ),
                                      )
-                doc.analyzerFactory = WikiAnalyzer
+                doc.analyzerFactory = getWikiAnalyzerFactory()
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (filename, uid))
                     doc.uid = uid
@@ -515,7 +527,7 @@
             if line.startswith('#language'):
                 lang = line.split(' ')[1]
                 try:
-                    getStemmer(lang)
+                    Stemmer(lang)
                 except KeyError:
                     # lang is not stemmable
                     break
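
A sketch of the stemmability probe used above: instantiating Stemmer
raises KeyError for languages the snowball bindings do not know:

    def is_stemmable(lang):
        try:
            Stemmer(lang)
            return True
        except KeyError:
            return False
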
@@ -583,17 +595,8 @@
                                   keywords=xkeywords,
                                   sortFields=(xpname, xattachment, xmtime, xwname, ),
                                  )
-            doc.analyzerFactory = WikiAnalyzer
-            #search_db_language = "english"      # XXX: hardcoded
-            #stemmer = xapian.Stem(search_db_language)
-            #pagetext = page.get_raw_body().lower()
-            #words = re.finditer(r"\w+", pagetext)
-            #count = 0
-            #for wordmatch in words:
-            #    count += 1
-            #    word = wordmatch.group().encode(config.charset)
-            #    document.add_posting('R' + stemmer.stem_word(word), count) # count should be term position in document (starting at 1)
-            
+            doc.analyzerFactory = getWikiAnalyzerFactory()
+
             if mode == 'update':
                 if debug: request.log("%s (replace %r)" % (pagename, uid))
                 doc.uid = uid
@@ -636,10 +639,10 @@
                 xmimetype = xapdoc.TextField('mimetype', mimetype, True)
                 xcontent = xapdoc.TextField('content', att_content)
                 doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                      keywords=(xatt_itemid, xtitle, xlanguage),
+                                      keywords=(xatt_itemid, xtitle, xlanguage, ),
                                       sortFields=(xpname, xattachment, xmtime, xwname, ),
                                      )
-                doc.analyzerFactory = WikiAnalyzer
+                doc.analyzerFactory = getWikiAnalyzerFactory()
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (pagename, uid))
                     doc.uid = uid
--- a/MoinMoin/search.py	Thu Jun 15 16:44:16 2006 +0200
+++ b/MoinMoin/search.py	Thu Jun 15 20:29:29 2006 +0200
@@ -10,7 +10,8 @@
     @license: GNU GPL, see COPYING for details
 """
 
-import re, time, sys, StringIO, string
+import re, time, sys, StringIO, string, operator
+from sets import Set
 from MoinMoin import wikiutil, config
 from MoinMoin.Page import Page
 
@@ -176,15 +177,15 @@
             wanted = wanted and term.xapian_wanted()
         return wanted
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         # sort negated terms
         terms = []
         not_terms = []
         for term in self._subterms:
             if not term.negated:
-                terms.append(term.xapian_term())
+                terms.append(term.xapian_term(request))
             else:
-                not_terms.append(term.xapian_term())
+                not_terms.append(term.xapian_term(request))
 
         # prepare query for not negated terms
         if len(terms) == 1:
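
The request parameter is threaded through so each subterm can reach
request.cfg.language_default when building its analyzer. For reference,
a sketch of how the collected terms and not_terms presumably combine
further down (outside this hunk), using standard Xapian operators:

    query = Query(Query.OP_AND, terms)
    if not_terms:
        query = Query(Query.OP_AND_NOT, query,
                      Query(Query.OP_OR, not_terms))
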
@@ -225,9 +226,9 @@
                 matches.extend(result)
         return matches
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         # XXX: negated terms managed by _moinSearch?
-        return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms])
+        return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms])
 
 
 class TextSearch(BaseExpression):
@@ -248,18 +249,8 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
-        
-        if self.xapian_wanted() and Xapian.use_stemming:
-            terms = self._pattern.split(' ')
-            terms = Xapian.getStemmer().stemWords(terms)
-            self._pattern = ' '.join(terms)
-            stemmed = True
-        else:
-            stemmed = False
-        
         self._build_re(self._pattern, use_re=use_re, case=case)
-        self.titlesearch = TitleSearch(self._pattern, use_re=use_re,
-                case=case, stemmed=stemmed)
+        self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case)
         
     def costs(self):
         return 10000
@@ -297,32 +288,44 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         if self.use_re:
             return None # xapian can't do regex search
         else:
-            analyzer = Xapian.WikiAnalyzer()
+            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
             terms = self._pattern.split()
 
             # all parsed wikiwords, AND'ed
             queries = []
+            stemmed = []
             for t in terms:
-                t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))]
-                if len(t) < 2:
-                    queries.append(UnicodeQuery(t[0]))
+                if Xapian.use_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, i))
+                        stemmed.append(i[1])
+                    t = tmp
                 else:
-                    queries.append(UnicodeQuery(Query.OP_AND, t))
+                    # just not stemmed
+                    t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
+                queries.append(Query(Query.OP_AND, t))
+
+            # TODO: highlight and sort stemmed words correctly (also in TitleSearch)
+            #if stemmed:
+            #    self._build_re(' '.join(stemmed), use_re=False,
+            #            case=self.case)
 
             # titlesearch OR parsed wikiwords
             return Query(Query.OP_OR,
-                    (self.titlesearch.xapian_term(),
+                    (self.titlesearch.xapian_term(request),
                         Query(Query.OP_AND, queries)))
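
The shape of the query built above, sketched for the pattern
u'stemmed terms' with stemming available: each word becomes a
raw-OR-stemmed disjunction, the words are AND'ed, and the whole thing is
OR'ed with the title search:

    # OP_OR
    #  |- titlesearch.xapian_term(request)
    #  `- OP_AND
    #      |- OP_AND([ UnicodeQuery(OP_OR, (u'stemmed', u'stem')) ])
    #      `- OP_AND([ UnicodeQuery(OP_OR, (u'terms', u'term')) ])
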
 
 
 class TitleSearch(BaseExpression):
     """ Term searches in pattern in page title only """
 
-    def __init__(self, pattern, use_re=False, case=False, stemmed=False):
+    def __init__(self, pattern, use_re=False, case=False):
         """ Init a title search
 
         @param pattern: pattern to search for, ascii string or unicode
@@ -333,12 +336,6 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
-
-        if not stemmed and self.xapian_wanted() and Xapian.use_stemming:
-            terms = self._pattern.split(' ')
-            terms = Xapian.getStemmer().stemWords(terms)
-            self._pattern = ' '.join(terms)
-
         self._build_re(self._pattern, use_re=use_re, case=case)
         
     def costs(self):
@@ -379,23 +376,28 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         if self.use_re:
             return None # xapian doesn't support regex search
         else:
-            analyzer = Xapian.WikiAnalyzer()
+            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
             terms = self._pattern.split()
-            terms = [list(analyzer.tokenize(t)) for t in terms]
+            terms = [list(analyzer.raw_tokenize(t)) for t in terms]
 
             # all parsed wikiwords, AND'ed
             queries = []
             for t in terms:
-                t = ['%s%s' % (Xapian.Index.prefixMap['title'], i)
-                        for i in list(analyzer.tokenize(t))]
-                if len(t) < 2:
-                    queries.append(UnicodeQuery(t[0]))
+                if Xapian.use_stemming:
+                    # stemmed OR not stemmed
+                    t = [UnicodeQuery(Query.OP_OR, ['%s%s' %
+                        (Xapian.Index.prefixMap['title'], j) for j in i])
+                            for i in analyzer.tokenize(t, flat_stemming=False)]
                 else:
-                    queries.append(UnicodeQuery(Query.OP_AND, t))
+                    # just not stemmed
+                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], j))
+                        for j in analyzer.tokenize(t)]
+
+                queries.append(Query(Query.OP_AND, t))
 
             return Query(Query.OP_AND, queries)
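
TitleSearch builds the same per-word raw-OR-stemmed structure, except
every term carries the 'S' (title) prefix from Index.prefixMap, e.g. for
the word u'terms':

    UnicodeQuery(Query.OP_OR, [u'Sterms', u'Sterm'])
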
 
@@ -476,7 +478,7 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         pattern = self.pattern
         if self.use_re:
            return None # xapian doesn't support regex search
@@ -524,7 +526,7 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         pattern = self.pattern
         if self.use_re:
            return None # xapian doesn't support regex search
@@ -844,7 +846,7 @@
                 case = True
             elif "linkto".startswith(m):
                 linkto = True
-            elif "lang".startswith(m):
+            elif "language".startswith(m):
                 lang = True
 
         if lang:
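
The startswith idiom lets any prefix of 'language' select the modifier,
so lang:de and language:de both work:

    "language".startswith("lang")      # True -> lang = True
    "language".startswith("langua")    # True
    # m == "l" is caught by the "linkto" test earlier in the chain
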
@@ -1330,7 +1332,7 @@
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
-                query = self.query.xapian_term()
+                query = self.query.xapian_term(self.request)
                 self.request.log("xapianSearch: query = %r" %
                         query.get_description())
                 query = xapwrap.index.QObjQuery(query)
--- a/docs/CHANGES.fpletz	Thu Jun 15 16:44:16 2006 +0200
+++ b/docs/CHANGES.fpletz	Thu Jun 15 20:29:29 2006 +0200
@@ -2,21 +2,31 @@
 =============================
 
   Known main issues:
-    * Stemming in English only for now because we would have to stem every
-      word in a query for every language. Suggestions? ;-)
+    * Stemming in English only for now because we would have to stem
+      every word in a query for every language. Suggestions? ;-)
     * Something's wrong with the matching of stemmed terms, i.e. it
       matches beyond single WikiWord borders although only lower-case
       terms are matched (see MoinMoin/search.py:92)
+    * Matching of stemmed terms is generally unreliable because the
+      matches (and consequently the count) are not obtained by Xapian
+      itself; _moinSearch is re-run on the Xapian results. Use the
+      Xapian matches instead?
     * Regex searching with Xapian?
 
   ToDo:
-    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata)
+    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper
+      metadata)
     * Mockup the new search UI
     * Write/update documentation for all the new search stuff
+    * Wikifarms support (multiple indexes)
+    * Indexing and searching of Categories (new term prefix)
+    * Finish the stemming/matching stuff
+    * Test if indexing/searching works reliably without a stemmer
+      installed
 
   New Features:
     * Faster search thanks to Xapian
-    * Searching for languages with new prefix 'lang', i.e. lang:de
+    * Searching for languages with new prefix lang/language, e.g. lang:de
       Note: Only available when Xapian is activated
   
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
@@ -43,4 +53,9 @@
 
 2006-06-15
     * Integrated basic stemming, English only for now (see issues).
-    * Introduced LanguageSearch (lang:)
+    * Introduced LanguageSearch (new prefix lang/language)
+    * Searching now works with stemmed terms, but matching is limited
+      due to the use of _moinSearch
+
+2006-06-16
+