changeset 853:210f3adb44de

merge with main
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Sun, 18 Jun 2006 01:07:21 +0200
parents 0ccd65be5656 (diff) 715171e93d79 (current diff)
children a71bcc0f27c3 481c72d4a181
diffstat 3 files changed, 233 insertions(+), 64 deletions(-)
--- a/MoinMoin/Xapian.py	Sat Jun 17 19:44:23 2006 +0200
+++ b/MoinMoin/Xapian.py	Sun Jun 18 01:07:21 2006 +0200
@@ -20,16 +20,24 @@
 from MoinMoin import config, wikiutil
 from MoinMoin.util import filesys, lock
 
+try:
+    # PyStemmer, snowball python bindings from http://snowball.tartarus.org/
+    from Stemmer import Stemmer
+    use_stemming = True
+except ImportError:
+    use_stemming = False
 
 class UnicodeQuery(xapian.Query):
     def __init__(self, *args, **kwargs):
         self.encoding = kwargs.get('encoding', config.charset)
 
         nargs = []
-        for i in args:
-            if isinstance(i, unicode):
-                i = i.encode(self.encoding)
-            nargs.append(i)
+        for term in args:
+            if isinstance(term, unicode):
+                term = term.encode(self.encoding)
+            elif isinstance(term, (list, tuple)):
+                term = [t.encode(self.encoding) for t in term]
+            nargs.append(term)
 
         xapian.Query.__init__(self, *nargs, **kwargs)
 
@@ -38,6 +46,9 @@
 ### Tokenizer
 ##############################################################################
 
+def getWikiAnalyzerFactory(language='en'):
+    return (lambda: WikiAnalyzer(language))
+
 class WikiAnalyzer:
     singleword = r"[%(u)s][%(l)s]+" % {
                      'u': config.chars_upper,
@@ -62,10 +73,13 @@
     # XXX limit stuff above to xapdoc.MAX_KEY_LEN
     # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
 
-    def tokenize(self, value):
-        """Yield a stream of lower cased words from a string.
-           value must be an UNICODE object or a list of unicode objects
-        """
+    def __init__(self, language=None):
+        if use_stemming and language:
+            self.stemmer = Stemmer(language)
+        else:
+            self.stemmer = None
+
+    def raw_tokenize(self, value):
         def enc(uc):
             """ 'encode' unicode results into whatever xapian / xapwrap wants """
             lower = uc.lower()
@@ -93,12 +107,24 @@
                         yield enc(word)
                 elif m.group("word"):
                     word = m.group("word")
-                    yield  enc(word)
+                    yield enc(word)
                     # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
                     if self.wikiword_re.match(word):
                         for sm in re.finditer(self.singleword_re, word):
                             yield enc(sm.group())
 
+    def tokenize(self, value, flat_stemming=True):
+        """Yield a stream of lower cased raw and stemmed (optional) words from a string.
+           value must be an UNICODE object or a list of unicode objects
+        """
+        for i in self.raw_tokenize(value):
+            if flat_stemming:
+                yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i
+                if self.stemmer:
+                    yield self.stemmer.stemWord(i)
+            else:
+                yield (i, self.stemmer.stemWord(i)) # assumes a stemmer; callers check use_stemming
+
 
 #############################################################################
 ### Indexing
@@ -240,7 +266,7 @@
                        #N   ISO couNtry code (or domaiN name)
                        #P   Pathname
                        #Q   uniQue id
-                       #R   Raw (i.e. unstemmed) term
+        'raw':  'R',   # Raw (i.e. unstemmed) term
         'title': 'S',  # Subject (or title)
         'mimetype': 'T',
         'url': 'U',    # full URL of indexed document - if the resulting term would be > 240
@@ -250,6 +276,7 @@
                        #  the D term, and changing the last digit to a '2' if it's a '3')
                        #X   longer prefix for user-defined use
         'linkto': 'XLINKTO', # this document links to that document
+        'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in 
                        #Y   year (four digits)
     }
 
@@ -358,7 +385,7 @@
                     indexThread.join()
                 return func
 
-            self.request.finish = joinDecorator(self.request.finish)        
+            self.request.finish = joinDecorator(self.request.finish)
             indexThread.start()
         except:
             self.lock.release()
@@ -391,7 +418,7 @@
                     indexThread.join()
                 return func
                 
-            self.request.finish = joinDecorator(self.request.finish)        
+            self.request.finish = joinDecorator(self.request.finish)
             indexThread.start()
         except:
             self.lock.release()
@@ -422,8 +449,8 @@
                 break
             except wikiutil.PluginMissingError:
                 pass
-            #else:
-            #    raise "Cannot load filter for mimetype." + modulename  # XXX
+            else:
+                request.log("Cannot load filter for mimetype." + modulename)
         try:
             data = execute(self, filename)
             if debug:
@@ -480,7 +507,7 @@
                                       keywords=(xtitle, xitemid, ),
                                       sortFields=(xpname, xattachment, xmtime, xwname, ),
                                      )
-                doc.analyzerFactory = WikiAnalyzer
+                doc.analyzerFactory = getWikiAnalyzerFactory()
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (filename, uid))
                     doc.uid = uid
@@ -491,6 +518,34 @@
         except (OSError, IOError), err:
             pass
 
+    def _get_languages(self, page):
+        body = page.get_raw_body()
+        default_lang = page.request.cfg.language_default
+
+        lang = ''
+
+        if use_stemming:
+            for line in body.split('\n'):
+                if line.startswith('#language'):
+                    lang = line.split(' ')[1]
+                    try:
+                        Stemmer(lang)
+                    except KeyError:
+                        # lang is not stemmable
+                        break
+                    else:
+                        # lang is stemmable
+                        return (lang, lang)
+                elif not line.startswith('#'):
+                    break
+        
+        if not lang:
+            # no lang found at all... fall back to the default language
+            lang = default_lang
+
+        # return actual lang and lang to stem in
+        return (lang, default_lang)
+
     def _index_page(self, writer, page, mode='update'):
         """ Index a page - assumes that the write lock is acquired
             @arg writer: the index writer object
@@ -504,6 +559,8 @@
         pagename = page.page_name
         mtime = page.mtime_usecs()
         itemid = "%s:%s" % (wikiname, pagename)
+        # XXX: Hack until we get proper metadata
+        language, stem_language = self._get_languages(page)
         updated = False
 
         if mode == 'update':
@@ -530,7 +587,9 @@
             xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
             xmtime = xapdoc.SortKey('mtime', mtime)
             xtitle = xapdoc.TextField('title', pagename, True) # prefixed
-            xkeywords = [xapdoc.Keyword('itemid', itemid)]
+            xkeywords = [xapdoc.Keyword('itemid', itemid),
+                    xapdoc.Keyword('lang', language),
+                    xapdoc.Keyword('stem_lang', stem_language)]
             for pagelink in page.getPageLinks(request):
                 xkeywords.append(xapdoc.Keyword('linkto', pagelink))
             xcontent = xapdoc.TextField('content', page.get_raw_body())
@@ -538,17 +597,8 @@
                                   keywords=xkeywords,
                                   sortFields=(xpname, xattachment, xmtime, xwname, ),
                                  )
-            doc.analyzerFactory = WikiAnalyzer
-            #search_db_language = "english"
-            #stemmer = xapian.Stem(search_db_language)
-            #pagetext = page.get_raw_body().lower()
-            #words = re.finditer(r"\w+", pagetext)
-            #count = 0
-            #for wordmatch in words:
-            #    count += 1
-            #    word = wordmatch.group().encode(config.charset)
-            #    document.add_posting('R' + stemmer.stem_word(word), count) # count should be term position in document (starting at 1)
-            
+            doc.analyzerFactory = getWikiAnalyzerFactory()
+
             if mode == 'update':
                 if debug: request.log("%s (replace %r)" % (pagename, uid))
                 doc.uid = uid
@@ -586,14 +636,15 @@
                 xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
                 xmtime = xapdoc.SortKey('mtime', mtime)
                 xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
+                xlanguage = xapdoc.Keyword('lang', language)
                 mimetype, att_content = self.contentfilter(filename)
                 xmimetype = xapdoc.TextField('mimetype', mimetype, True)
                 xcontent = xapdoc.TextField('content', att_content)
                 doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                      keywords=(xatt_itemid, xtitle, ),
+                                      keywords=(xatt_itemid, xtitle, xlanguage, ),
                                       sortFields=(xpname, xattachment, xmtime, xwname, ),
                                      )
-                doc.analyzerFactory = WikiAnalyzer
+                doc.analyzerFactory = getWikiAnalyzerFactory()
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (pagename, uid))
                     doc.uid = uid
@@ -631,7 +682,7 @@
                     fname = fname.strip()
                     self._index_file(request, writer, fname, mode)
             writer.close()
-            request.log("indexing completed successfully in %0.2f seconds." % 
+            request.log("indexing completed successfully in %0.2f seconds." %
                         (time.time() - start))
             self._sign()
         finally:
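
For reference, a minimal sketch of the PyStemmer calls the analyzer code above relies on. This is an assumption-laden example: it presumes PyStemmer is installed and, as _get_languages() implies, that it accepts ISO codes like 'en' and raises KeyError for unstemmable languages:

    from Stemmer import Stemmer

    try:
        stemmer = Stemmer('en')   # KeyError if the language is not stemmable
    except KeyError:
        stemmer = None            # index raw terms only, as WikiAnalyzer does

    if stemmer:
        for word in [u'searching', u'stemmed', u'indexes']:
            print word, '->', stemmer.stemWord(word)
        # searching -> search
        # stemmed -> stem
        # indexes -> index
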
--- a/MoinMoin/search.py	Sat Jun 17 19:44:23 2006 +0200
+++ b/MoinMoin/search.py	Sun Jun 18 01:07:21 2006 +0200
@@ -10,7 +10,8 @@
     @license: GNU GPL, see COPYING for details
 """
 
-import re, time, sys, StringIO, string
+import re, time, sys, StringIO, string, operator
+from sets import Set
 from MoinMoin import wikiutil, config
 from MoinMoin.Page import Page
 
@@ -75,7 +76,7 @@
         """
         return ''
 
-    def _build_re(self, pattern, use_re=False, case=False):
+    def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
         """ Make a regular expression out of a text pattern """
         flags = case and re.U or (re.I | re.U)
         if use_re:
@@ -89,7 +90,15 @@
                 self.pattern = pattern
         else:
             pattern = re.escape(pattern)
-            self.search_re = re.compile(pattern, flags)
+            if stemmed:
+                # XXX: works, but pretty CPU-intensive (obviously...)
+                self.search_re = re.compile(r'(?=^|[\s]+|[^%s]+)%s[%s]*' %
+                        (config.chars_lower, case and pattern or
+                            ''.join(['[%s%s]' % (ch.upper(), ch.lower())
+                                for ch in pattern]),
+                         config.chars_lower), re.U)
+            else:
+                self.search_re = re.compile(pattern, flags)
             self.pattern = pattern
 
 
@@ -175,15 +184,15 @@
             wanted = wanted and term.xapian_wanted()
         return wanted
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         # sort negated terms
         terms = []
         not_terms = []
         for term in self._subterms:
             if not term.negated:
-                terms.append(term.xapian_term())
+                terms.append(term.xapian_term(request))
             else:
-                not_terms.append(term.xapian_term())
+                not_terms.append(term.xapian_term(request))
 
         # prepare query for not negated terms
         if len(terms) == 1:
@@ -224,9 +233,9 @@
                 matches.extend(result)
         return matches
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         # XXX: negated terms managed by _moinSearch?
-        return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms])
+        return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms])
 
 
 class TextSearch(BaseExpression):
@@ -286,25 +295,36 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         if self.use_re:
             return None # xapian can't do regex search
         else:
-            analyzer = Xapian.WikiAnalyzer()
+            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
             terms = self._pattern.split()
-            
+
             # all parsed wikiwords, AND'ed
             queries = []
+            stemmed = []
             for t in terms:
-                t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))]
-                if len(t) < 2:
-                    queries.append(UnicodeQuery(t[0]))
+                if Xapian.use_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, i))
+                        stemmed.append(i[1])
+                    t = tmp
                 else:
-                    queries.append(UnicodeQuery(Query.OP_AND, t))
+                    # just not stemmed
+                    t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
+                queries.append(Query(Query.OP_AND, t))
+
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
 
             # titlesearch OR parsed wikiwords
             return Query(Query.OP_OR,
-                    (self.titlesearch.xapian_term(),
+                    (self.titlesearch.xapian_term(request),
                         Query(Query.OP_AND, queries)))
 
 
@@ -322,7 +342,7 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
-        self._build_re(unicode(pattern), use_re=use_re, case=case)
+        self._build_re(self._pattern, use_re=use_re, case=case)
         
     def costs(self):
         return 100
@@ -362,23 +382,36 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         if self.use_re:
             return None # xapian doesn't support regex search
         else:
-            analyzer = Xapian.WikiAnalyzer()
+            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
             terms = self._pattern.split()
-            terms = [list(analyzer.tokenize(t)) for t in terms]
+            terms = [list(analyzer.raw_tokenize(t)) for t in terms]
 
             # all parsed wikiwords, AND'ed
             queries = []
+            stemmed = []
             for t in terms:
-                t = ['%s%s' % (Xapian.Index.prefixMap['title'], i)
-                        for i in list(analyzer.tokenize(t))]
-                if len(t) < 2:
-                    queries.append(UnicodeQuery(t[0]))
+                if Xapian.use_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' %
+                            (Xapian.Index.prefixMap['title'], j) for j in i]))
+                        stemmed.append(i[1])
+                    t = tmp
                 else:
-                    queries.append(UnicodeQuery(Query.OP_AND, t))
+                    # just not stemmed
+                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i))
+                        for i in analyzer.tokenize(t)]
+
+                queries.append(Query(Query.OP_AND, t))
+
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
 
             return Query(Query.OP_AND, queries)
 
@@ -387,7 +420,7 @@
     """ Search the term in the pagelinks """
 
     def __init__(self, pattern, use_re=False, case=True):
-        """ Init a title search
+        """ Init a link search
 
         @param pattern: pattern to search for, ascii string or unicode
         @param use_re: treat pattern as re of plain text, bool
@@ -459,7 +492,7 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         pattern = self.pattern
         if self.use_re:
             return None # xapian doesnt support regex search
@@ -467,6 +500,56 @@
             return UnicodeQuery('%s:%s' %
                     (Xapian.Index.prefixMap['linkto'], pattern))
 
+
+class LanguageSearch(BaseExpression):
+    """ Search the pages written in a language """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a language search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as re of plain text, bool
+        @param case: do case sensitive search, bool 
+        """
+        # iso language code, always lowercase
+        self._pattern = pattern.lower()
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self.xapian_called = False
+        self._build_re(self._pattern, use_re=use_re, case=case)
+
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return ""
+
+    def search(self, page):
+        # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch
+        if not self.xapian_called:
+            return None
+        else:
+            # XXX why not return None or empty list?
+            return [Match()]
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self, request):
+        pattern = self.pattern
+        if self.use_re:
+            return None # xapian doesn't support regex search
+        else:
+            self.xapian_called = True
+            return UnicodeQuery('%s%s' %
+                    (Xapian.Index.prefixMap['lang'], pattern))
+
+
 ############################################################################
 ### Results
 ############################################################################
@@ -765,7 +848,8 @@
         title_search = self.titlesearch
         regex = self.regex
         case = self.case
-        linkto = 0
+        linkto = False
+        lang = False
 
         for m in modifiers:
             if "title".startswith(m):
@@ -776,8 +860,12 @@
                 case = True
             elif "linkto".startswith(m):
                 linkto = True
+            elif "language".startswith(m):
+                lang = True
 
-        if linkto:
+        if lang:
+            obj = LanguageSearch(text, use_re=regex, case=False)
+        elif linkto:
             obj = LinkSearch(text, use_re=regex, case=case)
         elif title_search:
             obj = TitleSearch(text, use_re=regex, case=case)
@@ -1258,7 +1346,7 @@
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
-                query = self.query.xapian_term()
+                query = self.query.xapian_term(self.request)
                 self.request.log("xapianSearch: query = %r" %
                         query.get_description())
                 query = xapwrap.index.QObjQuery(query)
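
To illustrate the new TextSearch.xapian_term() above: with stemming available, each raw token is OR'ed with its stem, and the per-token queries are AND'ed. A plain-data sketch (no xapian needed; the stems are what the English snowball stemmer would produce, which is an assumption of this example):

    # shape of the Query tree built for u'searching words'
    # (Query.OP_AND / Query.OP_OR via UnicodeQuery in the real code)
    query = ('AND', [('OR', ['searching', 'search']),
                     ('OR', ['words', 'word'])])
    print query
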
--- a/docs/CHANGES.fpletz	Sat Jun 17 19:44:23 2006 +0200
+++ b/docs/CHANGES.fpletz	Sun Jun 18 01:07:21 2006 +0200
@@ -2,15 +2,27 @@
 =============================
 
   Known main issues:
-    * ...
+    * _moinSearch matches all characters in words when stemming; the
+      workaround uses too much CPU
+    * Matching of stemmed terms is generally unreliable because the
+      matches (and consequently their count) are not obtained from
+      Xapian; _moinSearch is re-run on the Xapian results. Use the
+      Xapian matches somehow?
+    * Regex searching with Xapian?
 
   ToDo:
-    * Manually parse prefixes (e.g. title:) in MoinMoin.Xapian.Index
-      right before searching
+    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper
+      metadata)
     * Mockup the new search UI
+    * Write/update documentation for all the new search stuff
+    * Wikifarms support (multiple indexes)
+    * Indexing and searching of Categories (new term prefix)
+    * Finish the stemming/matching stuff
 
   New Features:
-    * TBD
+    * Faster search thanks to Xapian
+    * Searching for languages with the new prefix lang/language, e.g. lang:de
+      Note: only available when Xapian is activated
   
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
     * ...
@@ -32,4 +44,22 @@
 be no issue with OrExpression as _moinSearch handles this correctly.
 
 2006-06-11
+    * Now handling prefixes correctly (title -> S, XLINKTO always with ':')
 
+2006-06-15
+    * Integrated basic stemming, English only for now (see issues).
+    * Introduced LanguageSearch (new prefix lang/language)
+    * Searching now works with stemmed terms, but matching is limited
+      due to the use of _moinSearch
+
+2006-06-16
+    * Indexing & searching now works without a stemmer installed (small
+      bugfixes)
+
+2006-06-17
+    * Tackled some of the issues with matching stemmed words. Need some
+      advice on how to detect and match them reliably using the current
+      framework
+
+2006-06-18
+
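
For illustration, the index term a lang:de query becomes in LanguageSearch.xapian_term(), assuming 'lang' maps to the conventional xapian 'L' prefix in Xapian.Index.prefixMap (only the new 'stem_lang' -> 'XSTEMLANG' entry is visible in the diff above):

    # 'lang' -> 'L' is an assumption; see the note above
    prefixMap = {'lang': 'L'}
    print '%s%s' % (prefixMap['lang'], 'de')   # -> Lde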