changeset 847:813125ff0d74

Introducing LanguageSearch
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Thu, 15 Jun 2006 16:11:28 +0200
parents 04703997eb66
children ac386d2622af
files MoinMoin/Xapian.py MoinMoin/search.py docs/CHANGES.fpletz
diffstat 3 files changed, 83 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Thu Jun 15 15:23:14 2006 +0200
+++ b/MoinMoin/Xapian.py	Thu Jun 15 16:11:28 2006 +0200
@@ -264,6 +264,7 @@
                        #  the D term, and changing the last digit to a '2' if it's a '3')
                        #X   longer prefix for user-defined use
         'linkto': 'XLINKTO', # this document links to that document
+        'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in 
                        #Y   year (four digits)
     }
 
@@ -505,22 +506,31 @@
         except (OSError, IOError), err:
             pass
 
-    def _get_language(self, page):
+    def _get_languages(self, page):
         body = page.get_raw_body()
+        default_lang = page.request.cfg.language_default
 
+        lang = ''
         for line in body.split('\n'):
             if line.startswith('#language'):
                 lang = line.split(' ')[1]
                 try:
                     getStemmer(lang)
                 except KeyError:
+                    # lang is not stemmable
                     break
                 else:
-                    return lang
+                    # lang is stemmable
+                    return (lang, lang)
             elif not line.startswith('#'):
                 break
+        
+        if not lang:
+            # no lang found at all; fall back to the default language
+            lang = default_lang
 
-        return page.request.cfg.language_default
+        # return actual lang and lang to stem in
+        return (lang, default_lang)
 
     def _index_page(self, writer, page, mode='update'):
         """ Index a page - assumes that the write lock is acquired
@@ -535,7 +545,8 @@
         pagename = page.page_name
         mtime = page.mtime_usecs()
         itemid = "%s:%s" % (wikiname, pagename)
-        language = self._get_language(page)  # XXX: Hack until we get proper metadata
+        # XXX: Hack until we get proper metadata
+        language, stem_language = self._get_languages(page)
         updated = False
 
         if mode == 'update':
@@ -563,7 +574,8 @@
             xmtime = xapdoc.SortKey('mtime', mtime)
             xtitle = xapdoc.TextField('title', pagename, True) # prefixed
             xkeywords = [xapdoc.Keyword('itemid', itemid),
-                    xapdoc.Keyword('lang', language)]
+                    xapdoc.Keyword('lang', language),
+                    xapdoc.Keyword('stem_lang', stem_language)]
             for pagelink in page.getPageLinks(request):
                 xkeywords.append(xapdoc.Keyword('linkto', pagelink))
             xcontent = xapdoc.TextField('content', page.get_raw_body())
--- a/MoinMoin/search.py	Thu Jun 15 15:23:14 2006 +0200
+++ b/MoinMoin/search.py	Thu Jun 15 16:11:28 2006 +0200
@@ -404,7 +404,7 @@
     """ Search the term in the pagelinks """
 
     def __init__(self, pattern, use_re=False, case=True):
-        """ Init a title search
+        """ Init a link search
 
         @param pattern: pattern to search for, ascii string or unicode
         @param use_re: treat pattern as re of plain text, bool
@@ -484,6 +484,56 @@
             return UnicodeQuery('%s:%s' %
                     (Xapian.Index.prefixMap['linkto'], pattern))
 
+
+class LanguageSearch(BaseExpression):
+    """ Search the pages written in a language """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a language search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as re of plain text, bool
+        @param case: do case sensitive search, bool 
+        """
+        # used for search in languages, always lowercase
+        self._pattern = pattern.lower()
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self.xapian_called = False
+        self._build_re(self._pattern, use_re=use_re, case=case)
+
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return ""
+
+    def search(self, page):
+        # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch
+        if not self.xapian_called:
+            return None
+        else:
+            # XXX why not return None or empty list?
+            return [Match()]
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self):
+        pattern = self.pattern
+        if self.use_re:
+            return None # xapian doesn't support regex search
+        else:
+            self.xapian_called = True
+            return UnicodeQuery('%s%s' %
+                    (Xapian.Index.prefixMap['lang'], pattern))
+
+
 ############################################################################
 ### Results
 ############################################################################
@@ -782,7 +832,8 @@
         title_search = self.titlesearch
         regex = self.regex
         case = self.case
-        linkto = 0
+        linkto = False
+        lang = False
 
         for m in modifiers:
             if "title".startswith(m):
@@ -793,8 +844,12 @@
                 case = True
             elif "linkto".startswith(m):
                 linkto = True
+            elif "lang".startswith(m):
+                lang = True
 
-        if linkto:
+        if lang:
+            obj = LanguageSearch(text, use_re=regex, case=False)
+        elif linkto:
             obj = LinkSearch(text, use_re=regex, case=case)
         elif title_search:
             obj = TitleSearch(text, use_re=regex, case=case)
--- a/docs/CHANGES.fpletz	Thu Jun 15 15:23:14 2006 +0200
+++ b/docs/CHANGES.fpletz	Thu Jun 15 16:11:28 2006 +0200
@@ -7,12 +7,16 @@
     * Somethings' wrong with the matching of stemmed terms, i.e. matches
       beyond single WikiWord borders although matching lower-case only
       (see MoinMoin/search.py:92)
+    * Regex searching with Xapian?
 
   ToDo:
+    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata)
     * Mockup the new search UI
 
   New Features:
-    * TBD
+    * Faster search thanks to Xapian
+    * Searching for languages with new prefix 'lang', e.g. lang:de
+      Note: Only available when Xapian is activated
   
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
     * ...
@@ -36,5 +40,6 @@
 2006-06-11 Now handling prefixes correctly (title -> S, XLINKTO always
 with ':')
 
-2006-06-15 Integrated stemming, english only for now (see issues).
-
+2006-06-15
+    * Integrated basic stemming, English only for now (see issues).
+    * Introduced LanguageSearch (lang:)