changeset 924:22f6f589162a

term-based regexp search
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Sat, 01 Jul 2006 22:47:14 +0200
parents f16cf67d3440
children 4508fc92fcb1
files MoinMoin/search/Xapian.py MoinMoin/search/builtin.py MoinMoin/search/queryparser.py docs/CHANGES.fpletz
diffstat 4 files changed, 84 insertions(+), 40 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/Xapian.py	Sat Jul 01 20:18:39 2006 +0200
+++ b/MoinMoin/search/Xapian.py	Sat Jul 01 22:47:14 2006 +0200
@@ -209,13 +209,12 @@
             self.queue.remove([name])
         writer.close()
 
-    # XXX: why?
-    #def test(self, request):
-    #   idx = xapidx.ReadOnlyIndex(self.dir)
-    #   idx.configure(self.prefixMap, self.indexValueMap)
-    #   print idx.search("is")
-    #   #for d in docs:
-    #   #    request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename')))
+    def allterms(self):
+        db = xapidx.ExceptionTranslater.openIndex(True, self.dir)
+        i = db.allterms_begin()
+        while i != db.allterms_end():
+            yield i.get_term()
+            i.next()
 
     def _index_file(self, request, writer, filename, mode='update'):
         """ index a file as it were a page named pagename
--- a/MoinMoin/search/builtin.py	Sat Jul 01 20:18:39 2006 +0200
+++ b/MoinMoin/search/builtin.py	Sat Jul 01 22:47:14 2006 +0200
@@ -384,11 +384,12 @@
             index = Index(self.request)
         except ImportError:
             index = None
-        if index and index.exists() and self.query.xapian_wanted():
+        if index and index.exists(): #and self.query.xapian_wanted():
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
-                query = self.query.xapian_term(self.request)
+                query = self.query.xapian_term(self.request,
+                        index.allterms)
                 self.request.log("xapianSearch: query = %r" %
                         query.get_description())
                 query = xapwrap.index.QObjQuery(query)
@@ -403,6 +404,8 @@
                 self.request.log("xapianSearch: finds pages: %r" % pages)
             except BaseIndex.LockedException:
                 pass
+            #except AttributeError:
+            #    pages = []
             self.request.clock.stop('_xapianSearch')
         return self._moinSearch(pages)
 
--- a/MoinMoin/search/queryparser.py	Sat Jul 01 20:18:39 2006 +0200
+++ b/MoinMoin/search/queryparser.py	Sat Jul 01 22:47:14 2006 +0200
@@ -177,15 +177,15 @@
             wanted = wanted and term.xapian_wanted()
         return wanted
 
-    def xapian_term(self, request):
+    def xapian_term(self, request, allterms):
         # sort negated terms
         terms = []
         not_terms = []
         for term in self._subterms:
             if not term.negated:
-                terms.append(term.xapian_term(request))
+                terms.append(term.xapian_term(request, allterms))
             else:
-                not_terms.append(term.xapian_term(request))
+                not_terms.append(term.xapian_term(request, allterms))
 
         # prepare query for not negated terms
         if len(terms) == 1:
@@ -226,9 +226,9 @@
                 matches.extend(result)
         return matches
 
-    def xapian_term(self, request):
+    def xapian_term(self, request, allterms):
         # XXX: negated terms managed by _moinSearch?
-        return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms])
+        return Query(Query.OP_OR, [term.xapian_term(request, allterms) for term in self._subterms])
 
 
 class TextSearch(BaseExpression):
@@ -303,9 +303,14 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self, request):
+    def xapian_term(self, request, allterms):
         if self.use_re:
-            return None # xapian can't do regex search
+            # basic regex matching per term
+            terms = [term for term in allterms() if
+                    self.search_re.match(term)]
+            if not terms:
+                return None
+            queries = [Query(Query.OP_OR, terms)]
         else:
             analyzer = Xapian.WikiAnalyzer(request=request,
                     language=request.cfg.language_default)
@@ -331,10 +336,10 @@
                 self._build_re(' '.join(stemmed), use_re=False,
                         case=self.case, stemmed=True)
 
-            # titlesearch OR parsed wikiwords
-            return Query(Query.OP_OR,
-                    (self.titlesearch.xapian_term(request),
-                        Query(Query.OP_AND, queries)))
+        # titlesearch OR parsed wikiwords
+        return Query(Query.OP_OR,
+                (self.titlesearch.xapian_term(request, allterms),
+                    Query(Query.OP_AND, queries)))
 
 
 class TitleSearch(BaseExpression):
@@ -406,9 +411,14 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self, request):
+    def xapian_term(self, request, allterms):
         if self.use_re:
-            return None # xapian doesn't support regex search
+            # basic regex matching per term
+            terms = [term for term in allterms() if
+                    self.search_re.match(term)]
+            if not terms:
+                return None
+            queries = [Query(Query.OP_OR, terms)]
         else:
             analyzer = Xapian.WikiAnalyzer(request=request,
                     language=request.cfg.language_default)
@@ -438,7 +448,7 @@
                 self._build_re(' '.join(stemmed), use_re=False,
                         case=self.case, stemmed=True)
 
-            return Query(Query.OP_AND, queries)
+        return Query(Query.OP_AND, queries)
 
 
 class LinkSearch(BaseExpression):
@@ -464,12 +474,10 @@
     def _build_re(self, pattern, use_re=False, case=False):
         """ Make a regular expression out of a text pattern """
         flags = case and re.U or (re.I | re.U)
-        try:
-            if not use_re:
-                raise re.error
+        if use_re:
             self.search_re = re.compile(pattern, flags)
             self.static = False
-        except re.error:
+        else:
             self.pattern = pattern
             self.static = True
         
@@ -516,13 +524,26 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self, request):
-        pattern = self.pattern
+    def xapian_term(self, request, allterms):
+        prefix = Xapian.Index.prefixMap['linkto']
         if self.use_re:
-            return None # xapian doesnt support regex search
+            # basic regex matching per term
+            terms = []
+            found = None
+            n = len(prefix)
+            for term in allterms():
+                if prefix == term[:n]:
+                    found = True
+                    if self.search_re.match(term[n+1:]):
+                        terms.append(term)
+                elif found:
+                    continue
+
+            if not terms:
+                return None
+            return Query(Query.OP_OR, terms)
         else:
-            return UnicodeQuery('%s:%s' %
-                    (Xapian.Index.prefixMap['linkto'], pattern))
+            return UnicodeQuery('%s:%s' % (prefix, self.pattern))
 
 
 class LanguageSearch(BaseExpression):
@@ -563,14 +584,28 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self, request):
-        pattern = self.pattern
+    def xapian_term(self, request, allterms):
+        self.xapian_called = True
+        prefix = Xapian.Index.prefixMap['lang']
         if self.use_re:
-            return None # xapian doesnt support regex search
+            # basic regex matching per term
+            terms = []
+            found = None
+            n = len(prefix)
+            for term in allterms():
+                if prefix == term[:n]:
+                    found = True
+                    if self.search_re.match(term[n:]):
+                        terms.append(term)
+                elif found:
+                    continue
+
+            if not terms:
+                return None
+            return Query(Query.OP_OR, terms)
         else:
-            self.xapian_called = True
-            return UnicodeQuery('%s%s' %
-                    (Xapian.Index.prefixMap['lang'], pattern))
+            pattern = self.pattern
+            return UnicodeQuery('%s%s' % (prefix, pattern))
 
 
 ##############################################################################
--- a/docs/CHANGES.fpletz	Sat Jul 01 20:18:39 2006 +0200
+++ b/docs/CHANGES.fpletz	Sat Jul 01 22:47:14 2006 +0200
@@ -2,11 +2,12 @@
 =============================
 
   Known main issues:
-    * Regex searching with Xapian?
+    * Only term-based regex searching possible, modifier or heuristic to
+      enable usage of _moinSearch for full compatibility?
     * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata)
 
   ToDo:
-    * Mockup the new search UI
+    * Implement the new search UI
     * Write/update documentation for all the new search stuff
     * Indexing and searching of categories (new term prefix)
     * Drop _moinSearch when using Xapian and use term positions provided
@@ -100,3 +101,9 @@
       derived from this, cleanups in calling structure and function
       prototypes to make it more extensible
 
+2006-06-29
+    * Tested some ideas with regexp searching
+
+2006-07-01
+    * Fully implemented term-based regexp searching
+