changeset 4976:8df5d749cf2d

Xapian2009: xapian_term() was refactored. Code repetition was reduced by introducing the BaseFieldSearch class. Field action definitions were updated.
author Dmitrijs Milajevs <dimazest@gmail.com>
date Tue, 04 Aug 2009 20:24:07 +0200
parents fc330376d50b
children 2171281b6d79
files MoinMoin/search/Xapian.py MoinMoin/search/queryparser.py
diffstat 2 files changed, 71 insertions(+), 129 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/Xapian.py	Tue Aug 04 15:23:42 2009 +0200
+++ b/MoinMoin/search/Xapian.py	Tue Aug 04 20:24:07 2009 +0200
@@ -66,15 +66,21 @@
         self.add_field_action('revision', STORE_CONTENT)
         self.add_field_action('revision',  INDEX_EXACT)
         self.add_field_action('mimetype ', INDEX_EXACT)
+        self.add_field_action('mimetype', STORE_CONTENT)
         self.add_field_action('title', INDEX_FREETEXT, weight=5)
         self.add_field_action('content', INDEX_FREETEXT, spell=True)
-        self.add_field_action('fulltittle',  INDEX_EXACT)
+        self.add_field_action('fulltitle',  INDEX_EXACT)
+        self.add_field_action('fulltitle', STORE_CONTENT)
         self.add_field_action('domain',  INDEX_EXACT)
+        self.add_field_action('domain', STORE_CONTENT)
         self.add_field_action('lang ',  INDEX_EXACT)
+        self.add_field_action('lang', STORE_CONTENT)
         self.add_field_action('stem_lang ',  INDEX_EXACT)
         self.add_field_action('author',  INDEX_EXACT)
         self.add_field_action('linkto',  INDEX_EXACT)
+        self.add_field_action('linkto', STORE_CONTENT)
         self.add_field_action('category',  INDEX_EXACT)
+        self.add_field_action('category', STORE_CONTENT)
 
 
 ##############################################################################
@@ -239,6 +245,17 @@
         """ Check if the Xapian index exists """
         return BaseIndex.exists(self) and os.listdir(self.dir)
 
+    def get_all_documents(self):
+        """
+        Return all the documents in the xapian index.
+        """
+        connection = xappy.SearchConnection(self.dir)
+        document_count = connection.get_doccount()
+        query = connection.query_all()
+        hits = connection.search(query, 0, document_count)
+        connection.close()
+        return hits
+
     def _search(self, query, sort='weight', historysearch=0):
         """
         Perform the search using xapian (read-lock acquired)
--- a/MoinMoin/search/queryparser.py	Tue Aug 04 15:23:42 2009 +0200
+++ b/MoinMoin/search/queryparser.py	Tue Aug 04 20:24:07 2009 +0200
@@ -106,6 +106,28 @@
     def xapian_wanted(self):
         return False
 
+    def _get_query_for_search_re(self, connection, field_to_check=None):
+        """
+        Return a query which satisfy self.search_re for field values.
+        If field_to_check is given check values only for that field.
+        """
+        queries = []
+
+        documents = connection.get_all_documents()
+        for document in documents:
+            data = document.data
+            if field_to_check:
+                # Check only field with given name
+                if self.search_re.match(data[field_to_check]):
+                    queries.append(connection.query_field(field_to_check, data[field_to_check]))
+            else:
+                # Check all fields
+                for field, value in data.iteritems():
+                    if self.search_re.match(value):
+                        queries.append(connection.query_field(field, value))
+
+        return Query(Query.OP_OR, queries)
+
     def __unicode__(self):
         neg = self.negated and '-' or ''
         return u'%s%s"%s"' % (neg, self._tag, unicode(self._pattern))
@@ -367,13 +389,7 @@
         # XXX next version of xappy (>0.5) will provide Query class
         # it should be used.
         if self.use_re:
-            # XXX
-            pass
-#             basic regex matching per term
-#             terms = [term for term in allterms() if self.search_re.match(term)]
-#             if not terms:
-#                 return Query()
-#             queries = [Query(Query.OP_OR, terms)]
+            queries = [self._get_query_for_search_re(connection)]
         else:
             analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default)
             terms = self._pattern.split()
@@ -476,19 +492,8 @@
 
     def xapian_term(self, request, allterms):
         if self.use_re:
-            # basic regex matching per term
-            terms = []
-            found = False
-            for term in allterms():
-                if term[:4] == 'XFT:':
-                    found = True
-                    if self.search_re.findall(term[4:]):
-                        terms.append(Query(term, 100))
-                elif found:
-                    break
-            if not terms:
-                return Query()
-            queries = [Query(Query.OP_OR, terms)]
+            # XXX weight for a query!
+            queries = [self._get_query_for_search_re(connection, 'fulltitle')]
         else:
             analyzer = Xapian.WikiAnalyzer(request=request,
                     language=request.cfg.language_default)
@@ -527,9 +532,22 @@
         return Query(Query.OP_AND, queries)
 
 
-class LinkSearch(BaseExpression):
+class BaseFieldSearch(BaseExpression):
+
+    _field_to_search = None
+
+    def xapian_term(self, request, allterms):
+        if self.use_re:
+            return self._get_query_for_search_re(connection, self._field_to_search)
+        else:
+            return connection.query_field(_field_to_search, self.pattern)
+
+
+class LinkSearch(BaseFieldSearch):
     """ Search the term in the pagelinks """
 
+    _field_to_search = 'linkto'
+
     def __init__(self, pattern, use_re=False, case=True):
         """ Init a link search
 
@@ -611,30 +629,11 @@
     def xapian_need_postproc(self):
         return self.case
 
-    def xapian_term(self, request, allterms):
-        prefix = Xapian.Index.prefixMap['linkto']
-        if self.use_re:
-            # basic regex matching per term
-            terms = []
-            found = None
-            n = len(prefix)
-            for term in allterms():
-                if prefix == term[:n]:
-                    found = True
-                    if self.search_re.match(term[n+1:]):
-                        terms.append(term)
-                elif found:
-                    continue
 
-            if not terms:
-                return Query()
-            return Query(Query.OP_OR, terms)
-        else:
-            return UnicodeQuery('%s:%s' % (prefix, self.pattern))
+class LanguageSearch(BaseFieldSearch):
+    """ Search the pages written in a language """
 
-
-class LanguageSearch(BaseExpression):
-    """ Search the pages written in a language """
+    _field_to_search = 'lang'
 
     def __init__(self, pattern, use_re=False, case=True):
         """ Init a language search
@@ -695,28 +694,6 @@
     def xapian_need_postproc(self):
         return False # case-sensitivity would make no sense
 
-    def xapian_term(self, request, allterms):
-        prefix = Xapian.Index.prefixMap['lang']
-        if self.use_re:
-            # basic regex matching per term
-            terms = []
-            found = None
-            n = len(prefix)
-            for term in allterms():
-                if prefix == term[:n]:
-                    found = True
-                    if self.search_re.match(term[n:]):
-                        terms.append(term)
-                elif found:
-                    continue
-
-            if not terms:
-                return Query()
-            return Query(Query.OP_OR, terms)
-        else:
-            pattern = self.pattern
-            return UnicodeQuery('%s%s' % (prefix, pattern))
-
 
 class CategorySearch(TextSearch):
     """ Search the pages belonging to a category """
@@ -756,31 +733,21 @@
         return self.case
 
     def xapian_term(self, request, allterms):
-        prefix = Xapian.Index.prefixMap['category']
+        # XXX Probably, it is a good idea to inherit this class from
+        # BaseFieldSearch and get rid of this definition
         if self.use_re:
-            # basic regex matching per term
-            terms = []
-            found = None
-            n = len(prefix)
-            for term in allterms():
-                if prefix == term[:n]:
-                    found = True
-                    if self.search_re.match(term[n+1:]):
-                        terms.append(term)
-                elif found:
-                    continue
-
-            if not terms:
-                return Query()
-            return Query(Query.OP_OR, terms)
+            return self._get_query_for_search_re(connection, 'category')
         else:
             pattern = self._pattern.lower()
-            return UnicodeQuery('%s:%s' % (prefix, pattern))
+            # XXX UnicodeQuery was used
+            return connection.query_field('category', pattern)
 
 
-class MimetypeSearch(BaseExpression):
+class MimetypeSearch(BaseFieldSearch):
     """ Search for files belonging to a specific mimetype """
 
+    _field_to_search = 'mimetype'
+
     def __init__(self, pattern, use_re=False, case=True):
         """ Init a mimetype search
 
@@ -820,31 +787,11 @@
     def xapian_need_postproc(self):
         return False # case-sensitivity would make no sense
 
-    def xapian_term(self, request, allterms):
-        prefix = Xapian.Index.prefixMap['mimetype']
-        if self.use_re:
-            # basic regex matching per term
-            terms = []
-            found = None
-            n = len(prefix)
-            for term in allterms():
-                if prefix == term[:n]:
-                    found = True
-                    if self.search_re.match(term[n:]):
-                        terms.append(term)
-                elif found:
-                    continue
 
-            if not terms:
-                return Query()
-            return Query(Query.OP_OR, terms)
-        else:
-            pattern = self._pattern
-            return UnicodeQuery('%s%s' % (prefix, pattern))
+class DomainSearch(BaseFieldSearch):
+    """ Search for pages belonging to a specific domain """
 
-
-class DomainSearch(BaseExpression):
-    """ Search for pages belonging to a specific domain """
+    _field_to_search = 'domain'
 
     def __init__(self, pattern, use_re=False, case=True):
         """ Init a domain search
@@ -903,28 +850,6 @@
     def xapian_need_postproc(self):
         return False # case-sensitivity would make no sense
 
-    def xapian_term(self, request, allterms):
-        prefix = Xapian.Index.prefixMap['domain']
-        if self.use_re:
-            # basic regex matching per term
-            terms = []
-            found = None
-            n = len(prefix)
-            for term in allterms():
-                if prefix == term[:n]:
-                    found = True
-                    if self.search_re.match(term[n+1:]):
-                        terms.append(term)
-                elif found:
-                    continue
-
-            if not terms:
-                return Query()
-            return Query(Query.OP_OR, terms)
-        else:
-            pattern = self._pattern
-            return UnicodeQuery('%s:%s' % (prefix, pattern))
-
 
 ##############################################################################
 ### Parse Query