changeset 793:a465544cff9a

use WikiAnalyzer, make analyzers yield unicode
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Tue, 02 May 2006 02:18:21 +0200
parents 3847b31a690f
children b7a31430bfc6
files MoinMoin/Xapian.py MoinMoin/parser/wiki.py MoinMoin/search.py MoinMoin/support/xapwrap/document.py MoinMoin/support/xapwrap/index.py
diffstat 5 files changed, 102 insertions(+), 74 deletions(-)
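In short: indexing and searching now share a single analyzer class, and tokens stay unicode until xapwrap encodes them to UTF-8 at the Xapian boundary. A minimal sketch of the new call pattern, assuming the modules as laid out in this changeset; the sample text and the listed tokens are only illustrative, since the exact CamelCase splitting depends on the wiki parser's word_rule:

    from MoinMoin import Xapian

    analyzer = Xapian.WikiAnalyzer()
    # tokenize() expects a unicode object (or a list of unicode objects for
    # page links) and yields lowercased unicode tokens; CamelCase words are
    # additionally split into their single-word parts.
    for token in analyzer.tokenize(u"WikiAnalyzer yields CamelCase tokens"):
        print repr(token)
    # roughly: u'wikianalyzer', u'wiki', u'analyzer', u'yields',
    #          u'camelcase', u'camel', u'case', u'tokens'
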
--- a/MoinMoin/Xapian.py	Mon May 01 19:26:01 2006 +0200
+++ b/MoinMoin/Xapian.py	Tue May 02 02:18:21 2006 +0200
@@ -12,6 +12,7 @@
 
 from MoinMoin.support.xapwrap import document as xapdoc
 from MoinMoin.support.xapwrap import index as xapidx
+from MoinMoin.parser.wiki import Parser as WikiParser
 
 from MoinMoin.Page import Page
 from MoinMoin import config, wikiutil
@@ -22,55 +23,65 @@
 ### Tokenizer
 ##############################################################################
 
-singleword = r"[%(u)s][%(l)s]+" % {
-                 'u': config.chars_upper,
-                 'l': config.chars_lower,
-             }
-
-singleword_re = re.compile(singleword, re.U)
-wikiword_re = re.compile(r"^(%s){2,}$" % singleword, re.U)
-
-token_re = re.compile(
-    r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
-    r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +    # email addresses
-    r"(?P<hostname>\w+(\.\w+)+)|" +                 # hostnames
-    r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers
-    r"(?P<acronym>(\w\.)+)|" +          # acronyms: U.S.A., I.B.M., etc.
-    r"(?P<word>\w+)",                   # words
-    re.U)
-
-dot_re = re.compile(r"[-_/,.]")
-mail_re = re.compile(r"[-_/,.]|(@)")
+class WikiAnalyzer:
+    singleword = r"[%(u)s][%(l)s]+" % {
+                     'u': config.chars_upper,
+                     'l': config.chars_lower,
+                 }
 
-def tokenizer(value):
-    """Yield a stream of lower cased words from a string."""
-    if isinstance(value, list): # used for page links
-        for v in value:
-            yield v
-    else:
-        tokenstream = re.finditer(token_re, value)
-        for m in tokenstream:
-            if m.group("acronym"):
-                yield m.group("acronym").replace('.', '').lower()
-            elif m.group("company"):
-                yield m.group("company").lower()
-            elif m.group("email"):
-                for word in mail_re.split(m.group("email").lower()):
-                    if word:
-                        yield word
-            elif m.group("hostname"):                
-                for word in dot_re.split(m.group("hostname").lower()):
-                    yield word
-            elif m.group("num"):
-                for word in dot_re.split(m.group("num").lower()):
-                    yield word
-            elif m.group("word"):
-                word = m.group("word")
-                yield  word.lower()
-                # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
-                if wikiword_re.match(word):
-                    for sm in re.finditer(singleword_re, word):
-                        yield sm.group().lower()
+    singleword_re = re.compile(singleword, re.U)
+    wikiword_re = re.compile(WikiParser.word_rule, re.U)
+
+    token_re = re.compile(
+        r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
+        r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +    # email addresses
+        r"(?P<hostname>\w+(\.\w+)+)|" +                 # hostnames
+        r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers
+        r"(?P<acronym>(\w\.)+)|" +          # acronyms: U.S.A., I.B.M., etc.
+        r"(?P<word>\w+)",                   # words
+        re.U)
+
+    dot_re = re.compile(r"[-_/,.]")
+    mail_re = re.compile(r"[-_/,.]|(@)")
+    
+    # XXX limit stuff above to xapdoc.MAX_KEY_LEN
+    # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
+
+    def tokenize(self, value):
+        """Yield a stream of lower cased words from a string.
+           value must be an UNICODE object or a list of unicode objects
+        """
+        def enc(uc):
+            lower = uc.lower()
+            return lower
+            
+        if isinstance(value, list): # used for page links
+            for v in value:
+                yield enc(v)
+        else:
+            tokenstream = re.finditer(self.token_re, value)
+            for m in tokenstream:
+                if m.group("acronym"):
+                    yield enc(m.group("acronym").replace('.', ''))
+                elif m.group("company"):
+                    yield enc(m.group("company"))
+                elif m.group("email"):
+                    for word in self.mail_re.split(m.group("email")):
+                        if word:
+                            yield enc(word)
+                elif m.group("hostname"):                
+                    for word in self.dot_re.split(m.group("hostname")):
+                        yield enc(word)
+                elif m.group("num"):
+                    for word in self.dot_re.split(m.group("num")):
+                        yield enc(word)
+                elif m.group("word"):
+                    word = m.group("word")
+                    yield  enc(word)
+                    # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
+                    if self.wikiword_re.match(word):
+                        for sm in re.finditer(self.singleword_re, word):
+                            yield enc(sm.group())
 
 
 #############################################################################
@@ -455,6 +466,7 @@
                                       keywords=(title, ),
                                       sortFields=(pname, attachment, mtime,),
                                      )
+                doc.analyzerFactory = WikiAnalyzer
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (filename, uid))
                     doc.uid = uid
@@ -497,12 +509,15 @@
             attachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
             mtime = xapdoc.SortKey('mtime', mtime)
             title = xapdoc.TextField('title', pagename, True) # prefixed
-            links = xapdoc.Keyword('link_text', ' '.join(page.getPageLinks(request)))
+            keywords = []
+            for pagelink in page.getPageLinks(request):
+                keywords.append(xapdoc.Keyword('linkto', pagelink.lower()))
             content = xapdoc.TextField('content', page.get_raw_body())
             doc = xapdoc.Document(textFields=(content, title),
-                                  keywords=(links,),
+                                  keywords=keywords,
                                   sortFields=(pname, attachment, mtime,),
                                  )
+            doc.analyzerFactory = WikiAnalyzer
             #search_db_language = "english"
             #stemmer = xapian.Stem(search_db_language)
             #pagetext = page.get_raw_body().lower()
@@ -555,6 +570,7 @@
                                       keywords=(title, ),
                                       sortFields=(pname, attachment, mtime,),
                                      )
+                doc.analyzerFactory = WikiAnalyzer
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (pagename, uid))
                     doc.uid = uid
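
On the indexing side, every xapwrap document now gets the analyzer attached via doc.analyzerFactory, and outgoing page links are stored as one lowercased 'linkto' keyword each instead of a single concatenated 'link_text' keyword. A condensed sketch of the page-document construction shown above; page, request, pagename and mtime are assumed to come from the surrounding indexing loop, and the pname sort key (defined above the hunk) is assumed to look like the other sort keys:

    from MoinMoin.support.xapwrap import document as xapdoc
    from MoinMoin.Xapian import WikiAnalyzer

    pname = xapdoc.SortKey('pagename', pagename)       # assumed; not shown in the hunk
    attachment = xapdoc.SortKey('attachment', '')      # a real page, not an attachment
    mtime = xapdoc.SortKey('mtime', mtime)
    title = xapdoc.TextField('title', pagename, True)  # prefixed field
    content = xapdoc.TextField('content', page.get_raw_body())
    keywords = [xapdoc.Keyword('linkto', link.lower())
                for link in page.getPageLinks(request)]

    doc = xapdoc.Document(textFields=(content, title),
                          keywords=keywords,
                          sortFields=(pname, attachment, mtime))
    doc.analyzerFactory = WikiAnalyzer  # terms for this document come from WikiAnalyzer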
--- a/MoinMoin/parser/wiki.py	Mon May 01 19:26:01 2006 +0200
+++ b/MoinMoin/parser/wiki.py	Tue May 02 02:18:21 2006 +0200
@@ -29,7 +29,6 @@
     Dependencies = []
 
     # some common strings
-    PARENT_PREFIX = wikiutil.PARENT_PREFIX
     attachment_schemas = ["attachment", "inline", "drawing"]
     punct_pattern = re.escape(u'''"\'}]|:,.)?!''')
     url_pattern = (u'http|https|ftp|nntp|news|mailto|telnet|wiki|file|irc|' +
@@ -41,8 +40,9 @@
         'u': config.chars_upper,
         'l': config.chars_lower,
         'subpages': wikiutil.CHILD_PREFIX + '?',
-        'parent': ur'(?:%s)?' % re.escape(PARENT_PREFIX),
+        'parent': ur'(?:%s)?' % re.escape(wikiutil.PARENT_PREFIX),
     }
+
     url_rule = ur'%(url_guard)s(%(url)s)\:([^\s\<%(punct)s]|([%(punct)s][^\s\<%(punct)s]))+' % {
         'url_guard': u'(^|(?<!\w))',
         'url': url_pattern,
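
The change to the parser itself is cosmetic (the PARENT_PREFIX alias is dropped in favour of wikiutil.PARENT_PREFIX); what matters for this changeset is that the parser's word_rule, defined a few lines below this hunk, is now reused by WikiAnalyzer as its wikiword_re. word_rule itself is not shown here; as a rough stand-in, a simplified CamelCase matcher built from the same character classes the old tokenizer used:

    import re
    from MoinMoin import config

    # Simplified stand-in for Parser.word_rule: two or more Capitalised runs.
    singleword = r"[%(u)s][%(l)s]+" % {'u': config.chars_upper,
                                       'l': config.chars_lower}
    wikiword_re = re.compile(r"^(%s){2,}$" % singleword, re.U)

    assert wikiword_re.match(u"CamelCase")
    assert not wikiword_re.match(u"lowercase")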
--- a/MoinMoin/search.py	Mon May 01 19:26:01 2006 +0200
+++ b/MoinMoin/search.py	Tue May 02 02:18:21 2006 +0200
@@ -248,12 +248,12 @@
             return [Match()]
 
     def xapian_term(self):
-        pattern = self._pattern.lower()
         if self.use_re:
             return '' # xapian can't do regex search
         else:
-            terms = pattern.split()
-            terms = [list(Xapian.tokenizer(t)) for t in terms]
+            analyzer = Xapian.WikiAnalyzer()
+            terms = self._pattern.split()
+            terms = [list(analyzer.tokenize(t)) for t in terms]
             term = []
             for t in terms:
                 term.append(" AND ".join(t))
@@ -312,12 +312,17 @@
             return [Match()]
 
     def xapian_term(self):
-        pattern = self._pattern.lower()
         if self.use_re:
             return '' # xapian doesn't support regex search
         else:
-            return 'title:%s' % pattern
-
+            analyzer = Xapian.WikiAnalyzer()
+            terms = self._pattern.split()
+            terms = [list(analyzer.tokenize(t)) for t in terms]
+            term = []
+            for t in terms:
+                term.append(" AND ".join(t))
+            term = '%s title:(%s)' % (self.negated and "NOT" or "", " AND ".join(term))
+            return term
 
 class LinkSearch(BaseExpression):
     """ Search the term in the pagelinks """
@@ -398,7 +403,8 @@
         if self.use_re:
             return '' # xapian doesnt support regex search
         else:
-            return 'linkto:%s' % pattern
+            term = '%s linkto:%s' % (self.negated and "NOT" or "", pattern.lower())
+            return term
 
 ############################################################################
 ### Results
@@ -613,7 +619,9 @@
         self.regex = kw.get('regex', 0)
 
     def parse_query(self, query):
-        """ transform an string into a tree of Query objects"""
+        """ transform an string into a tree of Query objects """
+        if isinstance(query, str):
+            query = query.decode(config.charset)
         self._query = query
         result = self._or_expression()
         if result is None:
@@ -650,7 +658,7 @@
                  r'(?P<OPS>\(|\)|(or\b(?!$)))|' +  # or, (, )
                  r'(?P<MOD>(\w+:)*)' +
                  r'(?P<TERM>("[^"]+")|' +
-                  r"('[^']+')|(\S+)))")             # search word itself
+                 r"('[^']+')|(\S+)))")             # search word itself
         self._query = self._query.strip()
         match = re.match(regex, self._query, re.U)
         if not match:
@@ -1164,8 +1172,8 @@
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
-                query = self.query.xapian_term().encode(config.charset)
-                self.request.log("xapianSearch: query = %s" % query)
+                query = self.query.xapian_term()
+                self.request.log("xapianSearch: query = %r" % query)
                 query = xapwrap.index.ParsedQuery(query)
                 hits = index.search(query)
                 self.request.log("xapianSearch: finds: %r" % hits)
--- a/MoinMoin/support/xapwrap/document.py	Mon May 01 19:26:01 2006 +0200
+++ b/MoinMoin/support/xapwrap/document.py	Tue May 02 02:18:21 2006 +0200
@@ -34,13 +34,8 @@
         # OS/libc/$LC_CTYPE dependant result
         text = originalText.lower()
         for match in self.WORD_RE.finditer(text):
-            # the xapian swig bindings don't like unicode objects, so we
-            # decode terms to UTF-8 before indexing. this is fine as
-            # long as all data that goes into the db (whether for
-            # indexing or search) is converted to UTF-8 string and all
-            # data coming from the db (.get_value(), .get_data()) is
-            # decoded as UTF-8.
-            yield match.group().encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
+            # we yield unicode ONLY
+            yield match.group()
 
 
 class TextField(object):
@@ -150,6 +145,13 @@
         # add text fields
         for field in self.textFields:
             for token in analyzer.tokenize(field.text):
+                # the xapian swig bindings don't like unicode objects, so we
+                # decode terms to UTF-8 before indexing. this is fine as
+                # long as all data that goes into the db (whether for
+                # indexing or search) is converted to UTF-8 string and all
+                # data coming from the db (.get_value(), .get_data()) is
+                # decoded as UTF-8.
+                token = token.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
                 # the tokenizer cannot guarantee that token length is
                 # below MAX_KEY_LEN since the regexp is done with
                 # unicode and the result is later converted to UTF-8. In
@@ -162,9 +164,8 @@
             if field.prefix:
                 prefix = field.name
                 for token in analyzer.tokenize(field.text):
-                    # XXX FIXME: slight loss of efficiency here: token is
-                    # already known to be in UTF-8 and we convert it
-                    # back to unicode and then back to UTF-8 again...
+                    # token is unicode, but gets converted to UTF-8
+                    # by makePairForWrite:
                     term = makePairForWrite(prefix, token, prefixMap)
                     d.add_posting(term, position)
                     position += 1
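
In xapwrap, the UTF-8 conversion moves out of the analyzer and into the posting loop: tokens arrive as unicode and are encoded (and length-checked against MAX_KEY_LEN) only when they are handed to Xapian. A minimal sketch of that boundary; the module attributes are the ones referenced above, and the handling of over-long terms is simplified since the full loop is not shown in the hunk:

    from MoinMoin.support.xapwrap import document as xapdoc

    def add_text_postings(d, analyzer, text, position=0):
        # d is a xapian.Document, analyzer a WikiAnalyzer, text a unicode string
        for token in analyzer.tokenize(text):
            # analyzers yield unicode ONLY; encode at the Xapian boundary
            token = token.encode(xapdoc.UNICODE_ENCODING, xapdoc.UNICODE_ERROR_POLICY)
            if len(token) > xapdoc.MAX_KEY_LEN:  # byte length after UTF-8 encoding
                continue                         # simplified; real handling not shown
            d.add_posting(token, position)
            position += 1
        return position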
--- a/MoinMoin/support/xapwrap/index.py	Mon May 01 19:26:01 2006 +0200
+++ b/MoinMoin/support/xapwrap/index.py	Tue May 02 02:18:21 2006 +0200
@@ -121,6 +121,7 @@
 import cPickle, sets, glob, os
 import xapian
 from document import makePairForWrite, StandardAnalyzer, Document, SortKey, Keyword
+from document import UNICODE_ENCODING, UNICODE_ERROR_POLICY
 
 try:
     from atop.tpython import FilesystemLock
@@ -571,7 +572,7 @@
 
         # TODO - allow a simple way to get Keywords out
         self.setupDB()
-        if isinstance(query, str):
+        if isinstance(query, (str, unicode)):
             query = ParsedQuery(query)
         elif not(isinstance(query, Query)):
             raise ValueError("query %s must be either a string or a "
@@ -859,6 +860,8 @@
 
 class ParsedQuery(Query):
     def __init__(self, queryString):
+        if isinstance(queryString, unicode):
+            queryString = queryString.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY)
         self.queryString = queryString
 
     def prepare(self, queryParser):
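
Finally, the query path accepts unicode end to end: Index.search() takes str or unicode, and ParsedQuery encodes a unicode query string to UTF-8 itself, which is why search.py above no longer encodes query.xapian_term() before passing it on. A tiny illustration, using the same import style as search.py; the open index object is assumed:

    from MoinMoin.support import xapwrap

    # ParsedQuery now takes unicode directly and encodes it to UTF-8 internally.
    query = xapwrap.index.ParsedQuery(u' title:(camelcase AND camel AND case)')
    # hits = index.search(query)   # 'index' would be an open xapwrap index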