changeset 925:4508fc92fcb1

index exact positions of terms (postings)
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Wed, 05 Jul 2006 12:19:22 +0200
parents 22f6f589162a
children 134b5ee99046
files MoinMoin/search/Xapian.py MoinMoin/search/queryparser.py MoinMoin/support/xapwrap/document.py
diffstat 3 files changed, 39 insertions(+), 24 deletions(-)
--- a/MoinMoin/search/Xapian.py	Sat Jul 01 22:47:14 2006 +0200
+++ b/MoinMoin/search/Xapian.py	Wed Jul 05 12:19:22 2006 +0200
@@ -92,38 +92,47 @@
             tokenstream = re.finditer(self.token_re, value)
             for m in tokenstream:
                 if m.group("acronym"):
-                    yield enc(m.group("acronym").replace('.', ''))
+                    yield (enc(m.group("acronym").replace('.', '')),
+                            m.start())
                 elif m.group("company"):
-                    yield enc(m.group("company"))
+                    yield (enc(m.group("company")), m.start())
                 elif m.group("email"):
+                    displ = 0
                     for word in self.mail_re.split(m.group("email")):
                         if word:
-                            yield enc(word)
+                            yield (enc(word), m.start() + displ)
+                            displ += len(word) + 1
                 elif m.group("hostname"):
+                    displ = 0
                     for word in self.dot_re.split(m.group("hostname")):
-                        yield enc(word)
+                        yield (enc(word), m.start() + displ)
+                        displ += len(word) + 1
                 elif m.group("num"):
+                    displ = 0
                     for word in self.dot_re.split(m.group("num")):
-                        yield enc(word)
+                        yield (enc(word), m.start() + displ)
+                        displ += len(word) + 1
                 elif m.group("word"):
                     word = m.group("word")
-                    yield enc(word)
+                    yield (enc(word), m.start())
                     # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
                     if self.wikiword_re.match(word):
                         for sm in re.finditer(self.singleword_re, word):
-                            yield enc(sm.group())
+                            yield (enc(sm.group()), m.start() + sm.start())
 
     def tokenize(self, value, flat_stemming=True):
         """Yield a stream of lower cased raw and stemmed (optional) words from a string.
            value must be an UNICODE object or a list of unicode objects
         """
-        for i in self.raw_tokenize(value):
+        for word, pos in self.raw_tokenize(value):
             if flat_stemming:
-                yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i
+                # XXX: should we really use a prefix for that?
+                # Index.prefixMap['raw'] + word
+                yield (word, pos)
                 if self.stemmer:
-                    yield self.stemmer.stemWord(i)
+                    yield (self.stemmer.stemWord(word), pos)
             else:
-                yield (i, self.stemmer.stemWord(i))
+                yield (word, self.stemmer.stemWord(word), pos)
 
 
 #############################################################################
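The WikiAnalyzer hunks above change raw_tokenize() to yield (term, character offset) pairs instead of bare terms, and tokenize() to pass the offset through: as pairs when flat_stemming is true, as (raw, stemmed, position) triples otherwise. A rough sketch of the new output shape, assuming a usable request object and that enc() only lowercases (both assumptions, not taken from this changeset):

    from MoinMoin.search.Xapian import WikiAnalyzer

    analyzer = WikiAnalyzer(request=request, language='en')
    list(analyzer.raw_tokenize(u'WikiWord rocks'))
    # roughly: [(u'wikiword', 0), (u'wiki', 0), (u'word', 4), (u'rocks', 9)]
    # CamelCase parts reuse the offset of the whole word plus their own start

    list(analyzer.tokenize(u'WikiWord rocks', flat_stemming=False))
    # roughly: [(raw, stemmed, position), ...], one triple per raw token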
--- a/MoinMoin/search/queryparser.py	Sat Jul 01 22:47:14 2006 +0200
+++ b/MoinMoin/search/queryparser.py	Wed Jul 05 12:19:22 2006 +0200
@@ -323,13 +323,13 @@
                 if request.cfg.xapian_stemming:
                     # stemmed OR not stemmed
                     tmp = []
-                    for i in analyzer.tokenize(t, flat_stemming=False):
-                        tmp.append(UnicodeQuery(Query.OP_OR, i))
-                        stemmed.append(i[1])
+                    for w, s, pos in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, (w, s)))
+                        stemmed.append(w)
                     t = tmp
                 else:
                     # just not stemmed
-                    t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
+                    t = [UnicodeQuery(w) for w, pos in analyzer.tokenize(t)]
                 queries.append(Query(Query.OP_AND, t))
 
             if stemmed:
@@ -423,7 +423,7 @@
             analyzer = Xapian.WikiAnalyzer(request=request,
                     language=request.cfg.language_default)
             terms = self._pattern.split()
-            terms = [list(analyzer.raw_tokenize(t)) for t in terms]
+            terms = [[w for w, pos in analyzer.raw_tokenize(t)] for t in terms]
 
             # all parsed wikiwords, AND'ed
             queries = []
@@ -432,15 +432,16 @@
                 if request.cfg.xapian_stemming:
                     # stemmed OR not stemmed
                     tmp = []
-                    for i in analyzer.tokenize(t, flat_stemming=False):
-                        tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' %
-                            (Xapian.Index.prefixMap['title'], j) for j in i]))
-                        stemmed.append(i[1])
+                    for w, s, pos in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR,
+                            ['%s%s' % (Xapian.Index.prefixMap['title'], j)
+                                for j in (w, s)]))
+                        stemmed.append(w)
                     t = tmp
                 else:
                     # just not stemmed
-                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i))
-                        for i in analyzer.tokenize(t)]
+                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], w))
+                        for w, pos in analyzer.tokenize(t)]
 
                 queries.append(Query(Query.OP_AND, t))
 
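The queryparser hunks above only unpack the extended tuples coming out of tokenize(); the position element is ignored when building queries. For each token of the search pattern the raw and the stemmed form are OR'ed together (title-prefixed in the title search case), and the per-token nodes are AND'ed, roughly:

    # per token: raw form OR stemmed form (prefix = Xapian.Index.prefixMap['title'])
    tmp.append(UnicodeQuery(Query.OP_OR, [prefix + w, prefix + s]))
    # all tokens of the pattern AND'ed into one query
    queries.append(Query(Query.OP_AND, tmp))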
--- a/MoinMoin/support/xapwrap/document.py	Sat Jul 01 22:47:14 2006 +0200
+++ b/MoinMoin/support/xapwrap/document.py	Wed Jul 05 12:19:22 2006 +0200
@@ -140,12 +140,16 @@
 
     def toXapianDocument(self, indexValueMap, prefixMap=None):
         d = xapian.Document()
-        position = 1
+        position = 0
         analyzer = self.analyzerFactory()
 
         # add text fields
         for field in self.textFields:
             for token in analyzer.tokenize(field.text):
+                if isinstance(token, tuple):
+                    token, position = token
+                else:
+                    position += 1
                 # the xapian swig bindings don't like unicode objects, so we
                 # decode terms to UTF-8 before indexing. this is fine as
                 # long as all data that goes into the db (whether for
@@ -159,12 +163,13 @@
                 # the process, the string length could expand, so we
                 # need to check here as well.
                 d.add_posting(checkKeyLen(token), position)
-                position += 1
             position += INTER_FIELD_POSITION_GAP
 
             if field.prefix:
                 prefix = field.name
                 for token in analyzer.tokenize(field.text):
+                    if isinstance(token, tuple):
+                        token = token[0]
                     # token is unicode, but gets converted to UTF-8
                     # by makePairForWrite:
                     term = makePairForWrite(prefix, token, prefixMap)
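With the xapwrap change above, position information coming from the analyzer (the character offsets produced by WikiAnalyzer) is stored verbatim via add_posting(), while analyzers that still yield plain strings keep the old running counter. The stored postings can later be read back from the index; a rough sketch using the plain xapian Python bindings (hypothetical database path, document id and term):

    import xapian

    db = xapian.Database('/path/to/moin/index')       # hypothetical path
    # iterate the recorded positions (here: character offsets) of a term
    for pos in db.positionlist(42, 'positions'):      # hypothetical docid/term
        print pos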