changeset 435:368e6a8e4f98

lupy: tokenizer adds the complete CamelCase word to the index (not only the split words), lowered mergeFactor, cleaner dir structure in cache directory, source cleanup
imported from: moin--main--1.5--patch-439
author Thomas Waldmann <tw@waldmann-edv.de>
date Sun, 12 Feb 2006 12:06:05 +0000
parents b8337d7318d4
children ae3ae7e9e14d
files ChangeLog MoinMoin/action/fullsearch.py MoinMoin/lupy.py MoinMoin/scripts/moin_optimize_index.py MoinMoin/search.py
diffstat 5 files changed, 101 insertions(+), 87 deletions(-)
line diff
--- a/ChangeLog	Sat Feb 11 15:52:39 2006 +0000
+++ b/ChangeLog	Sun Feb 12 12:06:05 2006 +0000
@@ -2,6 +2,21 @@
 # arch-tag: automatic-ChangeLog--arch@arch.thinkmo.de--2003-archives/moin--main--1.5
 #
 
+2006-02-12 13:06:05 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-439
+
+    Summary:
+      lupy: tokenizer adds the complete CamelCase word to the index (not only the split words), lowered mergeFactor, cleaner dir structure in cache directory, source cleanup
+    Revision:
+      moin--main--1.5--patch-439
+
+    lupy: tokenizer adds the complete CamelCase word to the index (not only the split words), lowered mergeFactor, cleaner dir structure in cache directory, source cleanup
+    
+
+    modified files:
+     ChangeLog MoinMoin/action/fullsearch.py MoinMoin/lupy.py
+     MoinMoin/scripts/moin_optimize_index.py MoinMoin/search.py
+
+
 2006-02-11 16:52:39 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-438
 
     Summary:
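
For illustration only (not part of the patch): with this change the tokenizer's word branch always emits the complete word and, when the word is a CamelCase WikiWord, additionally emits its single-word parts, so a search for either "WikiName" or "wiki" can hit the same page. A minimal sketch of that branch, using plain ASCII character classes as stand-ins for config.chars_upper / config.chars_lower:

    import re

    # stand-ins for config.chars_upper / config.chars_lower (the real classes are wider)
    singleword = r"[A-Z][a-z]+"
    singleword_re = re.compile(singleword, re.U)
    wikiword_re = re.compile(r"^(%s){2,}$" % singleword, re.U)

    def word_tokens(word):
        yield word.lower()                        # always index the complete word
        if wikiword_re.match(word):               # CamelCaseWord -> also camel, case, word
            for sm in re.finditer(singleword_re, word):
                yield sm.group().lower()

    print(list(word_tokens("CamelCaseWord")))     # ['camelcaseword', 'camel', 'case', 'word']
    print(list(word_tokens("plain")))             # ['plain']
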
--- a/MoinMoin/action/fullsearch.py	Sat Feb 11 15:52:39 2006 +0000
+++ b/MoinMoin/action/fullsearch.py	Sun Feb 12 12:06:05 2006 +0000
@@ -108,3 +108,4 @@
     # End content and send footer
     request.write(request.formatter.endContent())
     wikiutil.send_footer(request, pagename)
+
--- a/MoinMoin/lupy.py	Sat Feb 11 15:52:39 2006 +0000
+++ b/MoinMoin/lupy.py	Sun Feb 12 12:06:05 2006 +0000
@@ -20,14 +20,16 @@
 ### Tokenizer
 ##############################################################################
 
-word_re = re.compile(r"\w+", re.U)
-wikiword_re = re.compile(r"^([%(u)s][%(l)s]+)+$" % {'u': config.chars_upper,
-                                                'l': config.chars_lower}, re.U)
-singleword_re = re.compile(r"[%(u)s][%(l)s]+" % {'u': config.chars_upper,
-                                             'l': config.chars_lower}, re.U)
+singleword = r"[%(u)s][%(l)s]+" % {
+                 'u': config.chars_upper,
+                 'l': config.chars_lower,
+             }
+
+singleword_re = re.compile(singleword, re.U)
+wikiword_re = re.compile(r"^(%s){2,}$" % singleword, re.U)
 
 token_re = re.compile(
-    r"(?P<company>\w+[&@]\w+)|" + #company names like AT&T and Excite@Home.
+    r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
     r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +    # email addresses
     r"(?P<hostname>\w+(\.\w+)+)|" +                 # hostnames
     r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers
@@ -47,7 +49,7 @@
         tokenstream = re.finditer(token_re, value)
         for m in tokenstream:
             if m.group("acronym"):
-                yield m.group("acronym").replace('.','').lower()
+                yield m.group("acronym").replace('.', '').lower()
             elif m.group("company"):
                 yield m.group("company").lower()
             elif m.group("email"):
@@ -61,11 +63,12 @@
                 for word in dot_re.split(m.group("num").lower()):
                     yield word
             elif m.group("word"):
-                if wikiword_re.match(m.group("word")):
-                    for sm in re.finditer(singleword_re, m.group()):
+                word = m.group("word")
+                yield  word.lower()
+                # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
+                if wikiword_re.match(word):
+                    for sm in re.finditer(singleword_re, word):
                         yield sm.group().lower()
-                else:
-                    yield  m.group("word").lower()
 
 
 #############################################################################
@@ -117,7 +120,7 @@
         When the queue is empty, the queue file is removed, so exists()
         can tell if there is something waiting in the queue.
         
-        TODO: tune the timeout
+        TODO: tune timeout
         """
         if self.writeLock.acquire(30.0):
             try:
@@ -148,10 +151,9 @@
         unique = []
         seen = {}
         for name in pages:
-            if name in seen:
-                continue
-            unique.append(name)
-            seen[name] = 1
+            if not name in seen:
+                unique.append(name)
+                seen[name] = 1
         return unique
 
     def _read(self):
@@ -174,7 +176,7 @@
     def _write(self, pages):
         """ Write pages to queue file
         
-        Require queue write locking.
+        Requires queue write locking.
         """
         # XXX use tmpfile/move for atomic replace on real operating systems
         data = '\n'.join(pages) + '\n'
@@ -187,7 +189,7 @@
     def _removeFile(self):
         """ Remove queue file 
         
-        Require write locking.
+        Requires queue write locking.
         """
         try:
             os.remove(self.file)
@@ -195,6 +197,7 @@
             if err.errno != errno.ENOENT:
                 raise
 
+
 class Index:
     class LockedException(Exception):
         pass
@@ -202,16 +205,17 @@
     def __init__(self, request):
         self.request = request
         cache_dir = request.cfg.cache_dir
-        self.dir = os.path.join(cache_dir, 'lupy_index')
+        self.main_dir = os.path.join(cache_dir, 'lupy')
+        self.dir = os.path.join(self.main_dir, 'index')
         filesys.makeDirs(self.dir)
-        self.sig_file = os.path.join(self.dir, '__complete__')
+        self.sig_file = os.path.join(self.main_dir, 'complete')
         self.segments_file = os.path.join(self.dir, 'segments')
-        lock_dir = os.path.join(cache_dir, 'lupy_index_lock')
+        lock_dir = os.path.join(self.main_dir, 'index-lock')
         self.lock = lock.WriteLock(lock_dir,
                                    timeout=3600.0, readlocktimeout=60.0)
         self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
-        self.queue = UpdateQueue(os.path.join(self.dir, "__update_queue__"),
-                                 os.path.join(cache_dir, 'lupy_queue_lock'))
+        self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"),
+                                 os.path.join(self.main_dir, 'update-queue-lock'))
         
         # Disabled until we have a sane way to build the index with a
         # queue in small steps.
@@ -232,7 +236,7 @@
             while True:
                 try:
                     searcher, timestamp = self.request.cfg.lupy_searchers.pop()
-                    if timestamp!=self.mtime():
+                    if timestamp != self.mtime():
                         searcher.close()
                     else:
                         break
@@ -278,8 +282,7 @@
     def indexPagesInNewThread(self):
         """ Index all pages in a new thread
         
-        Should be called from a user request. From a script, use
-        indexPages.
+        Should be called from a user request. From a script, use indexPages.
 
         TODO: tune the acquire timeout
         """
@@ -336,7 +339,8 @@
         """ Assumes that the write lock is acquired """
         pages = self.queue.pages()[:amount]
         for name in pages:
-            self._update_page(Page(self.request, name))
+            p = Page(self.request, name)
+            self._update_page(p)
         self.queue.remove(pages)
 
     def _update_page(self, page):
@@ -367,11 +371,10 @@
     def _index_pages(self, request, lock=None):
         """ Index all pages
         
-        This should be called from indexPages or indexPagesInNewThread
-        only!
+        This should be called from indexPages or indexPagesInNewThread only!
         
-        This may take few minutes up to few hours, depending on the
-        size of the wiki.
+        This may take few minutes up to few hours, depending on the size of
+        the wiki.
 
         When called in a new thread, lock is acquired before the call,
         and this method must release it when it finishes or fails.
@@ -380,13 +383,14 @@
             self._unsign()
             start = time.time()
             writer = IndexWriter(self.dir, True, tokenizer)
-            writer.mergeFactor = 200
+            writer.mergeFactor = 50
             pages = request.rootpage.getPageList(user='', exists=1)
             request.log("indexing all (%d) pages..." % len(pages))
             for pagename in pages:
-                # Some code assumes request.page
-                request.page = Page(request, pagename)
-                self._index_page(writer, request.page)
+                p = Page(request, pagename)
+                # code does NOT seem to assume request.page being set any more
+                #request.page = p
+                self._index_page(writer, p)
             writer.close()
             request.log("indexing completed successfully in %0.2f seconds." % 
                         (time.time() - start))
@@ -398,6 +402,7 @@
 
     def _optimize(self, request):
         """ Optimize the index """
+        self._unsign()
         start = time.time()
         request.log("optimizing index...")
         writer = IndexWriter(self.dir, False, tokenizer)
@@ -405,6 +410,7 @@
         writer.close()
         request.log("optimizing completed successfully in %0.2f seconds." % 
                     (time.time() - start))
+        self._sign()
 
     def _indexingRequest(self, request):
         """ Return a new request that can be used for index building.
--- a/MoinMoin/scripts/moin_optimize_index.py	Sat Feb 11 15:52:39 2006 +0000
+++ b/MoinMoin/scripts/moin_optimize_index.py	Sun Feb 12 12:06:05 2006 +0000
@@ -9,7 +9,7 @@
 @copyright: 2005 by Florian Festi, Nir Soffer
 @license: GNU GPL, see COPYING for details.
 """
-
+doit = 1
 import os
 
 # Insert the path to MoinMoin in the start of the path
@@ -18,7 +18,8 @@
 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), 
                                 os.pardir, os.pardir))
 
-print """
+if not doit:
+    print """
 Until the following bug is closed, we avoid running this script:
 
 http://moinmoin.wikiwikiweb.de/MoinMoinBugs/LupyOptimizeBreaksIndex
@@ -27,7 +28,7 @@
 
 Terminating now, doing NOTHING...
 """
-sys.exit(1)
+    sys.exit(1)
 
 from MoinMoin.scripts.moin_build_index import IndexScript
 from MoinMoin.request import RequestCLI
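
Side note (editor's illustration, not part of the patch): _optimize() now drops the completeness signature before optimizing and restores it afterwards; since _lupySearch() in search.py only uses the index when exists() is true, searches fall back to the plain moin search while optimization is running instead of reading a half-merged index. A rough sketch of the signing pattern, assuming _sign() and _unsign() simply create and remove the sig_file that exists() checks:

    import os

    def _sign(sig_file):
        # mark the index as complete and usable
        open(sig_file, 'w').close()

    def _unsign(sig_file):
        # mark the index as incomplete while it is rebuilt or optimized
        try:
            os.remove(sig_file)
        except OSError:
            pass

    def optimize(sig_file, writer):
        _unsign(sig_file)
        writer.optimize()        # the Lupy IndexWriter call used in lupy.py
        writer.close()
        _sign(sig_file)
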
--- a/MoinMoin/search.py	Sat Feb 11 15:52:39 2006 +0000
+++ b/MoinMoin/search.py	Sun Feb 12 12:06:05 2006 +0000
@@ -65,9 +65,9 @@
             return None
     
     def costs(self):
-        """ estimated time to calculate this term
+        """ Return estimated time to calculate this term
         
-        Number is is relative to other terms and has no real unit.
+        Number is relative to other terms and has no real unit.
         It allows to do the fast searches first.
         """ 
         return 0
@@ -81,13 +81,7 @@
 
     def _build_re(self, pattern, use_re=False, case=False):
         """ Make a regular expression out of a text pattern """
-        if case:
-            # case sensitive
-            flags = re.U
-        else:
-            # ignore case
-            flags = re.U | re.I
-            
+        flags = case and re.U or (re.I | re.U)
         if use_re:
             try:
                 self.search_re = re.compile(pattern, flags)
@@ -135,15 +129,13 @@
     def pageFilter(self):
         """ Return a page filtering function
 
-        This function is used to filter page list before we search
-        it.
+        This function is used to filter page list before we search it.
 
-        Return a function that get a page name, and return bool, or None.
+        Return a function that gets a page name, and return bool, or None.
         """
         # Sort terms by cost, then get all title searches
         self.sortByCost()
-        terms = [term for term in self._subterms
-                 if isinstance(term, TitleSearch)]
+        terms = [term for term in self._subterms if isinstance(term, TitleSearch)]
         if terms:
             # Create and return a filter function
             def filter(name):
@@ -227,16 +219,14 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
-        self._build_re(self._pattern,
-                       use_re=use_re, case=case)
+        self._build_re(self._pattern, use_re=use_re, case=case)
         self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case)
         
     def costs(self):
         return 10000
     
     def __unicode__(self):
-        if self.negated: neg = '-'
-        else: neg = ''
+        neg = self.negated and '-' or ''
         return u'%s"%s"' % (neg, unicode(self._pattern))
 
     def highlight_re(self):
@@ -271,9 +261,9 @@
         or_term.add(term, False, False)
         pattern = self._pattern.lower()
         if self.use_re:
-            if pattern[0]=='^':
+            if pattern[0] == '^':
                 pattern = pattern[1:]
-            if pattern[:2]=='\b':
+            if pattern[:2] == '\b':
                 pattern = pattern[2:]
             term = RegularExpressionQuery(Term("text", pattern))
         else:
@@ -281,7 +271,7 @@
             terms = [list(tokenizer(t)) for t in terms]
             term = BooleanQuery()
             for t in terms:
-                if len(t)==1:
+                if len(t) == 1:
                     term.add(CamelCaseQuery(Term("text", t[0])), True, False)
                 else:
                     phrase = PhraseQuery()
@@ -293,7 +283,8 @@
             #term = TermQuery(Term("text", pattern))
         or_term.add(term, False, False)
         return or_term
-        
+
+
 class TitleSearch(BaseExpression):
     """ Term searches in pattern in page title only """
 
@@ -314,8 +305,7 @@
         return 100
 
     def __unicode__(self):
-        if self.negated: neg = '-'
-        else: neg = ''
+        neg = self.negated and '-' or ''
         return u'%s!"%s"' % (neg, unicode(self._pattern))
 
     def highlight_re(self):
@@ -349,13 +339,15 @@
     def lupy_term(self):
         pattern = self._pattern.lower()
         if self.use_re:
-            if pattern[0]=='^': pattern = pattern[1:]
+            if pattern[0] == '^':
+                pattern = pattern[1:]
             term = RegularExpressionQuery(Term("title", pattern))
         else:
             term = PrefixQuery(Term("title", pattern), 3)
         #term.boost = 100.0
         return term
-    
+
+
 class LinkSearch(BaseExpression):
     """ Search the term in the pagelinks """
 
@@ -379,8 +371,7 @@
 
     def _build_re(self, pattern, use_re=False, case=False):
         """ Make a regular expression out of a text pattern """
-        flags = (re.U | re.I, re.U)[case]
-
+        flags = case and re.U or (re.I | re.U)
         try:
             if not use_re:
                 raise re.error
@@ -394,7 +385,8 @@
         return 5000 # cheaper than a TextSearch
 
     def __unicode__(self):
-        return u'%s!"%s"' % (('', '-')[self.negated], unicode(self._pattern))
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
 
     def highlight_re(self):
         return u"(%s)" % self._textpattern    
@@ -418,7 +410,7 @@
             if results:
                 matches.extend(results)
             else: #This happens e.g. for pages that use navigation macros
-                matches.append(TextMatch(0,0))
+                matches.append(TextMatch(0, 0))
 
         # Decide what to do with the results.
         if ((self.negated and matches) or
@@ -433,7 +425,8 @@
     def lupy_term(self):        
         pattern = self.pattern
         if self.use_re:
-            if pattern[0]=="^": pattern = pattern[1:]
+            if pattern[0] == "^":
+                pattern = pattern[1:]
             term = RegularExpressionQuery(Term("links", pattern))
         else:
             term = TermQuery(Term("links", pattern))
@@ -619,7 +612,6 @@
 ### Parse Query
 ##############################################################################
 
-
 class QueryParser:
     """
     Converts a String into a tree of Query objects
@@ -727,6 +719,7 @@
         return (text.startswith('"') and text.endswith('"') or
                 text.startswith("'") and text.endswith("'"))
 
+
 ############################################################################
 ### Search results formatting
 ############################################################################
@@ -735,7 +728,7 @@
     """ Manage search results, supply different views
 
     Search results can hold valid search results and format them for
-    many requests, until the wiki content change.
+    many requests, until the wiki content changes.
 
     For example, one might ask for full page list sorted from A to Z,
     and then ask for the same list sorted from Z to A. Or sort results
@@ -883,7 +876,7 @@
         if not page.page:
             page.page = Page(self.request, page.page_name)
         body = page.page.get_raw_body()
-        last = len(body) -1
+        last = len(body) - 1
         lineCount = 0
         output = []
         
@@ -916,7 +909,7 @@
                     break
 
             # Add all matches in context and the text between them 
-            while 1:
+            while True:
                 match = matches[j]
                 # Ignore matches behind the current position
                 if start < match.end:
@@ -1096,8 +1089,7 @@
     def _reset(self, request, formatter):
         """ Update internal state before new output
 
-        Do not calls this, it should be called only by the instance
-        code.
+        Do not call this, it should be called only by the instance code.
 
         Each request might need different translations or other user
         preferences.
@@ -1109,6 +1101,7 @@
         _ = request.getText    
         self.matchLabel = (_('match'), _('matches'))
 
+
 ##############################################################################
 ### Searching
 ##############################################################################
@@ -1147,18 +1140,16 @@
         Get a list of pages using fast lupy search and return moin
         search in those pages.
         """
+        pages = None
         index = Index(self.request)
-        if not index.exists():
-            return self._moinSearch()
-        self.request.clock.start('_lupySearch')
-        try:
-            hits = index.search(self.query.lupy_term())
-            pages = [hit.get('pagename') for hit in hits]
-        except index.LockedException:
-            pages = None
-        self.request.clock.stop('_lupySearch')
-        if pages == []:
-            return pages
+        if index.exists():
+            self.request.clock.start('_lupySearch')
+            try:
+                hits = index.search(self.query.lupy_term())
+                pages = [hit.get('pagename') for hit in hits]
+            except index.LockedException:
+                pass
+            self.request.clock.stop('_lupySearch')
         return self._moinSearch(pages)
 
     def _moinSearch(self, pages=None):
@@ -1169,7 +1160,7 @@
         """
         self.request.clock.start('_moinSearch')
         from MoinMoin.Page import Page
-        if not pages:
+        if pages is None:
             pages = self._getPageList()
         hits = []
         for name in pages:
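
Closing note on the _lupySearch() rewrite above (editor's illustration, not part of the patch): pages now stays None when the index is missing or locked, and _moinSearch() expands None to the full page list; an empty list returned by lupy, by contrast, means "no candidate pages" and simply produces no hits. A condensed sketch of the control flow, with hypothetical stand-ins for the index, query and search objects:

    def lupy_then_moin_search(index, query, all_pages, moin_search):
        """ hypothetical helper, not MoinMoin API """
        pages = None                        # None = no usable candidate list from lupy
        if index.exists():
            try:
                pages = [hit.get('pagename') for hit in index.search(query)]
            except index.LockedException:
                pass                        # index busy: keep pages = None
        # what _moinSearch() now does with its argument:
        if pages is None:
            pages = all_pages               # no index -> scan the whole wiki
        return moin_search(pages)           # [] from lupy -> scan nothing -> no hits
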