changeset 437:f96c2c8d7c91

attachment search (lupy only) imported from: moin--main--1.5--patch-441
author Thomas Waldmann <tw@waldmann-edv.de>
date Sun, 12 Feb 2006 20:27:04 +0000
parents ae3ae7e9e14d
children 8bb545d58e6f
files ChangeLog MoinMoin/action/fullsearch.py MoinMoin/lupy.py MoinMoin/search.py docs/CHANGES
diffstat 5 files changed, 183 insertions(+), 43 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Feb 12 12:15:01 2006 +0000
+++ b/ChangeLog	Sun Feb 12 20:27:04 2006 +0000
@@ -2,6 +2,21 @@
 # arch-tag: automatic-ChangeLog--arch@arch.thinkmo.de--2003-archives/moin--main--1.5
 #
 
+2006-02-12 21:27:04 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-441
+
+    Summary:
+      attachment search (lupy only)
+    Revision:
+      moin--main--1.5--patch-441
+
+    attachment search (lupy only)
+    
+
+    modified files:
+     ChangeLog MoinMoin/action/fullsearch.py MoinMoin/lupy.py
+     MoinMoin/search.py docs/CHANGES
+
+
 2006-02-12 13:15:01 GMT	Thomas Waldmann <tw@waldmann-edv.de>	patch-440
 
     Summary:
--- a/MoinMoin/action/fullsearch.py	Sun Feb 12 12:15:01 2006 +0000
+++ b/MoinMoin/action/fullsearch.py	Sun Feb 12 20:27:04 2006 +0000
@@ -66,12 +66,14 @@
     # XXX won't work with attachment search
     # improve if we have one...
     if len(results.hits) == 1:
-        page = Page(request, results.hits[0].page_name)
-        # TODO: remove escape=0 in 2.0
-        url = page.url(request, querystr={'highlight': query.highlight_re()},
-                       escape=0)
-        request.http_redirect(url)
-        raise MoinMoinNoFooter
+        page = results.hits[0]
+        if not page.attachment: # we did not find an attachment
+            page = Page(request, page.page_name)
+            # TODO: remove escape=0 in 2.0
+            url = page.url(request, querystr={'highlight': query.highlight_re()},
+                           escape=0)
+            request.http_redirect(url)
+            raise MoinMoinNoFooter
 
     # send http headers
     request.http_headers()
--- a/MoinMoin/lupy.py	Sun Feb 12 12:15:01 2006 +0000
+++ b/MoinMoin/lupy.py	Sun Feb 12 20:27:04 2006 +0000
@@ -252,16 +252,38 @@
         return hits
 
     def update_page(self, page):
+        self.queue.append(page.page_name)
+        self._do_queued_updates_InNewThread()
+
+    def _do_queued_updates_InNewThread(self):
+        """ do queued index updates in a new thread
+        
+        Should be called from a user request. From a script, use indexPages.
+
+        TODO: tune the acquire timeout
+        """
         if not self.lock.acquire(1.0):
-            self.queue.append(page.page_name)
+            self.request.log("can't index: can't acquire lock")
             return
-        self.request.clock.start('update_page')
         try:
-            self._do_queued_updates()
-            self._update_page(page)
-        finally:
+            from threading import Thread
+            indexThread = Thread(target=self._do_queued_updates,
+                args=(self._indexingRequest(self.request), self.lock))
+            indexThread.setDaemon(True)
+            
+            # Join the index thread after the current request finishes, to
+            # prevent Apache CGI from killing the process.
+            def joinDecorator(finish):
+                def func():
+                    finish()
+                    indexThread.join()
+                return func
+                
+            self.request.finish = joinDecorator(self.request.finish)        
+            indexThread.start()
+        except:
             self.lock.release()
-        self.request.clock.stop('update_page')
+            raise
 
     def indexPages(self):
         """ Index all pages
@@ -335,13 +357,18 @@
     # -------------------------------------------------------------------
     # Private
 
-    def _do_queued_updates(self, amount=5):
+    def _do_queued_updates(self, request, lock=None, amount=5):
         """ Assumes that the write lock is acquired """
-        pages = self.queue.pages()[:amount]
-        for name in pages:
-            p = Page(self.request, name)
-            self._update_page(p)
-        self.queue.remove(pages)
+        try:
+            self.translate_table = self.make_transtable()
+            pages = self.queue.pages() # [:amount]
+            for name in pages:
+                p = Page(request, name)
+                self._update_page(p)
+            self.queue.remove(pages)
+        finally:
+            if lock:
+                lock.release()
 
     def _update_page(self, page):
         """ Assumes that the write lock is acquired """
@@ -352,21 +379,52 @@
             writer = IndexWriter(self.dir, False, tokenizer)
             self._index_page(writer, page)
             writer.close()
-        
+   
+    def make_transtable(self):
+        import string
+        norm = string.maketrans('', '') # builds a list of all characters
+        non_alnum = string.translate(norm, norm, string.letters+string.digits) 
+        trans_nontext = string.maketrans(non_alnum, ' '*len(non_alnum))
+        return trans_nontext
+   
     def _index_page(self, writer, page):
         """ Assumes that the write lock is acquired """
         d = document.Document()
-        d.add(document.Keyword('pagename', page.page_name))
-        d.add(document.Text('title', page.page_name, store=False))        
+        pagename = page.page_name
+        request = page.request
+        d.add(document.Keyword('pagename', pagename))
+        d.add(document.Keyword('attachment', '')) # this is a real page, not an attachment
+        d.add(document.Text('title', pagename, store=False))        
         d.add(document.Text('text', page.get_raw_body(), store=False))
         
-        links = page.getPageLinks(page.request)
+        links = page.getPageLinks(request)
         t = document.Text('links', '', store=False)
         t.stringVal = links
         d.add(t)
         d.add(document.Text('link_text', ' '.join(links), store=False))
 
         writer.addDocument(d)
+        
+        from MoinMoin.action import AttachFile
+        def filecontent(fn):
+            f = file(fn, "rb")
+            data = f.read()
+            f.close()
+            data = data.translate(self.translate_table)
+            data = ' '.join(data.split()) # remove lots of blanks
+            return data.decode('utf-8')
+        
+        attachments = AttachFile._get_files(request, pagename)
+        for att in attachments:
+            att_content = filecontent(AttachFile.getFilename(request, pagename, att))
+            d = document.Document()
+            d.add(document.Keyword('pagename', pagename))
+            d.add(document.Keyword('attachment', att)) # this is an attachment, store its filename
+            d.add(document.Text('title', att, store=False)) # the filename is the "title" of an attachment
+            d.add(document.Text('text', att_content, store=False))
+            
+            writer.addDocument(d)
+
 
     def _index_pages(self, request, lock=None):
         """ Index all pages
@@ -386,6 +444,7 @@
             writer.mergeFactor = 50
             pages = request.rootpage.getPageList(user='', exists=1)
             request.log("indexing all (%d) pages..." % len(pages))
+            self.translate_table = self.make_transtable()
             for pagename in pages:
                 p = Page(request, pagename)
                 # code does NOT seem to assume request.page being set any more
--- a/MoinMoin/search.py	Sun Feb 12 12:15:01 2006 +0000
+++ b/MoinMoin/search.py	Sun Feb 12 20:27:04 2006 +0000
@@ -515,6 +515,7 @@
 
     def __init__(self, page_name, matches=None, page=None):
         self.page_name = page_name
+        self.attachment = '' # this is not an attachment
         self.page = page
         if matches is None:
             matches = []
@@ -605,7 +606,23 @@
 
 class FoundAttachment(FoundPage):
     """ Represent an attachment in search results """
-    pass
+    
+    def __init__(self, page_name, attachment, matches=None, page=None):
+        self.page_name = page_name
+        self.attachment = attachment
+        self.page = page
+        if matches is None:
+            matches = []
+        self._matches = matches
+
+    def weight(self, unique=1):
+        return 1
+
+    def get_matches(self, unique=1, sort='start', type=Match):
+        return []
+
+    def _unique_matches(self, type=Match):
+        return []
 
 
 ##############################################################################
@@ -793,13 +810,22 @@
             list = f.number_list
         else:
             list = f.bullet_list
-        querystr = self.querystring()
-            
+
         # Add pages formatted as list
         if self.hits:
             write(list(1))
 
             for page in self.hits:
+                if page.attachment:
+                    querydict = {
+                        'action': 'AttachFile',
+                        'do': 'get',
+                        'target': page.attachment,
+                    }
+                else:
+                    querydict = None
+                querystr = self.querystring(querydict)
+            
                 matchInfo = ''
                 if info:
                     matchInfo = self.formatInfo(f, page)
@@ -836,7 +862,6 @@
         self._reset(request, formatter)
         f = formatter
         write = self.buffer.write
-        querystr = self.querystring()
         
         # Add pages formatted as definition list
         if self.hits:
@@ -846,6 +871,17 @@
                 matchInfo = ''
                 if info:
                     matchInfo = self.formatInfo(f, page)
+                if page.attachment:
+                    fmt_context = ""
+                    querydict = {
+                        'action': 'AttachFile',
+                        'do': 'get',
+                        'target': page.attachment,
+                    }
+                else:
+                    fmt_context = self.formatContext(page, context, maxlines)
+                    querydict = None
+                querystr = self.querystring(querydict)
                 item = [
                     f.definition_term(1),
                     f.pagelink(1, page.page_name, querystr=querystr),
@@ -854,7 +890,7 @@
                     matchInfo,
                     f.definition_term(0),
                     f.definition_desc(1),
-                    self.formatContext(page, context, maxlines),
+                    fmt_context,
                     f.definition_desc(0),
                     ]
                 write(''.join(item))
@@ -1031,6 +1067,13 @@
         # Add text after match
         if start < len(pagename):
             output.append(f.text(pagename[start:]))
+        
+        if page.attachment: # show the attachment that matched
+            output.extend([
+                    " ",
+                    f.strong(1),
+                    f.text("(%s)" % page.attachment),
+                    f.strong(0)])
 
         return ''.join(output)
 
@@ -1058,11 +1101,12 @@
             return ''.join(output)
         return ''
 
-    def querystring(self):
+    def querystring(self, querydict=None):
         """ Return query string, used in the page link """
-        querystr = {'highlight': self.query.highlight_re()}
-        querystr = wikiutil.makeQueryString(querystr)
-        querystr = wikiutil.escape(querystr)
+        if querydict is None:
+            querydict = {'highlight': self.query.highlight_re()}
+        querystr = wikiutil.makeQueryString(querydict)
+        #querystr = wikiutil.escape(querystr)
         return querystr
 
     def formatInfo(self, formatter, page):
@@ -1115,7 +1159,7 @@
         self.filtered = False
 
     def run(self):
-        """ Preform search and return results object """
+        """ Perform search and return results object """
         start = time.time()
         if self.request.cfg.lupy_search:
             hits = self._lupySearch()
@@ -1125,11 +1169,17 @@
         # important - filter deleted pages or pages the user may not read!
         if not self.filtered:
             hits = self._filter(hits)
+        
+        result_hits = []
+        for page, attachment, match in hits:
+            if attachment:
+                result_hits.append(FoundAttachment(page.page_name, attachment))
+            else:
+                result_hits.append(FoundPage(page.page_name, match))
             
-        hits = [FoundPage(page.page_name, match) for page, match in hits]
         elapsed = time.time() - start
         count = self.request.rootpage.getPageCount()
-        return SearchResults(self.query, hits, count, elapsed)
+        return SearchResults(self.query, result_hits, count, elapsed)
 
     # ----------------------------------------------------------------
     # Private!
@@ -1146,14 +1196,14 @@
             self.request.clock.start('_lupySearch')
             try:
                 hits = index.search(self.query.lupy_term())
-                pages = [hit.get('pagename') for hit in hits]
+                pages = [(hit.get('pagename'), hit.get('attachment')) for hit in hits]
             except index.LockedException:
                 pass
             self.request.clock.stop('_lupySearch')
         return self._moinSearch(pages)
 
     def _moinSearch(self, pages=None):
-        """ Search pages using moin built in full text search 
+        """ Search pages using moin's built-in full text search 
         
         Return list of tuples (page, match). The list may contain
         deleted pages or pages the user may not read.
@@ -1161,13 +1211,18 @@
         self.request.clock.start('_moinSearch')
         from MoinMoin.Page import Page
         if pages is None:
-            pages = self._getPageList()
+            # if we are not called from _lupySearch, we make a full pagelist,
+            # but don't search attachments (thus attachment name = '')
+            pages = [(p, '') for p in self._getPageList()]
         hits = []
-        for name in pages:
-            page = Page(self.request, name)
-            match = self.query.search(page)
-            if match:
-                hits.append((page, match))
+        for pagename, attachment in pages:
+            page = Page(self.request, pagename)
+            if attachment:
+               hits.append((page, attachment, None))
+            else:
+                match = self.query.search(page)
+                if match:
+                    hits.append((page, attachment, match))
         self.request.clock.stop('_moinSearch')
         return hits
 
@@ -1190,7 +1245,7 @@
     def _filter(self, hits):
         """ Filter out deleted or acl protected pages """
         userMayRead = self.request.user.may.read
-        filtered = [(page, match) for page, match in hits
+        filtered = [(page, attachment, match) for page, attachment, match in hits
                     if page.exists() and userMayRead(page.page_name)]    
         return filtered
         
--- a/docs/CHANGES	Sun Feb 12 12:15:01 2006 +0000
+++ b/docs/CHANGES	Sun Feb 12 20:27:04 2006 +0000
@@ -33,6 +33,13 @@
     $ make test >tests/make_test.out
 
 Version 1.5.current:
+  New features:
+    * attachment search using lupy (lupy_search = 1 in your config)
+      Title search will also search attachment filenames.
+      Full text search will also search attachment contents (passed through
+      a very simple ASCII-only filter when the index is built, so non-ASCII
+      characters such as umlauts are not searchable).
+
   Bugfixes:
     * cookie_lifetime didn't work comfortable for low values. The cookie was
       created once on login and never updated afterwards. So you got logged
@@ -40,6 +47,8 @@
       that time or not. This has been changed, we update the cookie expiry now
       on every request, so it will expire cookie_lifetime after your last
       request (not after login).
+    * lupy search now behaves a bit less strangely. Still no guarantees...
+
 
 Version 1.5.2: