changeset 807:0ffa96e9e7f2 pytest2

search tokenizers/analyzers/schema: add comments/docstrings
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Tue, 30 Aug 2011 23:21:00 +0200
parents 1165a3658720
children 4411b124f647
files MoinMoin/search/analyzers.py MoinMoin/search/indexing.py
diffstat 2 files changed, 38 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/analyzers.py	Tue Aug 30 21:19:15 2011 +0200
+++ b/MoinMoin/search/analyzers.py	Tue Aug 30 23:21:00 2011 +0200
@@ -20,10 +20,10 @@
         Tokenizer behaviour:
 
         Input: u"text/x.moin.wiki;charset=utf-8"
-        Output: u"text", u"x.moin.wiki", u"charset=utf-8"
+        Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki", u"charset=utf-8"
 
         Input: u"application/pdf"
-        Output: u"application", u"pdf"
+        Output: u"application/pdf", u"application", u"pdf"
 
         :param value: String for tokenization
         :param start_pos: The position number of the first token. For example,
--- a/MoinMoin/search/indexing.py	Tue Aug 30 21:19:15 2011 +0200
+++ b/MoinMoin/search/indexing.py	Tue Aug 30 23:21:00 2011 +0200
@@ -63,31 +63,60 @@
         self._index_dir = index_dir or self._cfg.index_dir
 
         common_fields = dict(
+            # wikiname so we can have a shared index in a wiki farm, always check this!
+            # taken from app.cfg.interwikiname
             wikiname=ID(stored=True),
+            # tokenized NAME from metadata - use this for manual searching from UI
             name=TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
+            # unmodified NAME from metadata - use this for precise lookup by the code.
+            # also needed for wildcard search, so that neither the original string nor
+            # the query (with the wildcard) is cut into pieces.
             name_exact=ID(field_boost=3.0),
+            # revision number, integer 0..n
             rev_no=NUMERIC(stored=True),
+            # MTIME from revision metadata (converted to UTC datetime)
             mtime=DATETIME(stored=True),
+            # tokenized CONTENTTYPE from metadata
             contenttype=TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
+            # unmodified list of TAGS from metadata
             tags=ID(stored=True),
+            # LANGUAGE from metadata
             language=ID(stored=True),
+            # USERID from metadata
             userid=ID(stored=True),
+            # ADDRESS from metadata
             address=ID(stored=True),
+            # HOSTNAME from metadata
             hostname=ID(stored=True),
+            # SIZE from metadata
             size=NUMERIC(stored=True),
+            # ACTION from metadata
             action=ID(stored=True),
+            # tokenized COMMENT from metadata
             comment=TEXT(stored=True, multitoken_query="and"),
+            # data (content), converted to text/plain and tokenized
             content=TEXT(stored=True, multitoken_query="and"),
         )
+        latest_revs_fields = dict(
+            # UUID from metadata - as there is only the latest rev of each item here, it is unique
+            uuid=ID(unique=True, stored=True),
+            # unmodified list of ITEMLINKS from metadata
+            itemlinks=ID(stored=True),
+            # unmodified list of ITEMTRANSCLUSIONS from metadata
+            itemtransclusions=ID(stored=True),
+            # tokenized ACL from metadata
+            acl=TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
+            **common_fields
+        )
 
-        self.latest_revisions_schema = Schema(uuid=ID(unique=True, stored=True),
-                                              itemlinks=ID(stored=True),
-                                              itemtransclusions=ID(stored=True),
-                                              acl=TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
-                                              **common_fields)
+        all_revs_fields = dict(
+            # UUID from metadata
+            uuid=ID(stored=True),
+            **common_fields
+        )
 
-        self.all_revisions_schema = Schema(uuid=ID(stored=True),
-                                           **common_fields)
+        self.latest_revisions_schema = Schema(**latest_revs_fields)
+        self.all_revisions_schema = Schema(**all_revs_fields)
 
         # Define dynamic fields
         dynamic_fields = [("*_id", ID(stored=True)),