changeset 807:0ffa96e9e7f2 (branch: pytest2)
search tokenizers/analyzers/schema: add comments/docstrings
author:    Thomas Waldmann <tw AT waldmann-edv DOT de>
date:      Tue, 30 Aug 2011 23:21:00 +0200
parents:   1165a3658720
children:  4411b124f647
files:     MoinMoin/search/analyzers.py MoinMoin/search/indexing.py
diffstat:  2 files changed, 38 insertions(+), 9 deletions(-)
--- a/MoinMoin/search/analyzers.py	Tue Aug 30 21:19:15 2011 +0200
+++ b/MoinMoin/search/analyzers.py	Tue Aug 30 23:21:00 2011 +0200
@@ -20,10 +20,10 @@
     Tokenizer behaviour:
 
     Input: u"text/x.moin.wiki;charset=utf-8"
-    Output: u"text", u"x.moin.wiki", u"charset=utf-8"
+    Output: u"text/x.moin.wiki;charset=utf-8", u"text", u"x.moin.wiki", u"charset=utf-8"
 
     Input: u"application/pdf"
-    Output: u"application", u"pdf"
+    Output: u"application/pdf", u"application", u"pdf"
 
     :param value: String for tokenization
     :param start_pos: The position number of the first token. For example,
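
The docstring change above pins down the tokenizer's contract: the unmodified
contenttype string is emitted as the first token, so an exact query for
u"text/x.moin.wiki;charset=utf-8" matches directly, while queries for single
parts such as u"pdf" keep working. A minimal Whoosh tokenizer honouring that
contract could look like the sketch below; it only illustrates the documented
behaviour, it is not the MoinMoin implementation, and the class name
MimeTokenizerSketch is made up:

    # illustrative sketch only - mirrors the docstring, not the real MimeTokenizer
    from whoosh.analysis import Tokenizer, Token

    class MimeTokenizerSketch(Tokenizer):
        """Yield the unmodified contenttype first, then its parts."""

        def __call__(self, value, positions=False, start_pos=0, **kwargs):
            tk = Token(positions, **kwargs)
            pos = start_pos
            # u"text/x.moin.wiki;charset=utf-8" -> type vs. parameters,
            # then major vs. minor type
            typ, _, params = value.partition(u";")
            major, _, minor = typ.partition(u"/")
            for text in (value, major, minor, params):
                if not text:
                    continue  # e.g. no ";charset=..." parameter present
                tk.text = text
                if positions:
                    tk.pos = pos
                    pos += 1
                yield tk

    # [t.text for t in MimeTokenizerSketch()(u"application/pdf")]
    # -> [u"application/pdf", u"application", u"pdf"]
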
--- a/MoinMoin/search/indexing.py	Tue Aug 30 21:19:15 2011 +0200
+++ b/MoinMoin/search/indexing.py	Tue Aug 30 23:21:00 2011 +0200
@@ -63,31 +63,60 @@
         self._index_dir = index_dir or self._cfg.index_dir
 
         common_fields = dict(
+            # wikiname so we can have a shared index in a wiki farm, always check this!
+            # taken from app.cfg.interwikiname
             wikiname=ID(stored=True),
+            # tokenized NAME from metadata - use this for manual searching from UI
             name=TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
+            # unmodified NAME from metadata - use this for precise lookup by the code.
+            # also needed for wildcard search, so the original string as well as the query
+            # (with the wildcard) is not cut into pieces.
             name_exact=ID(field_boost=3.0),
+            # revision number, integer 0..n
             rev_no=NUMERIC(stored=True),
+            # MTIME from revision metadata (converted to UTC datetime)
             mtime=DATETIME(stored=True),
+            # tokenized CONTENTTYPE from metadata
             contenttype=TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
+            # unmodified list of TAGS from metadata
             tags=ID(stored=True),
+            # LANGUAGE from metadata
             language=ID(stored=True),
+            # USERID from metadata
             userid=ID(stored=True),
+            # ADDRESS from metadata
             address=ID(stored=True),
+            # HOSTNAME from metadata
             hostname=ID(stored=True),
+            # SIZE from metadata
             size=NUMERIC(stored=True),
+            # ACTION from metadata
             action=ID(stored=True),
+            # tokenized COMMENT from metadata
             comment=TEXT(stored=True, multitoken_query="and"),
+            # data (content), converted to text/plain and tokenized
             content=TEXT(stored=True, multitoken_query="and"),
         )
 
+        latest_revs_fields = dict(
+            # UUID from metadata - as there is only latest rev of same item here, it is unique
+            uuid=ID(unique=True, stored=True),
+            # unmodified list of ITEMLINKS from metadata
+            itemlinks=ID(stored=True),
+            # unmodified list of ITEMTRANSCLUSIONS from metadata
+            itemtransclusions=ID(stored=True),
+            # tokenized ACL from metadata
+            acl=TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
+            **common_fields
+        )
-        self.latest_revisions_schema = Schema(uuid=ID(unique=True, stored=True),
-                                              itemlinks=ID(stored=True),
-                                              itemtransclusions=ID(stored=True),
-                                              acl=TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
-                                              **common_fields)
 
+        all_revs_fields = dict(
+            # UUID from metadata
+            uuid=ID(stored=True),
+            **common_fields
+        )
-        self.all_revisions_schema = Schema(uuid=ID(stored=True),
-                                          **common_fields)
 
+        self.latest_revisions_schema = Schema(**latest_revs_fields)
+        self.all_revisions_schema = Schema(**all_revs_fields)
 
         # Define dynamic fields
         dynamic_fields = [("*_id", ID(stored=True)),
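
The new field comments spell out a design worth demonstrating: name is
analyzed (good for people typing search words into the UI), name_exact is an
ID field kept in one piece (good for precise lookups and wildcards), and uuid
is unique only in the latest-revisions schema because each item appears there
exactly once. The standalone sketch below shows those three points against a
stripped-down schema; all names, values and the temp-dir location are
hypothetical, and this is not the MoinMoin indexing code:

    # illustrative, simplified sketch - not the MoinMoin indexing code
    import datetime, tempfile
    from whoosh.fields import Schema, ID, TEXT, NUMERIC, DATETIME
    from whoosh.index import create_in
    from whoosh.query import Term, Wildcard

    schema = Schema(
        wikiname=ID(stored=True),           # farm-wide index: always filter on this
        name=TEXT(stored=True, multitoken_query="and", field_boost=2.0),
        name_exact=ID(field_boost=3.0),     # whole string, never cut into pieces
        uuid=ID(unique=True, stored=True),  # one latest rev per item -> unique
        rev_no=NUMERIC(stored=True),
        mtime=DATETIME(stored=True),
    )

    ix = create_in(tempfile.mkdtemp(), schema)
    with ix.writer() as writer:
        writer.add_document(wikiname=u"MyWiki", name=u"Home Page",
                            name_exact=u"Home Page", uuid=u"0123abcd",
                            rev_no=0, mtime=datetime.datetime.utcnow())

    # unique=True is what lets update_document replace the previous
    # latest revision of the item instead of accumulating duplicates
    with ix.writer() as writer:
        writer.update_document(wikiname=u"MyWiki", name=u"Home Page",
                               name_exact=u"Home Page", uuid=u"0123abcd",
                               rev_no=1, mtime=datetime.datetime.utcnow())

    with ix.searcher() as searcher:
        # precise lookup by the code: unanalyzed term against name_exact
        assert len(searcher.search(Term("name_exact", u"Home Page"))) == 1
        # wildcard search works because neither the indexed value nor
        # the query string was tokenized
        assert len(searcher.search(Wildcard("name_exact", u"Home*"))) == 1
        # only rev_no 1 is left: the unique uuid replaced rev_no 0
        assert searcher.search(Term("uuid", u"0123abcd"))[0]["rev_no"] == 1

The all-revisions schema, by contrast, deliberately drops unique=True from
uuid: every revision of an item carries the same uuid there, so enforcing
uniqueness would throw away history.
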