changeset 1379:22526c2fd5b4
merged xapian branch
author     Thomas Waldmann <tw AT waldmann-edv DOT de>
date       Sun, 20 Aug 2006 22:25:00 +0200
parents    8960f31bb162 (current diff)  fa0b7d2d998b (diff)
children   cc9805777571
files      MoinMoin/action/fullsearch.py  wiki/htdocs/modern/img/nav_current.png  wiki/htdocs/modern/img/nav_first.png  wiki/htdocs/modern/img/nav_last.png  wiki/htdocs/modern/img/nav_next.png  wiki/htdocs/modern/img/nav_page.png  wiki/htdocs/modern/img/nav_prev.png
diffstat   15 files changed, 421 insertions(+), 75 deletions(-)
--- a/MoinMoin/action/fullsearch.py	Sun Aug 20 22:01:20 2006 +0200
+++ b/MoinMoin/action/fullsearch.py	Sun Aug 20 22:25:00 2006 +0200
@@ -8,6 +8,7 @@
     @license: GNU GPL, see COPYING for details.
 """
 
+import re
 from MoinMoin.Page import Page
 from MoinMoin import wikiutil
@@ -25,16 +26,26 @@
     except ValueError:
         return True
     except KeyError:
-        return 'fullsearch' not in request.form
+        return 'fullsearch' not in request.form and \
+               not isAdvancedSearch(request)
 
+def isAdvancedSearch(request):
+    try:
+        return int(request.form['advancedsearch'][0])
+    except KeyError:
+        return False
 
 def execute(pagename, request, fieldname='value', titlesearch=0):
     _ = request.getText
     titlesearch = isTitleSearch(request)
+    advancedsearch = isAdvancedSearch(request)
 
     # context is relevant only for full search
     if titlesearch:
         context = 0
+    elif advancedsearch:
+        # XXX: hardcoded
+        context = 180
     else:
         context = int(request.form.get('context', [0])[0])
@@ -46,6 +57,42 @@
     max_context = 1 # only show first `max_context` contexts XXX still unused
 
+    if advancedsearch:
+        and_terms = request.form.get('and_terms', [''])[0].strip()
+        or_terms = request.form.get('or_terms', [''])[0].strip()
+        not_terms = request.form.get('not_terms', [''])[0].strip()
+        #xor_terms = request.form.get('xor_terms', [''])[0].strip()
+        categories = request.form.get('categories', [''])[0].strip()
+        timeframe = request.form.get('time', [''])[0].strip()
+        language = request.form.get('language',
+                [request.cfg.language_default])[0]
+        mimetype = request.form.get('mimetype', [0])[0]
+        includeunderlay = request.form.get('includeunderlay', [0])[0]
+        onlysystempages = request.form.get('onlysystempages', [0])[0]
+        mtime = request.form.get('mtime', [''])[0]
+
+        word_re = re.compile(r'(\"[\w\s]+"|\w+)')
+        needle = ''
+        if language:
+            needle += 'language:%s ' % language
+        if mimetype:
+            needle += 'mimetype:%s ' % mimetype
+        if not includeunderlay:
+            needle += '-domain:underlay '
+        if onlysystempages:
+            needle += 'domain:system '
+        if mtime:
+            needle += 'lastmodifiedsince:%s ' % mtime
+        if categories:
+            needle += '(%s) ' % ' or '.join(['category:%s' % cat
+                for cat in word_re.findall(categories)])
+        if and_terms:
+            needle += '(%s) ' % and_terms
+        if not_terms:
+            needle += '(%s) ' % ' '.join(['-%s' % t for t in word_re.findall(not_terms)])
+        if or_terms:
+            needle += '(%s) ' % ' or '.join(word_re.findall(or_terms))
+
     # check for sensible search term
     striped = needle.strip()
     if len(striped) == 0:
@@ -54,6 +101,7 @@
         request.emit_http_headers()
         Page(request, pagename).send_page(request, msg=err)
         return
+    needle = striped
 
     # Setup for type of search
     if titlesearch:
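For orientation only (this note and sketch are not part of the changeset): the fullsearch.py hunks above fold the advanced-search form fields into one textual search needle using the `language:`, `mimetype:`, `domain:`, `category:` and `-term` prefixes. A minimal, self-contained sketch of that construction for an invented form submission; only the prefix strings and the word regex are taken from the diff, the form values are hypothetical.

    # Sketch only: mirrors the needle construction in the patched
    # fullsearch.py for an invented advanced-search form submission.
    import re

    form = {
        'and_terms': 'xapian search',
        'not_terms': 'deprecated',
        'categories': 'CategoryHomepage',
        'language': 'en',
        'includeunderlay': 0,
    }

    word_re = re.compile(r'(\"[\w\s]+"|\w+)')
    needle = ''
    if form['language']:
        needle += 'language:%s ' % form['language']
    if not form['includeunderlay']:
        needle += '-domain:underlay '
    if form['categories']:
        needle += '(%s) ' % ' or '.join('category:%s' % cat
                                        for cat in word_re.findall(form['categories']))
    if form['and_terms']:
        needle += '(%s) ' % form['and_terms']
    if form['not_terms']:
        needle += '(%s) ' % ' '.join('-%s' % t
                                     for t in word_re.findall(form['not_terms']))

    print(needle.strip())
    # language:en -domain:underlay (category:CategoryHomepage) (xapian search) (-deprecated)
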
--- a/MoinMoin/formatter/text_html.py	Sun Aug 20 22:01:20 2006 +0200
+++ b/MoinMoin/formatter/text_html.py	Sun Aug 20 22:25:00 2006 +0200
@@ -6,7 +6,12 @@
     @license: GNU GPL, see COPYING for details.
 """
 import os.path, re
-from sets import Set # TODO: when we require Python 2.4+ use the builtin 'set' type
+
+try:
+    set
+except:
+    from sets import Set as set
+
 from MoinMoin.formatter import FormatterBase
 from MoinMoin import wikiutil, i18n, config
 from MoinMoin.Page import Page
@@ -16,7 +21,7 @@
 prettyprint = False
 
 # These are the HTML elements that we treat as block elements.
-_blocks = Set(['dd', 'div', 'dl', 'dt', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+_blocks = set(['dd', 'div', 'dl', 'dt', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                'hr', 'li', 'ol', 'p', 'pre', 'table', 'tbody', 'td', 'tfoot', 'th',
                'thead', 'tr', 'ul', 'blockquote', ])
@@ -26,30 +31,30 @@
 # content, and also IE has a parsing bug with those two elements (only)
 # when they don't have a closing tag even if valid XHTML.
 
-_self_closing_tags = Set(['area', 'base', 'br', 'col', 'frame', 'hr', 'img', 'input',
-                          'isindex', 'link', 'meta', 'param'])
+_self_closing_tags = set(['area', 'base', 'br', 'col', 'frame', 'hr', 'img',
+                          'input', 'isindex', 'link', 'meta', 'param'])
 
 # We only open those tags and let the browser auto-close them:
-_auto_closing_tags = Set(['p'])
+_auto_closing_tags = set(['p'])
 
 # These are the elements which generally should cause an increase in the
 # indention level in the html souce code.
-_indenting_tags = Set(['ol', 'ul', 'dl', 'li', 'dt', 'dd', 'tr', 'td'])
+_indenting_tags = set(['ol', 'ul', 'dl', 'li', 'dt', 'dd', 'tr', 'td'])
 
 # These are the elements that discard any whitespace they contain as
 # immediate child nodes.
-_space_eating_tags = Set(['colgroup', 'dl', 'frameset', 'head', 'map' 'menu',
+_space_eating_tags = set(['colgroup', 'dl', 'frameset', 'head', 'map' 'menu',
                           'ol', 'optgroup', 'select', 'table', 'tbody', 'tfoot',
                           'thead', 'tr', 'ul'])
 
 # These are standard HTML attributes which are typically used without any
 # value; e.g., as boolean flags indicated by their presence.
-_html_attribute_boolflags = Set(['compact', 'disabled', 'ismap', 'nohref',
+_html_attribute_boolflags = set(['compact', 'disabled', 'ismap', 'nohref',
                                  'noresize', 'noshade', 'nowrap', 'readonly',
                                  'selected', 'wrap'])
 
 # These are all the standard HTML attributes that are allowed on any element.
-_common_attributes = Set(['accesskey', 'class', 'dir', 'disabled', 'id', 'lang',
+_common_attributes = set(['accesskey', 'class', 'dir', 'disabled', 'id', 'lang',
                           'style', 'tabindex', 'title'])
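A brief aside (not from the patch): the hunk above replaces the module-level `sets.Set` import with a guarded fallback so the code keeps working on Python 2.3 while using the builtin `set` on 2.4+. The same pattern in isolation, with a slightly narrower `except NameError` than the bare `except:` the patch itself uses:

    # Prefer the builtin set (Python 2.4+); fall back to the sets module
    # on older interpreters. The patch itself uses a bare "except:".
    try:
        set
    except NameError:
        from sets import Set as set

    _blocks = set(['p', 'div', 'pre'])
    print('div' in _blocks)   # True
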
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/macro/AdvancedSearch.py	Sun Aug 20 22:25:00 2006 +0200
@@ -0,0 +1,136 @@
+# -*- coding: iso-8859-1 -*-
+'''
+    MoinMoin - AdvancedSearch Macro
+
+    [[AdvancedSearch]]
+        displays advanced search dialog.
+
+    MAYBE:
+    [[AdvancedSearch(Help)]]
+        embed results of an advanced search (use more parameters...)
+'''
+
+from MoinMoin import config, wikiutil, search
+from MoinMoin.i18n import languages
+
+import mimetypes
+
+Dependencies = ['pages']
+
+try:
+    sorted
+except NameError:
+    def sorted(l, *args, **kw):
+        l = l[:]
+        l.sort(*args, **kw)
+        return l
+
+
+def advanced_ui(macro):
+    _ = macro._
+    f = macro.formatter
+
+    search_boxes = ''.join([
+        f.table_row(1),
+        f.table_cell(1, attrs={'rowspan': '6', 'class': 'searchfor'}),
+        f.text(_('Search for pages')),
+        f.table_cell(0),
+        ''.join([''.join([
+            f.table_row(1),
+            f.table_cell(1),
+            f.text(_(txt)),
+            f.table_cell(0),
+            f.table_cell(1),
+            f.rawHTML(input_field),
+            f.table_cell(0),
+            f.table_row(0),
+        ]) for txt, input_field in (
+            (_('containing all the following terms'),
+                '<input type="text" name="and_terms" size="30">'),
+            (_('containing one or more of the following terms'),
+                '<input type="text" name="or_terms" size="30">'),
+            (_('not containing the following terms'),
+                '<input type="text" name="not_terms" size="30">'),
+            #('containing only one of the following terms',
+            #    '<input type="text" name="xor_terms" size="30">'),
+            # TODO: dropdown-box?
+            (_('belonging to one of the following categories'),
+                '<input type="text" name="categories" size="30">'),
+            (_('last modified since (XXX)'),
+                '<input type="text" name="mtime" size="30" value="">'),
+        )])
+    ])
+
+    langs = dict([(lang, lmeta['x-language-in-english'])
+        for lang, lmeta in sorted(languages.items())])
+    lang_dropdown = ''.join([
+        u'<select name="language" size="1">',
+        u'<option value="" selected>%s</option>' % _('any language'),
+        ''.join(['<option value="%s">%s</option>' % lt for lt in
+            langs.items()]),
+        u'</select>',
+    ])
+
+    ft_dropdown = ''.join([
+        u'<select name="mimetype" size="1">',
+        u'<option value="" selected>%s</option>' % _('any type'),
+        ''.join(['<option value="%s">%s</option>' % (m[1], '*%s - %s' % m)
+            for m in sorted(mimetypes.types_map.items())]),
+        u'</select>',
+    ])
+
+    search_options = ''.join([
+        ''.join([
+            f.table_row(1),
+            f.table_cell(1, attrs={'class': 'searchfor'}),
+            txt[0],
+            f.table_cell(0),
+            f.table_cell(1, colspan=2),
+            txt[1],
+            f.table_cell(0),
+            f.table_row(0),
+        ]) for txt in (
+            (_('Language'), lang_dropdown),
+            (_('File Type'), ft_dropdown),
+            ('', '<input type="checkbox" name="titlesearch" value="1">%s</input>' %
+                _('Search only in titles')),
+            ('', '<input type="checkbox" name="case" value="1">%s</input>' %
+                _('Case-sensitive search')),
+            ('', '<input type="checkbox" name="includeunderlay" value="1" checked>%s'
+                '</input>' % _('Include underlay')),
+            ('', '<input type="checkbox" name="onlysystempages" value="1">%s'
+                '</input>' % _('Only system pages')),
+        )
+    ])
+
+    html = [
+        u'<form method="get" action="">',
+        u'<div>',
+        u'<input type="hidden" name="action" value="fullsearch">',
+        u'<input type="hidden" name="advancedsearch" value="1">',
+        f.table(1, attrs={'tableclass': 'advancedsearch'}),
+        search_boxes,
+        search_options,
+        f.table_row(1),
+        f.table_cell(1, attrs={'class': 'submit', 'colspan': '3'}),
+        u'<input type="submit" value="%s">' % _('Go get it!'),
+        f.table_cell(0),
+        f.table_row(0),
+        f.table(0),
+        u'</div>',
+        u'</form>',
+    ]
+
+    return f.rawHTML('\n'.join(html))
+
+
+def execute(macro, needle):
+    request = macro.request
+    _ = request.getText
+
+    # no args given
+    if needle is None:
+        return advanced_ui(macro)
+
+    return macro.formatter.rawHTML('wooza!')
+
--- a/MoinMoin/macro/FullSearch.py	Sun Aug 20 22:01:20 2006 +0200
+++ b/MoinMoin/macro/FullSearch.py	Sun Aug 20 22:25:00 2006 +0200
@@ -23,7 +23,8 @@
     context argument, or make another macro that use context, which may
     be easier to use.
 
-    @copyright: 2000-2004 by Jürgen Hermann <jh@web.de>
+    @copyright: 2000-2004 by Jürgen Hermann <jh@web.de>,
+                2005 MoinMoin:FranzPletz
     @license: GNU GPL, see COPYING for details.
 """
--- a/MoinMoin/search/Xapian.py	Sun Aug 20 22:01:20 2006 +0200
+++ b/MoinMoin/search/Xapian.py	Sun Aug 20 22:25:00 2006 +0200
@@ -152,7 +152,7 @@
                     # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt
                     'author': 'A',
                     'date': 'D',              # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest
-                    #G newsGroup (or similar entity - e.g. a web forum name)
+                    #G newsGroup (or sim2006-08-17 05:11:53ilar entity - e.g. a web forum name)
                     'hostname': 'H',
                     'keyword': 'K',
                     'lang': 'L',              # ISO Language code
@@ -174,6 +174,7 @@
                     'category': 'XCAT',       # category this document belongs to
                     'full_title': 'XFT',      # full title (for regex)
                     'domain': 'XDOMAIN',      # standard or underlay
+                    'revision': 'XREV',       # revision of page
                     #Y year (four digits)
     }
 
@@ -350,6 +351,8 @@
             yield 'underlay'
         if page.isStandardPage():
             yield 'standard'
+        if wikiutil.isSystemPage(self.request, page.page_name):
+            yield 'system'
 
     def _index_page(self, writer, page, mode='update'):
         """ Index a page - assumes that the write lock is acquired
@@ -364,6 +367,8 @@
         pagename = page.page_name
         mtime = page.mtime_usecs()
         itemid = "%s:%s" % (wikiname, pagename)
+        revision = str(page.get_real_rev())
+        author = page.last_edit(request)['editor']
         # XXX: Hack until we get proper metadata
         language, stem_language = self._get_languages(page)
         categories = self._get_categories(page)
@@ -397,7 +402,10 @@
             xkeywords = [xapdoc.Keyword('itemid', itemid),
                     xapdoc.Keyword('lang', language),
                     xapdoc.Keyword('stem_lang', stem_language),
-                    xapdoc.Keyword('full_title', pagename.lower())]
+                    xapdoc.Keyword('full_title', pagename.lower()),
+                    xapdoc.Keyword('revision', revision),
+                    xapdoc.Keyword('author', author),
+                )]
             for pagelink in page.getPageLinks(request):
                 xkeywords.append(xapdoc.Keyword('linkto', pagelink))
             for category in categories:
@@ -452,11 +460,14 @@
             xlanguage = xapdoc.Keyword('lang', language)
             xstem_language = xapdoc.Keyword('stem_lang', stem_language)
             mimetype, att_content = self.contentfilter(filename)
-            xmimetype = xapdoc.TextField('mimetype', mimetype, True)
+            xmimetype = xapdoc.Keyword('mimetype', mimetype)
             xcontent = xapdoc.TextField('content', att_content)
-            doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                  keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ),
-                                  sortFields=(xpname, xattachment, xmtime, xwname, ),
+            doc = xapdoc.Document(textFields=(xcontent, ),
+                                  keywords=(xatt_itemid, xtitle,
+                                      xlanguage, xstem_language,
+                                      xmimetype, ),
+                                  sortFields=(xpname, xattachment, xmtime,
+                                      xwname, ),
                                   )
             doc.analyzerFactory = getWikiAnalyzerFactory(request,
                     stem_language)
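A short illustration (not part of the diff) of what the indexing changes above mean for the raw term space: `prefixMap` maps logical keyword names to Xapian term prefixes, so the new `revision` keyword and the extra `system` domain value end up as prefixed terms on each page document. The helper below is hypothetical, and whether the keyword values are lowercased by the xapwrap layer is an assumption here.

    # Hypothetical helper: shows how prefixMap entries turn keyword
    # values into prefixed Xapian terms (lowercasing is assumed).
    prefixMap = {
        'author': 'A',
        'lang': 'L',
        'domain': 'XDOMAIN',
        'revision': 'XREV',   # added by this changeset
    }

    def make_term(keyword, value):
        return '%s%s' % (prefixMap[keyword], str(value).lower())

    print(make_term('domain', 'system'))   # XDOMAINsystem
    print(make_term('revision', 42))       # XREV42
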
--- a/MoinMoin/search/queryparser.py	Sun Aug 20 22:01:20 2006 +0200
+++ b/MoinMoin/search/queryparser.py	Sun Aug 20 22:25:00 2006 +0200
@@ -696,6 +696,134 @@
         return UnicodeQuery('%s:%s' % (prefix, pattern))
 
 
+class MimetypeSearch(BaseExpression):
+    """ Search for files belonging to a specific mimetype """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a mimetype search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as re of plain text, bool
+        @param case: do case sensitive search, bool
+        """
+        self._pattern = pattern.lower()
+        self.negated = 0
+        self.use_re = use_re
+        self.case = False # not case-sensitive!
+        self.xapian_called = False
+        self._build_re(self._pattern, use_re=use_re, case=case)
+
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return ""
+
+    def search(self, page):
+        # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch
+        if not self.xapian_called:
+            return []
+        else:
+            return [Match()]
+
+    def xapian_wanted(self):
+        return True # only easy regexps possible
+
+    def xapian_need_postproc(self):
+        return False # case-sensitivity would make no sense
+
+    def xapian_term(self, request, allterms):
+        self.xapian_called = True
+        prefix = Xapian.Index.prefixMap['mimetype']
+        if self.use_re:
+            # basic regex matching per term
+            terms = []
+            found = None
+            n = len(prefix)
+            for term in allterms():
+                if prefix == term[:n]:
+                    found = True
+                    if self.search_re.match(term[n:]):
+                        terms.append(term)
+                elif found:
+                    continue
+
+            if not terms:
+                return Query()
+            return Query(Query.OP_OR, terms)
+        else:
+            pattern = self._pattern
+            return UnicodeQuery('%s%s' % (prefix, pattern))
+
+
+class DomainSearch(BaseExpression):
+    """ Search for pages belonging to a specific domain """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a domain search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as re of plain text, bool
+        @param case: do case sensitive search, bool
+        """
+        self._pattern = pattern.lower()
+        self.negated = 0
+        self.use_re = use_re
+        self.case = False # not case-sensitive!
+        self.xapian_called = False
+        self._build_re(self._pattern, use_re=use_re, case=case)
+
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return ""
+
+    def search(self, page):
+        # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch
+        if not self.xapian_called:
+            return []
+        else:
+            return [Match()]
+
+    def xapian_wanted(self):
+        return True # only easy regexps possible
+
+    def xapian_need_postproc(self):
+        return False # case-sensitivity would make no sense
+
+    def xapian_term(self, request, allterms):
+        self.xapian_called = True
+        prefix = Xapian.Index.prefixMap['domain']
+        if self.use_re:
+            # basic regex matching per term
+            terms = []
+            found = None
+            n = len(prefix)
+            for term in allterms():
+                if prefix == term[:n]:
+                    found = True
+                    if self.search_re.match(term[n+1:]):
+                        terms.append(term)
+                elif found:
+                    continue
+
+            if not terms:
+                return Query()
+            return Query(Query.OP_OR, terms)
+        else:
+            pattern = self._pattern
+            return UnicodeQuery('%s:%s' % (prefix, pattern))
+
+
 ##############################################################################
 ### Parse Query
 ##############################################################################
@@ -782,6 +910,8 @@
         linkto = False
         lang = False
         category = False
+        mimetype = False
+        domain = False
 
         for m in modifiers:
             if "title".startswith(m):
@@ -796,6 +926,10 @@
                 lang = True
             elif "category".startswith(m):
                 category = True
+            elif "mimetype".startswith(m):
+                mimetype = True
+            elif "domain".startswith(m):
+                domain = True
 
         # oh, let's better call xapian if we encouter this nasty regexp ;)
         if not category:
@@ -808,10 +942,14 @@
 
         if category:
             obj = CategorySearch(text, use_re=regex, case=case)
+        elif mimetype:
+            obj = MimetypeSearch(text, use_re=regex, case=False)
         elif lang:
             obj = LanguageSearch(text, use_re=regex, case=False)
         elif linkto:
             obj = LinkSearch(text, use_re=regex, case=case)
+        elif domain:
+            obj = DomainSearch(text, use_re=regex, case=False)
         elif title_search:
             obj = TitleSearch(text, use_re=regex, case=case)
         else:
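For context (not part of the changeset): the parser hunks above hook the new searches into the existing modifier syntax, where any leading fragment of a modifier name matches because of the `"mimetype".startswith(m)` style checks. A small standalone sketch of that dispatch follows; the list of modifier names and their order here is abridged and assumed, only `mimetype` and `domain` are taken from the diff.

    # Abridged sketch of the modifier dispatch used by the query parser:
    # a query term like "mime:image/png" selects MimetypeSearch because
    # "mimetype".startswith("mime") is true.
    MODIFIERS = ('title', 'regex', 'case', 'linkto', 'language',
                 'category', 'mimetype', 'domain')   # order assumed

    def classify(modifier):
        for name in MODIFIERS:
            if name.startswith(modifier):
                return name
        return None

    for query in ('mimetype:image/png', 'mime:image/png', 'domain:underlay'):
        mod, value = query.split(':', 1)
        print('%s -> %s search for %r' % (query, classify(mod), value))
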
--- a/MoinMoin/search/results.py	Sun Aug 20 22:01:20 2006 +0200
+++ b/MoinMoin/search/results.py	Sun Aug 20 22:25:00 2006 +0200
@@ -287,7 +287,7 @@
         output = [
             formatter.paragraph(1, attr={'class': 'searchstats'}),
             _("Results %(bs)s%(hitsFrom)d - %(hitsTo)d%(be)s "
-                    "of %(aboutHits)s %(bs)s%(hits)d%(be)s results out of"
+                    "of %(aboutHits)s %(bs)s%(hits)d%(be)s results out of "
                     "about %(pages)d pages.") %
                 {'aboutHits': self.estimated_hits[0],
                  'hits': self.estimated_hits[1], 'pages': self.pages,
@@ -652,10 +652,6 @@
             return ''.join(output)
         return ''
 
-    def _img_url(self, img):
-        cfg = self.request.cfg
-        return '%s/%s/img/%s.png' % (cfg.url_prefix_static, self.request.theme.name, img)
-
     def formatPrevNextPageLinks(self, hitsFrom, hitsPerPage, hitsNum):
         """ Format previous and next page links in page
 
@@ -714,39 +710,6 @@
         return ''.join([
             f.table(1, attrs={'tableclass': 'searchpages'}),
             f.table_row(1),
-            f.table_cell(1, attrs={'class': 'prev'}),
-            # first image, previous page
-            l[0] and
-                ''.join([
-                    f.url(1, href=page_url(cur_page-1)),
-                    f.image(self._img_url('nav_prev')),
-                    f.url(0),
-                ]) or
-                f.image(self._img_url('nav_first')),
-            f.table_cell(0),
-            # images for ooos, highlighted current page
-            ''.join([
-                ''.join([
-                    f.table_cell(1),
-                    i != cur_page and f.url(1, href=page_url(i)) or '',
-                    f.image(self._img_url(i == cur_page and
-                        'nav_current' or 'nav_page')),
-                    i != cur_page and f.url(0) or '',
-                    f.table_cell(0),
-                ]) for i in page_range
-            ]),
-            f.table_cell(1, attrs={'class': 'next'}),
-            # last image, next page
-            l[-1] and
-                ''.join([
-                    f.url(1, href=page_url(cur_page+1)),
-                    f.image(self._img_url('nav_next')),
-                    f.url(0),
-                ]) or
-                f.image(self._img_url('nav_last')),
-            f.table_cell(0),
-            f.table_row(0),
-            f.table_row(1),
             f.table_cell(1),
             # textlinks
             (f.table_cell(0) + f.table_cell(1)).join(l),
@@ -772,7 +735,6 @@
             f.paragraph(0),
         ])
 
-
     def querystring(self, querydict=None):
         """ Return query string, used in the page link """
         if querydict is None:
--- a/docs/CHANGES.fpletz	Sun Aug 20 22:01:20 2006 +0200
+++ b/docs/CHANGES.fpletz	Sun Aug 20 22:25:00 2006 +0200
@@ -8,12 +8,13 @@
     metadata)
 
   ToDo:
-    * Implement the new search UI
     * Write/update documentation for all the new search stuff
+    * Search based on mtime
+    * Index all revisions and let users search in them (rev, mtime)
+
+  ToDo (low priority):
     * Reevaluate Xapwrap, possibly drop it and rip out usable stuff
       (i.e. ExceptionTranslator)
-
-  ToDo (low priority):
     * Case-sensitive searches / Regexp on multiple terms: Graceful
       fallback to and/or merge with moinSearch based on nodes xapian can
       handle in the search term tree
@@ -23,10 +24,12 @@
 
   New Features:
     * Faster search thanks to Xapian
-    * Searching for languages with new prefix lang/language, i.e. lang:de
+    * New searches:
+      - LanguageSearch: language:de
+      - CategorySearch: category:Homepage
+      - MimetypeSearch: mimetype:image/png (for attachments/files)
+      - DomainSearch: domain:underlay
       Note: Currently only available when Xapian is used
-    * CategorySearch with prefix category or with the regexp previously
-      used (autodetected as CategorySearch)
     * New config options:
         xapian_search        0      enables xapian-powered search
         xapian_index_dir     None   directory for xapian indices
@@ -228,7 +231,29 @@
       -> still TODO: need real weight
 
 2006-08-10
-    * entry missing
+    * corrected range and count of results (estimated by xapian)
+    * pagelinks only there are enough results
 
 2006-08-10 .. 13 no work on project
 
+2006-08-14
+    * fixed some remaining issues with the ui
+
+2006-08-15
+    * removed Moooin gfx as requested by Google
+
+2006-08-16 no work on project
+
+2006-08-17
+    * started advanced gui, new macro: AdvancedSearch
+
+2006-08-18
+    * eye-candy for advanced gui
+    * reworked fullsearch action to work with AdvancedSearch and most of
+      the
+
+2006-08-19
+    * mimetype search works (more or less)
+    * minor bugfixes (i18n etc.)
+    * domain-specific search (underlay -> system pages)
+
--- a/wiki/htdocs/modern/css/common.css	Sun Aug 20 22:01:20 2006 +0200
+++ b/wiki/htdocs/modern/css/common.css	Sun Aug 20 22:25:00 2006 +0200
@@ -331,6 +331,35 @@
 div.codearea pre span.DiffSeparator {color: #228B22; font-weight: bold}
 
 /* Search results */
+.advancedsearch {
+    border: 1pt solid #ADB9CC;
+}
+
+.advancedsearch td {
+    vertical-align: top;
+    background-color: #E7E7E7;
+    border: 0px;
+}
+
+.advancedsearch td.searchfor {
+    font-weight: bold;
+}
+.advancedsearch input {
+    border: 1px solid #ADB9CC;
+    background-color: #fff;
+}
+
+.advancedsearch td.submit {
+    border-top: 1px solid #ADB9CC;
+    background-color: #fff;
+    text-align: right;
+}
+
+.advancedsearch optioni, select {
+    border: 1px solid #ADB9CC;
+    background-color: #fff;
+}
+
 .searchresults dt {
     margin-top: 1em;
@@ -363,26 +392,17 @@
 .searchpages tr,
 .searchpages td {
     border: 0;
-    padding: 0;
+    padding: 5px;
     margin: 0;
     text-align: center;
     vertical-align: middle;
-    color: #a90a08;
+    color: #b93a58;
     font-weight: bold;
-}
-
-.searchpages td.prev {
-    text-align: right;
-}
-
-.searchpage td.next {
-    text-align: left;
+    font-size: 1.05em;
 }
 
 .searchpages td a,
 .searchpages td a:link {
-    color: #000000;
     text-decoration: underline;
-    font-weight: normal;
 }
 
 /* MonthCalendar css */