changeset 873:5019723cb7d4

improved google sitemap action (ported from 1.5)
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Sat, 24 Jun 2006 20:43:47 +0200
parents b0f7ec792299
children 4dd230fa84f8
files MoinMoin/action/sitemap.py docs/CHANGES
diffstat 2 files changed, 80 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/action/sitemap.py	Thu Jun 22 22:30:02 2006 +0200
+++ b/MoinMoin/action/sitemap.py	Sat Jun 24 20:43:47 2006 +0200
@@ -7,35 +7,92 @@
     @copyright: 2006 by Thomas Waldmann, MoinMoin:ThomasWaldmann
     @license: GNU GPL, see COPYING for details.
 """
+import time
+from MoinMoin import wikiutil
 
-from MoinMoin import config, wikiutil
-from MoinMoin.util import MoinMoinNoFooter
+datetime_fmt = "%Y-%m-%dT%H:%M:%S+00:00"
+
+def now():
+    return time.strftime(datetime_fmt, time.gmtime())
+
+def make_url_xml(vars):
+    """ assemble a single <url> xml fragment """
+    return """\
+<url>
+  <loc>%(base)s%(url)s</loc>
+  <lastmod>%(lastmod)s</lastmod>
+  <changefreq>%(changefreq)s</changefreq>
+  <priority>%(priority)s</priority>
+</url>
+""" % vars
+    
+def sitemap_url(request, base, page):
+    """ return a sitemap <url>..</url> fragment for page object <page> """
+    url = page.url(request)
+    pagename = page.page_name
+    lastmod = page.mtime_printable(request)
+    if lastmod == "0": # can happen in case of errors
+        lastmod = now()
+
+    # page's changefreq, priority and lastmod depend on page type / name
+    if pagename in [u"RecentChanges", u"TitleIndex", ]:
+        # important dynamic pages with macros
+        changefreq = "hourly"
+        priority = "0.9"
+        lastmod = now() # the page text mtime never changes, but the macro output DOES
+
+    elif pagename in [request.cfg.page_front_page, ]:
+        # important user edited pages
+        changefreq = "hourly"
+        priority = "1.0"
+
+    elif wikiutil.isSystemPage(request, pagename):
+        # other system pages are rather boring
+        changefreq = "yearly"
+        priority = "0.1"
+
+    else:
+        # these are the content pages:
+        changefreq = "daily"
+        priority = "0.5"
+
+    return make_url_xml(locals())
 
 def execute(pagename, request):
     _ = request.getText
     form = request.form
-
-    mimetype = "text/xml"
-
+    request.user.datetime_fmt = datetime_fmt
     base = request.getBaseURL()
 
-    request.http_headers(["Content-Type: %s; charset=%s" % (mimetype, config.charset)])
+    request.http_headers(["Content-Type: text/xml; charset=UTF-8"])
 
-    request.write("""<?xml version="1.0" encoding="UTF-8"?>\r\n"""
-                  """<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">\r\n""")
+    # we emit a piece of data so the other side doesn't get bored:
+    request.write("""<?xml version="1.0" encoding="UTF-8"?>\r\n""")
 
-    request.write("<url><loc>%s/</loc></url>\r\n" % (base,))
+    result = []
+    result.append("""<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">\n""")
+    
+    # we include the / url as an important and often changed URL
+    result.append(make_url_xml({
+        'base': base,
+        'url': '/',
+        'lastmod': now(), # fake
+        'changefreq': 'hourly',
+        'priority': '1.0',
+    }))
 
     # Get page dict readable by current user
     pages = request.rootpage.getPageDict()
     pagelist = pages.keys()
     pagelist.sort()
-
     for name in pagelist:
-        url = pages[name].url(request)
-        request.write("<url><loc>%s%s</loc></url>\r\n" % (base, url))
+        result.append(sitemap_url(request, base, pages[name]))
 
-    request.write("""</urlset>\r\n""")
+    result.append("""</urlset>\n""")
 
-    raise MoinMoinNoFooter
+    result = "".join(result)
+    result = result.replace("\n", "\r\n") # text/* requires CR/LF
 
+    # emit all real data
+    request.write(result)
+
--- a/docs/CHANGES	Thu Jun 22 22:30:02 2006 +0200
+++ b/docs/CHANGES	Sat Jun 24 20:43:47 2006 +0200
@@ -143,8 +143,15 @@
     * cfg.log_reverse_dns_lookups [default: True] - you can set this to False
       if rev. dns lookups are broken in your network (leading to long delays
       on page saves). With False, edit-log will only contain IP, not hostname.
-    * ?action=sitemap emits a simple google sitemap (XML), listing all your
-      wiki pages and the wiki root URL.
+    * ?action=sitemap emits a google sitemap (XML), listing all your wiki pages
+      and the wiki root URL.
+      Page                      Priority / Frequency / Last modification
+      --------------------------------------------------------------------
+      /                         1.0 / hourly / <now>
+      cfg.page_front_page       1.0 / hourly / page last edit
+      TitleIndex,RecentChanges  0.9 / hourly / <now>
+      content pages             0.5 / daily / page last edit
+      system/help pages         0.1 / yearly / page last edit
     * We use rel="nofollow" for some action links in the hope that some search
       engines don't fetch the targets (if they do, they will just get 403 and
       cause unnecessary traffic).