view MoinMoin/filter/ @ 864:b4006d50d2dc

add .z extension to application_octet_stream blacklist
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Tue, 20 Jun 2006 21:04:35 +0200
parents 957d482215ac
children 97ce857bdebf
line wrap: on
line source
# -*- coding: iso-8859-1 -*-
    MoinMoin - binary file Filter
    Processes any binary file and extracts ASCII content from it.
    We ignore any file with a file extension on the blacklist, because
    we either can't handle it or it usually has no indexable content.
    Due to speed reasons, we only read the first maxread bytes from a file.
    For reducing the amount of trash, we only return words with
    length >= minwordlen.
    Depends on: nothing (pure python)

    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
    @license: GNU GPL, see COPYING for details.

maxread = 10000
minwordlen = 4

blacklist = ('.iso', # CD/DVD images, TODO: add nero/... stuff
             '.zip', '.rar', '.lzh', '.lha',
             '.tar', '.gz', '.tgz', '.bz2', '.tb2', '.z',
             '.exe', '.com', '.dll', '.cab', '.msi', '.bin', # windows
             '.rpm', '.deb', # linux
             '.hqx', '.dmg', '.sit', # mac
             '.jar', '.class', # java

import os, string

# builds a list of all characters:
norm = string.maketrans('', '')

# builds a list of all non-alphanumeric characters:
non_alnum = string.translate(norm, norm, string.letters+string.digits) 

# translate table that replaces all non-alphanumeric by blanks:
trans_nontext = string.maketrans(non_alnum, ' '*len(non_alnum))

def execute(indexobj, filename):
    fileext = os.path.splitext(filename)[1]
    if fileext.lower() in blacklist:
        return u''
    f = file(filename, "rb")
    data =
    data = data.translate(trans_nontext) # replace non-ascii by blanks
    data = data.split() # removes lots of blanks
    data = [s for s in data if len(s) >= minwordlen] # throw away too short stuff
    data = ' '.join(data)
    return data.decode('ascii')