Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                

はてなキーワードAPIからはてなキーワード一覧を取得する

こうですか?良くわかりません。

% wget http://d.hatena.ne.jp/images/keyword/keywordlist
% python extractkeyword.py keywordlist > keywordlist.extracted
% grep java keywordlist.extracted | head -n 10
10688:java
10689:java community process
10690:java computing
10691:java desktop system
10692:java ee
10693:java press
10694:java se
10695:java server faces
10696:java studio
10697:java virtual machine

実体参照の変換等はしていない。あと、出力エンコーディングはEUC-JP。たまに変なコードが入っているけど、自分のせいではないはず。
きちんと抽出できているかどうか保証しないので、自己責任でよろしく。

import re

# Names exported by a star-import of this module; every other helper
# defined below is left out of the star-import surface.
__all__ = ['InvalidFormatException', 'parse']

class InvalidFormatException(Exception):
    """Raised by ``parse`` when the input lacks the expected regex wrapper."""

# Matches a single '(' , ')' or '|' that is not immediately preceded by a
# backslash. split_siblings() uses these tokens to track nesting depth and
# to locate top-level alternation bars.
_SIBLING_PATTERN = re.compile(r'(?<!\\)[\(\)\|]')

def parse(source):
    """Return an iterator over every string matched by a keyword regex.

    ``source`` must start with the wrapper
    ``(?-xism:(?i:(?=...)(?:BODY)))``; only ``BODY`` is expanded.

    Args:
        source: the regular expression text to expand.

    Returns:
        The generator produced by ``_parse`` for ``BODY``. The wrapper
        check itself runs immediately, not lazily.

    Raises:
        InvalidFormatException: if ``source`` does not start with the
            expected wrapper.
    """
    matched = re.match(r'\(\?-xism:\(\?i:\(\?=.*?\)\(\?:(.*)\)\)\)', source)
    if matched is None:
        raise InvalidFormatException()
    return _parse(matched.group(1))

def _parse(source):
    """Yield each string matched by an alternation of regex branches.

    ``source`` is split into top-level branches with ``split_siblings``.
    A branch without a ``(?:`` group is expanded directly with ``expand``.
    A branch with a group is split into prefix strings, the group body and
    suffix strings; the group body is expanded recursively and combined
    with every prefix/suffix pair. When the group is optional, the
    prefix/suffix pairs without the group body are yielded first.
    """
    for branch in split_siblings(source):
        if not has_child(branch):
            # Leaf branch: nothing nested, expand it as-is.
            for literal in expand(branch):
                yield literal
            continue

        prefixes, inner, suffixes, skippable = split_child(branch)
        if skippable:
            # The group may be absent entirely.
            for prefix in prefixes:
                for suffix in suffixes:
                    yield prefix + suffix
        for middle in _parse(inner):
            for prefix in prefixes:
                for suffix in suffixes:
                    yield prefix + middle + suffix

def has_child(body):
    """Tell whether ``body`` contains a non-capturing group opener.

    Args:
        body: a regex fragment (string).

    Returns:
        True if the substring ``(?:`` occurs anywhere in ``body``,
        otherwise False.
    """
    # A plain membership test replaces the former try/bare-except around
    # body.index(), which silently swallowed every exception (including
    # KeyboardInterrupt and type errors) and reported them as "no child".
    return '(?:' in body

def split_child(body):
    """Split a branch around its ``(?:...)`` group.

    The group is taken to run from the first ``(?:`` to the last ``)``
    that is not directly preceded by a backslash.

    Args:
        body: a regex branch containing ``(?:``.

    Returns:
        A 4-tuple ``(head, child, tail, optional)``: ``head`` and ``tail``
        are the ``expand``-ed text before and after the group, ``child`` is
        the raw text inside the group, and ``optional`` is True when the
        closing parenthesis is immediately followed by ``?``.
    """
    open_at = body.index('(?:', 0)
    close_at = body.rindex(')')
    # Skip backwards over closing parentheses that are backslash-escaped.
    while body[close_at - 1] == '\\':
        close_at = body.rindex(')', 0, close_at)

    head = expand(body[:open_at])
    inner = body[open_at + 3:close_at]

    optional = close_at + 1 < len(body) and body[close_at + 1] == '?'
    # When the group is optional the '?' belongs to it, not to the tail.
    tail_from = close_at + 2 if optional else close_at + 1
    tail = expand(body[tail_from:])
    return (head, inner, tail, optional)

def split_siblings(source):
    """Yield the top-level alternatives of ``source``.

    Tokens found by ``_SIBLING_PATTERN`` (unescaped ``(``, ``)`` and ``|``)
    are scanned left to right while tracking parenthesis depth; the text is
    cut at every ``|`` seen at depth zero.

    Args:
        source: a regex fragment.

    Yields:
        Each alternative as a substring of ``source``, in order. A source
        without any top-level ``|`` yields itself once.
    """
    start = 0
    level = 0
    for m in _SIBLING_PATTERN.finditer(source):
        token = m.group()
        if token == '(':
            level += 1
        elif token == ')':
            level -= 1
        elif level == 0:
            # The only remaining token is '|'; at depth zero it separates
            # two sibling alternatives.
            yield source[start:m.start()]
            start = m.end()
    yield source[start:]

def expand(s, expanded=None):
    """Expand a group-free regex fragment into the set of matching strings.

    The fragment is read left to right. Recognised constructs are a
    bracketed character class (read with ``read_bracket``), ``?`` (handled
    by ``add_optional``), a backslash escape (the next character is taken
    literally) and any other character (taken literally).

    Args:
        s: the regex fragment to expand.
        expanded: optional list of prefix strings to extend. Defaults to a
            single empty string. The list passed in is not modified.

    Returns:
        A set of all expanded strings.
    """
    # Work on a private copy: the previous mutable default combined with an
    # in-place ``+=`` could grow the shared default list (and a caller's
    # list) across calls.
    expanded = [""] if expanded is None else list(expanded)
    i = 0
    while i < len(s):
        ch = s[i]
        if ch == '[':
            characters, i = read_bracket(s, i + 1)
            expanded = add_character_class(characters, expanded)
        elif ch == '?':
            # add_optional() already returns the original strings together
            # with the shortened ones, so rebind instead of appending the
            # originals a second time.
            expanded = add_optional(expanded)
            i += 1
        elif ch == '\\':
            expanded = add_character(s[i + 1], expanded)
            i += 2
        else:
            expanded = add_character(ch, expanded)
            i += 1
    return set(expanded)

def add_character(c, expanded):
    """Return a new list with ``c`` appended to every string in ``expanded``."""
    extended = []
    for prefix in expanded:
        extended.append(prefix + c)
    return extended

def add_character_class(characters, expanded):
    """Return every string of ``expanded`` extended by each class member.

    The result is ordered by prefix first, then by the order of
    ``characters``.
    """
    return [prefix + member for prefix in expanded for member in characters]

def add_optional(expanded):
    """Return the variants of ``expanded`` with the last character optional.

    The result lists every string with its final character removed,
    followed by the original strings unchanged.
    """
    shortened = []
    for candidate in expanded:
        shortened.append(candidate[:-1])
    return shortened + expanded

def read_bracket(s, offset):
    """Read the members of a bracketed character class.

    Args:
        s: the regex fragment being scanned.
        offset: index of the first character after the opening ``[``.

    Returns:
        A tuple ``(characters, next_index)``: the list of member characters
        (a backslash is dropped and the character after it is kept
        literally) and the index just past the closing ``]``.
    """
    members = []
    pos = offset
    while True:
        ch = s[pos]
        if ch == ']':
            break
        if ch == '\\':
            # Escaped member: take the following character verbatim.
            pos += 1
            ch = s[pos]
        members.append(ch)
        pos += 1
    return (members, pos + 1)

if __name__ == '__main__':
    # Command-line entry point: read the regex from the file named on the
    # command line and write each distinct expanded keyword, sorted, one
    # per line, to standard output.
    import sys
    import itertools

    def usage():
        """Write a one-line usage message to standard error."""
        # sys.stderr.write replaces the Python-2-only ``print >>`` form and
        # emits the same text.
        sys.stderr.write('usage: %s filename\n' % sys.argv[0])

    def uniq(iterable):
        """Collapse runs of equal consecutive items into a single item."""
        return (key for key, _group in itertools.groupby(iterable))

    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    fp = open(sys.argv[1], "rb")
    try:
        source = fp.read()
    finally:
        # Close the file even if reading fails.
        fp.close()

    # Sorting first puts duplicates next to each other so uniq() removes
    # all of them.
    for keyword in uniq(sorted(parse(source))):
        sys.stdout.write(keyword)
        sys.stdout.write("\n")