path: root/debian/upstream/scripts/count_wiktionary.py
#!/usr/bin/python3

import os
import sys
import codecs
import operator

from unidecode import unidecode

def usage():
    return '''
This script extracts words and counts from a 2006 Wiktionary word-frequency study of American
television and movie scripts. To use it, first visit the study and download, as .html files, all
26 of the frequency lists:

https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts

Put those files into a single directory and point this script at it:

%s wiktionary_html_dir ../data/us_tv_and_film.txt

The output file will contain one line per word in the study, ordered by rank, of the form:

word1 count1
word2 count2
...
    ''' % sys.argv[0]

def parse_wiki_tokens(html_doc_str):
    '''fragile hax, but checks the result at the end'''
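    # Scan the raw HTML line by line, keeping a sliding window of the last
    # three stripped lines. A data row in the study's wikitable appears as
    # three consecutive <td> cells (rank, word, count), roughly of this form
    # (illustrative, not copied from the actual pages):
    #   <td>41</td>
    #   <td><a href="/wiki/word" title="word">word</a></td>
    #   <td>12345</td>
    # The first such triple is the table header and is skipped.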
    results = []
    last3 = ['', '', '']
    header = True
    skipped = 0
    for line in html_doc_str.split('\n'):
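        # slide the window: drop the oldest line, append the newest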
        last3.pop(0)
        last3.append(line.strip())
        if all(s.startswith('<td>') and s != '<td></td>' for s in last3):
            if header:
                header = False
                continue
            last3 = [s.replace('<td>', '').replace('</td>', '').strip() for s in last3]
            rank, token, count = last3
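            # the rank cell can carry extra text after the number; keep the leading integer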
            rank = int(rank.split()[0])
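            # the word is wrapped in a link; drop the closing tag, then
            # everything up to and including the first '>'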
            token = token.replace('</a>', '')
            token = token[token.index('>')+1:]
            token = normalize(token)
            # wiktionary has thousands of words that end in 's;
            # keep the common ones (rank 1000 or better) and discard the rest,
            # otherwise we end up with a bunch of duplicates, e.g. victor / victor's
            if token.endswith("'s") and rank > 1000:
                skipped += 1
                continue
            count = int(count)
            results.append((rank, token, count))
    # early docs have 1k entries, later 2k, last 1284
    assert len(results) + skipped in [1000, 2000, 1284]
    return results

def normalize(token):
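    # transliterate non-ASCII characters to their closest ASCII equivalent,
    # then lowercase, so spelling variants collapse to a single form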
    return unidecode(token).lower()

def main(wiktionary_html_root, output_filename):
    rank_token_count = [] # list of (rank, token, count) tuples
    for filename in os.listdir(wiktionary_html_root):
        path = os.path.join(wiktionary_html_root, filename)
        with codecs.open(path, 'r', 'utf8') as f:
            rank_token_count.extend(parse_wiki_tokens(f.read()))
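    # merge the per-file lists and order the combined entries by study-wide rank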
    rank_token_count.sort(key=operator.itemgetter(0))
    with codecs.open(output_filename, 'w', 'utf8') as f:
        for rank, token, count in rank_token_count:
            f.write('%-18s %d\n' % (token, count))

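# Example invocation (the input directory name here is illustrative):
#   ./count_wiktionary.py wiktionary_html_dir ../data/us_tv_and_film.txt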
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print(usage())
    else:
        main(*sys.argv[1:])
    sys.exit(0)