1 files changed, 80 insertions, 0 deletions
diff --git a/debian/upstream/scripts/count_wiktionary.py b/debian/upstream/scripts/count_wiktionary.py
new file mode 100644
index 0000000..b538d76
--- /dev/null
+++ b/debian/upstream/scripts/count_wiktionary.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+import os
+import sys
+import codecs
+import operator
+
+from unidecode import unidecode
+
+def usage():
+    return '''
+This script extracts words and counts from a 2006 wiktionary word frequency study over American
+television and movies. To use, first visit the study and download, as .html files, all 26 of the
+frequency lists:
+
+https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts
+
+Put those into a single directory and point it to this script:
+
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
+
+output.txt will include one line per word in the study, ordered by rank, of the form:
+
+word1 count1
+word2 count2
+...
+    ''' % sys.argv[0]
+
+def parse_wiki_tokens(html_doc_str):
+    '''fragile hax, but checks the result at the end'''
+    results = []
+    last3 = ['', '', '']
+    header = True
+    skipped = 0
+    for line in html_doc_str.split('\n'):
+        last3.pop(0)
+        last3.append(line.strip())
+        if all(s.startswith('<td>') and not s == '<td></td>' for s in last3):
+            if header:
+                header = False
+                continue
+            last3 = [s.replace('<td>', '').replace('</td>', '').strip() for s in last3]
+            rank, token, count = last3
+            rank = int(rank.split()[0])
+            token = token.replace('</a>', '')
+            token = token[token.index('>')+1:]
+            token = normalize(token)
+            # wikitonary has thousands of words that end in 's
+            # keep the common ones (rank under 1000), discard the rest
+            #
+            # otherwise end up with a bunch of duplicates eg victor / victor's
+            if token.endswith("'s") and rank > 1000:
+                skipped += 1
+                continue
+            count = int(count)
+            results.append((rank, token, count))
+    # early docs have 1k entries, later 2k, last 1284
+    assert len(results) + skipped in [1000, 2000, 1284]
+    return results
+
+def normalize(token):
+    return unidecode(token).lower()
+
+def main(wiktionary_html_root, output_filename):
+    rank_token_count = [] # list of 3-tuples
+    for filename in os.listdir(wiktionary_html_root):
+        path = os.path.join(wiktionary_html_root, filename)
+        with codecs.open(path, 'r', 'utf8') as f:
+            rank_token_count.extend(parse_wiki_tokens(f.read()))
+    rank_token_count.sort(key=operator.itemgetter(0))
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for rank, token, count in rank_token_count:
+            f.write('%-18s %d\n' % (token, count))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print usage()
+    else:
+        main(*sys.argv[1:])
+    sys.exit(0)