Diffstat (limited to 'debian/upstream/scripts/count_wiktionary.py')
 debian/upstream/scripts/count_wiktionary.py | 80 ++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+), 0 deletions(-)
diff --git a/debian/upstream/scripts/count_wiktionary.py b/debian/upstream/scripts/count_wiktionary.py
new file mode 100644
index 0000000..b538d76
--- /dev/null
+++ b/debian/upstream/scripts/count_wiktionary.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+import os
+import sys
+import codecs
+import operator
+
+from unidecode import unidecode
+
+def usage():
+    return '''
+This script extracts words and counts from a 2006 Wiktionary word frequency study of American
+television and movie scripts. To use, first visit the study and download, as .html files, all 26
+of the frequency lists:
+
+https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts
+
+Put those into a single directory and point this script at it:
+
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
+
+The output file will include one line per word in the study, ordered by rank, of the form:
+
+word1 count1
+word2 count2
+...
+    ''' % sys.argv[0]
+
+def parse_wiki_tokens(html_doc_str):
+    '''fragile hax, but checks the result at the end'''
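+    # Scans for three consecutive table cells holding rank, linked word, and
+    # count. Illustrative shape only (actual Wiktionary markup may differ):
+    #   <td>21</td>
+    #   <td><a href="/wiki/you" title="you">you</a></td>
+    #   <td>1222421</td>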
+    results = []
+    last3 = ['', '', '']
+    header = True
+    skipped = 0
+    for line in html_doc_str.split('\n'):
+        last3.pop(0)
+        last3.append(line.strip())
+        if all(s.startswith('<td>') and s != '<td></td>' for s in last3):
+            if header:
+                header = False
+                continue
+            last3 = [s.replace('<td>', '').replace('</td>', '').strip() for s in last3]
+            rank, token, count = last3
+            rank = int(rank.split()[0])
+            token = token.replace('</a>', '')
+            token = token[token.index('>')+1:]
+            token = normalize(token)
+            # wiktionary has thousands of words that end in 's
+            # keep the common ones (rank under 1000), discard the rest
+            #
+            # otherwise we'd end up with a bunch of duplicates, e.g. victor / victor's
+            if token.endswith("'s") and rank > 1000:
+                skipped += 1
+                continue
+            count = int(count)
+            results.append((rank, token, count))
+    # early pages have 1k entries, later ones 2k, the last one 1284
+    assert len(results) + skipped in [1000, 2000, 1284]
+    return results
+
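+# unidecode transliterates non-ASCII to ASCII, e.g. normalize(u'Fiancé') -> 'fiance'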
+def normalize(token):
+    return unidecode(token).lower()
+
+def main(wiktionary_html_root, output_filename):
+    rank_token_count = [] # list of 3-tuples
+    for filename in os.listdir(wiktionary_html_root):
+        path = os.path.join(wiktionary_html_root, filename)
+        with codecs.open(path, 'r', 'utf8') as f:
+            rank_token_count.extend(parse_wiki_tokens(f.read()))
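+    # entries from all 26 pages carry study-wide ranks, so a single sort
+    # restores the global frequency order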
+    rank_token_count.sort(key=operator.itemgetter(0))
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for rank, token, count in rank_token_count:
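+            # rank is implicit in the sorted order; pad tokens to an 18-char column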
+            f.write('%-18s %d\n' % (token, count))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print(usage())
+    else:
+        main(*sys.argv[1:])
+        sys.exit(0)