Diffstat (limited to 'debian/upstream/scripts/count_wikipedia.py')
-rw-r--r-- | debian/upstream/scripts/count_wikipedia.py | 173 |
1 file changed, 173 insertions, 0 deletions
diff --git a/debian/upstream/scripts/count_wikipedia.py b/debian/upstream/scripts/count_wikipedia.py
new file mode 100644
index 0000000..aacf8d0
--- /dev/null
+++ b/debian/upstream/scripts/count_wikipedia.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+
+import sys
+import os
+import re
+import codecs
+import operator
+import datetime
+import nltk
+import warnings
+
+from unidecode import unidecode
+
+def usage():
+    print '''
+tokenize a directory of text and count unigrams.
+
+usage:
+%s input_dir ../data/english_wikipedia.txt
+
+input_dir is the root directory where sentence files live. Each file should contain
+one sentence per line, with punctuation. This script will walk the directory recursively,
+looking for text files. For each text file, it will tokenize each sentence into words and
+add them to a global unigram count, outputted to output.txt of the form:
+
+word count
+word count
+...
+
+in descending order of count.
+
+For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:
+http://www.cis.upenn.edu/~treebank/tokenizer.sed
+http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank
+
+For input sentences, this script allows for the format output by WikiExtractor.py
+https://github.com/attardi/wikiextractor
+
+That is,
+- lines starting with <doc... are ignored
+- lines starting with </doc> are ignored
+- blank lines are ignored
+
+To obtain wikipedia dumps, visit: https://dumps.wikimedia.org/enwiki
+And download the file ending in '-pages-articles.xml.bz2'. This includes wikipedia pages
+and articles but not previous revisions, edit history, and metadata.
+
+Then run:
+./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
+
+''' % sys.argv[0]
+
+SENTENCES_PER_BATCH = 500000 # after each batch, delete all counts with count == 1 (hapax legomena)
+PRE_SORT_CUTOFF = 300        # before sorting, discard all words with less than this count
+
+ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
+SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)
+
+class TopTokenCounter(object):
+    def __init__(self):
+        self.count = {}
+        self.legomena = set()
+        self.discarded = set()
+
+    def add_tokens(self, tokens, split_hyphens=True):
+        for token in tokens:
+            # add eg 'marxist-leninist' as two tokens instead of one
+            if split_hyphens and token.count('-') in [1, 2]:
+                for subtoken in token.split('-'):
+                    self.add_token(subtoken)
+            else:
+                self.add_token(token)
+
+    def add_token(self, token):
+        if not self.should_include(token):
+            self.discarded.add(token)
+            return
+        token = self.normalize(token)
+        if token in self.count:
+            self.legomena.discard(token)
+            self.count[token] += 1
+        else:
+            self.legomena.add(token)
+            self.count[token] = 1
+
+    def should_include(self, token):
+        if len(token) < 2:
+            return False
+        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
+            # B., '', (), ...
+            return False
+        if ALL_NON_ALPHA.match(token):
+            # 1,000, <<>>, ...
+            return False
+        if token.startswith('/'):
+            # eg //en.wikipedia.org/wiki, /doc
+            return False
+        if token.endswith('='):
+            # id=, title=, ...
+            return False
+        return True
+
+    def normalize(self, token):
+        return token.lower()
+
+    def batch_prune(self):
+        for token in self.legomena:
+            del self.count[token]
+        self.legomena = set()
+
+    def pre_sort_prune(self):
+        under_cutoff = set()
+        for token, count in self.count.iteritems():
+            if count < PRE_SORT_CUTOFF:
+                under_cutoff.add(token)
+        for token in under_cutoff:
+            del self.count[token]
+        self.legomena = set()
+
+    def get_sorted_pairs(self):
+        return sorted(self.count.items(), key=operator.itemgetter(1), reverse=True)
+
+    def get_ts(self):
+        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
+
+    def get_stats(self):
+        ts = self.get_ts()
+        return "%s keys(count): %d" % (ts, len(self.count))
+
+def main(input_dir_str, output_filename):
+    counter = TopTokenCounter()
+    print counter.get_ts(), 'starting...'
+    lines = 0
+    for root, dirs, files in os.walk(input_dir_str, topdown=True):
+        if not files:
+            continue
+        for fname in files:
+            path = os.path.join(root, fname)
+            for line in codecs.open(path, 'r', 'utf8'):
+                with warnings.catch_warnings():
+                    # unidecode() occasionally (rarely but enough to clog terminal output)
+                    # complains about surrogate characters in some wikipedia sentences.
+                    # ignore those warnings.
+                    warnings.simplefilter('ignore')
+                    line = unidecode(line)
+                tokens = nltk.word_tokenize(line)
+                counter.add_tokens(tokens)
+                lines += 1
+                if lines % SENTENCES_PER_BATCH == 0:
+                    counter.batch_prune()
+                    print counter.get_stats()
+            print 'processing: %s' % path
+    print counter.get_stats()
+    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
+    counter.pre_sort_prune()
+    print 'done'
+    print counter.get_stats()
+    print counter.get_ts(), 'sorting...'
+    sorted_pairs = counter.get_sorted_pairs()
+    print counter.get_ts(), 'done'
+    print 'writing...'
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for token, count in sorted_pairs:
+            f.write('%-18s %d\n' % (token, count))
+    sys.exit(0)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        usage()
+        sys.exit(0)
+    else:
+        main(*sys.argv[1:])
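
For reference, below is a minimal sketch (not part of the patch above) of the same tokenize-and-count approach, run on a couple of in-memory sample sentences instead of a directory of WikiExtractor output. It assumes Python 2 with nltk (plus its 'punkt' tokenizer data) and unidecode installed, and it mirrors the script's normalization (unidecode, lowercasing, descending-count output) while omitting the hyphen splitting and pruning passes.

#!/usr/bin/python
# Sketch only: count lowercased unigrams in sample sentences using the
# same nltk.word_tokenize() call the upstream script relies on.
import operator
import nltk
from unidecode import unidecode

counts = {}
sentences = [
    u"The quick brown fox jumps over the lazy dog.",
    u"The dog sleeps.",
]
for sentence in sentences:
    # map unicode text to ASCII before tokenizing, as main() does per line
    for token in nltk.word_tokenize(unidecode(sentence)):
        token = token.lower()
        counts[token] = counts.get(token, 0) + 1

# same descending-count ordering as TopTokenCounter.get_sorted_pairs()
for token, count in sorted(counts.items(), key=operator.itemgetter(1), reverse=True):
    print '%-18s %d' % (token, count)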