Diffstat (limited to 'debian/upstream/scripts/count_wikipedia.py')
-rw-r--r-- | debian/upstream/scripts/count_wikipedia.py | 173 |
1 file changed, 173 insertions, 0 deletions
diff --git a/debian/upstream/scripts/count_wikipedia.py b/debian/upstream/scripts/count_wikipedia.py
new file mode 100644
index 0000000..aacf8d0
--- /dev/null
+++ b/debian/upstream/scripts/count_wikipedia.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+
+import sys
+import os
+import re
+import codecs
+import operator
+import datetime
+import nltk
+import warnings
+
+from unidecode import unidecode
+
+def usage():
+    print '''
+tokenize a directory of text and count unigrams.
+
+usage:
+%s input_dir ../data/english_wikipedia.txt
+
+input_dir is the root directory where sentence files live. Each file should contain
+one sentence per line, with punctuation. This script will walk the directory recursively,
+looking for text files. For each text file, it will tokenize each sentence into words and
+add them to a global unigram count, outputted to output.txt of the form:
+
+word count
+word count
+...
+
+in descending order of count.
+
+For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:
+http://www.cis.upenn.edu/~treebank/tokenizer.sed
+http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank
+
+For input sentences, this script allows for the format output by WikiExtractor.py
+https://github.com/attardi/wikiextractor
+
+That is,
+- lines starting with <doc... are ignored
+- lines starting with </doc> are ignored
+- blank lines are ignored
+
+To obtain wikipedia dumps, visit: https://dumps.wikimedia.org/enwiki
+And download the file ending in '-pages-articles.xml.bz2'. This includes wikipedia pages
+and articles but not previous revisions, edit history, and metadata.
+
+Then run:
+./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
+
+''' % sys.argv[0]
+
+SENTENCES_PER_BATCH = 500000 # after each batch, delete all counts with count == 1 (hapax legomena)
+PRE_SORT_CUTOFF = 300        # before sorting, discard all words with less than this count
+
+ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
+SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)
+
+class TopTokenCounter(object):
+    def __init__(self):
+        self.count = {}
+        self.legomena = set()
+        self.discarded = set()
+
+    def add_tokens(self, tokens, split_hyphens=True):
+        for token in tokens:
+            # add eg 'marxist-leninist' as two tokens instead of one
+            if split_hyphens and token.count('-') in [1, 2]:
+                for subtoken in token.split('-'):
+                    self.add_token(subtoken)
+            else:
+                self.add_token(token)
+
+    def add_token(self, token):
+        if not self.should_include(token):
+            self.discarded.add(token)
+            return
+        token = self.normalize(token)
+        if token in self.count:
+            self.legomena.discard(token)
+            self.count[token] += 1
+        else:
+            self.legomena.add(token)
+            self.count[token] = 1
+
+    def should_include(self, token):
+        if len(token) < 2:
+            return False
+        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
+            # B., '', (), ...
+            return False
+        if ALL_NON_ALPHA.match(token):
+            # 1,000, <<>>, ...
+            return False
+        if token.startswith('/'):
+            # eg //en.wikipedia.org/wiki, /doc
+            return False
+        if token.endswith('='):
+            # id=, title=, ...
+            return False
+        return True
+
+    def normalize(self, token):
+        return token.lower()
+
+    def batch_prune(self):
+        for token in self.legomena:
+            del self.count[token]
+        self.legomena = set()
+
+    def pre_sort_prune(self):
+        under_cutoff = set()
+        for token, count in self.count.iteritems():
+            if count < PRE_SORT_CUTOFF:
+                under_cutoff.add(token)
+        for token in under_cutoff:
+            del self.count[token]
+        self.legomena = set()
+
+    def get_sorted_pairs(self):
+        return sorted(self.count.items(), key=operator.itemgetter(1), reverse=True)
+
+    def get_ts(self):
+        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
+
+    def get_stats(self):
+        ts = self.get_ts()
+        return "%s keys(count): %d" % (ts, len(self.count))
+
+def main(input_dir_str, output_filename):
+    counter = TopTokenCounter()
+    print counter.get_ts(), 'starting...'
+    lines = 0
+    for root, dirs, files in os.walk(input_dir_str, topdown=True):
+        if not files:
+            continue
+        for fname in files:
+            path = os.path.join(root, fname)
+            for line in codecs.open(path, 'r', 'utf8'):
+                with warnings.catch_warnings():
+                    # unidecode() occasionally (rarely but enough to clog terminal output)
+                    # complains about surrogate characters in some wikipedia sentences.
+                    # ignore those warnings.
+                    warnings.simplefilter('ignore')
+                    line = unidecode(line)
+                tokens = nltk.word_tokenize(line)
+                counter.add_tokens(tokens)
+                lines += 1
+                if lines % SENTENCES_PER_BATCH == 0:
+                    counter.batch_prune()
+                    print counter.get_stats()
+            print 'processing: %s' % path
+    print counter.get_stats()
+    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
+    counter.pre_sort_prune()
+    print 'done'
+    print counter.get_stats()
+    print counter.get_ts(), 'sorting...'
+    sorted_pairs = counter.get_sorted_pairs()
+    print counter.get_ts(), 'done'
+    print 'writing...'
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for token, count in sorted_pairs:
+            f.write('%-18s %d\n' % (token, count))
+    sys.exit(0)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        usage()
+        sys.exit(0)
+    else:
+        main(*sys.argv[1:])
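
For reference, below is a minimal sketch (not part of the patch above) of the same tokenize-and-count approach, run on a couple of in-memory sample sentences instead of a directory of WikiExtractor output. It assumes Python 2 with nltk (plus its 'punkt' tokenizer data) and unidecode installed, and it mirrors the script's normalization (unidecode, lowercasing, descending-count output) while omitting the hyphen splitting and pruning passes.

#!/usr/bin/python
# Sketch only: count lowercased unigrams in sample sentences using the
# same nltk.word_tokenize() call the upstream script relies on.
import operator
import nltk
from unidecode import unidecode

counts = {}
sentences = [
    u"The quick brown fox jumps over the lazy dog.",
    u"The dog sleeps.",
]
for sentence in sentences:
    # map unicode text to ASCII before tokenizing, as main() does per line
    for token in nltk.word_tokenize(unidecode(sentence)):
        token = token.lower()
        counts[token] = counts.get(token, 0) + 1

# same descending-count ordering as TopTokenCounter.get_sorted_pairs()
for token, count in sorted(counts.items(), key=operator.itemgetter(1), reverse=True):
    print '%-18s %d' % (token, count)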