#!/usr/bin/python
import sys
import os
import re
import codecs
import operator
import datetime
import nltk
import warnings

from unidecode import unidecode


def usage():
    print '''
tokenize a directory of text and count unigrams.

usage:
%s input_dir ../data/english_wikipedia.txt

input_dir is the root directory where sentence files live. Each file should
contain one sentence per line, with punctuation. This script will walk the
directory recursively, looking for text files. For each text file, it will
tokenize each sentence into words and add them to a global unigram count,
written to the output file (second argument) of the form:

word count
word count
...

in descending order of count.

For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:

http://www.cis.upenn.edu/~treebank/tokenizer.sed
http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank

For input sentences, this script allows for the format output by WikiExtractor.py:

https://github.com/attardi/wikiextractor

That is,

- lines starting with <doc are ignored
- blank lines are ignored

To obtain wikipedia dumps, visit:

https://dumps.wikimedia.org/enwiki

And download the file ending in '-pages-articles.xml.bz2'. This includes
wikipedia pages and articles but not previous revisions, edit history, and
metadata.

Then run:

./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
''' % sys.argv[0]

SENTENCES_PER_BATCH = 500000  # after each batch, delete all counts with count == 1 (hapax legomena)
PRE_SORT_CUTOFF = 300         # before sorting, discard all words seen fewer than this many times

ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)


class TopTokenCounter(object):
    def __init__(self):
        self.count = {}
        self.legomena = set()
        self.discarded = set()

    def add_tokens(self, tokens, split_hyphens=True):
        for token in tokens:
            # add eg 'marxist-leninist' as two tokens instead of one
            if split_hyphens and token.count('-') in [1, 2]:
                for subtoken in token.split('-'):
                    self.add_token(subtoken)
            else:
                self.add_token(token)

    def add_token(self, token):
        if not self.should_include(token):
            self.discarded.add(token)
            return
        token = self.normalize(token)
        if token in self.count:
            self.legomena.discard(token)
            self.count[token] += 1
        else:
            self.legomena.add(token)
            self.count[token] = 1

    def should_include(self, token):
        if len(token) < 2:
            return False
        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
            # B., '', (), ...
            return False
        if ALL_NON_ALPHA.match(token):
            # 1,000, <<>>, ...
            return False
        if token.startswith('/'):
            # eg //en.wikipedia.org/wiki, /doc
            return False
        if token.endswith('='):
            # id=, title=, ...
            return False
        return True

    def normalize(self, token):
        return token.lower()

    def batch_prune(self):
        # drop tokens seen exactly once so far (hapax legomena) to bound memory
        for token in self.legomena:
            del self.count[token]
        self.legomena = set()

    def pre_sort_prune(self):
        under_cutoff = set()
        for token, count in self.count.iteritems():
            if count < PRE_SORT_CUTOFF:
                under_cutoff.add(token)
        for token in under_cutoff:
            del self.count[token]
        self.legomena = set()

    def get_sorted_pairs(self):
        return sorted(self.count.items(),
                      key=operator.itemgetter(1), reverse=True)

    def get_ts(self):
        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")

    def get_stats(self):
        ts = self.get_ts()
        return "%s keys(count): %d" % (ts, len(self.count))


def main(input_dir_str, output_filename):
    counter = TopTokenCounter()
    print counter.get_ts(), 'starting...'
    lines = 0
    for root, dirs, files in os.walk(input_dir_str, topdown=True):
        if not files:
            continue
        for fname in files:
            path = os.path.join(root, fname)
            for line in codecs.open(path, 'r', 'utf8'):
                with warnings.catch_warnings():
                    # unidecode() occasionally (rarely, but enough to clog terminal output)
                    # complains about surrogate characters in some wikipedia sentences.
                    # ignore those warnings.
                    warnings.simplefilter('ignore')
                    line = unidecode(line)
                tokens = nltk.word_tokenize(line)
                counter.add_tokens(tokens)
                lines += 1
                if lines % SENTENCES_PER_BATCH == 0:
                    counter.batch_prune()
                    print counter.get_stats()
            print 'processing: %s' % path
    print counter.get_stats()
    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
    counter.pre_sort_prune()
    print 'done'
    print counter.get_stats()
    print counter.get_ts(), 'sorting...'
    sorted_pairs = counter.get_sorted_pairs()
    print counter.get_ts(), 'done'
    print 'writing...'
    with codecs.open(output_filename, 'w', 'utf8') as f:
        for token, count in sorted_pairs:
            f.write('%-18s %d\n' % (token, count))
    sys.exit(0)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        usage()
        sys.exit(0)
    else:
        main(*sys.argv[1:])
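
# A minimal end-to-end example of how this script is typically run, assuming it is
# saved as count_wikipedia.py (the script name, output directory name, and dump date
# below are illustrative, not fixed by this file):
#
#   ./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
#   python2 count_wikipedia.py en_sents ../data/english_wikipedia.txt
#
# The output file then contains one "token count" pair per line, with the token
# left-justified to 18 characters (the '%-18s %d' format used in main()) and the
# pairs sorted by descending count.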