Diffstat (limited to 'debian/upstream/scripts/count_wikipedia.py')
-rw-r--r--  debian/upstream/scripts/count_wikipedia.py | 173
1 file changed, 173 insertions(+), 0 deletions(-)
diff --git a/debian/upstream/scripts/count_wikipedia.py b/debian/upstream/scripts/count_wikipedia.py
new file mode 100644
index 0000000..aacf8d0
--- /dev/null
+++ b/debian/upstream/scripts/count_wikipedia.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+
+import sys
+import os
+import re
+import codecs
+import operator
+import datetime
+import nltk
+import warnings
+
+from unidecode import unidecode
+
+def usage():
+    print '''
+tokenize a directory of text and count unigrams.
+
+usage:
+%s input_dir ../data/english_wikipedia.txt
+
+input_dir is the root directory where sentence files live. Each file should contain
+one sentence per line, with punctuation. This script will walk the directory recursively,
+looking for text files. For each text file, it will tokenize each sentence into words and
+add them to a global unigram count, written to the given output file in the form:
+
+word count
+word count
+...
+
+in descending order of count.
+
+For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:
+http://www.cis.upenn.edu/~treebank/tokenizer.sed
+http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank
+
+For input sentences, this script allows for the format output by WikiExtractor.py
+https://github.com/attardi/wikiextractor
+
+That is,
+- lines starting with <doc... are ignored
+- lines starting with </doc> are ignored
+- blank lines are ignored
+
+To obtain wikipedia dumps, visit https://dumps.wikimedia.org/enwiki
+and download the file ending in '-pages-articles.xml.bz2'. This includes wikipedia pages
+and articles but not previous revisions, edit history, or metadata.
+
+Then run:
+./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
+
+''' % sys.argv[0]
+
+SENTENCES_PER_BATCH = 500000 # after each batch, delete all counts with count == 1 (hapax legomena)
+PRE_SORT_CUTOFF = 300 # before sorting, discard all words with a count below this cutoff
+
+ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
+SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)
+
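+# TopTokenCounter accumulates unigram counts across all input files.
+# Tokens seen exactly once so far are tracked in self.legomena so that
+# batch_prune() can drop them cheaply and keep memory use bounded.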
+class TopTokenCounter(object):
+    def __init__(self):
+        self.count = {}
+        self.legomena = set()
+        self.discarded = set()
+
+    def add_tokens(self, tokens, split_hyphens=True):
+        for token in tokens:
+            # add eg 'marxist-leninist' as two tokens instead of one
+            if split_hyphens and token.count('-') in [1, 2]:
+                for subtoken in token.split('-'):
+                    self.add_token(subtoken)
+            else:
+                self.add_token(token)
+
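+    # count a single token: rejected tokens are remembered in self.discarded,
+    # and tokens seen exactly once so far are tracked in self.legomena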
+    def add_token(self, token):
+        if not self.should_include(token):
+            self.discarded.add(token)
+            return
+        token = self.normalize(token)
+        if token in self.count:
+            self.legomena.discard(token)
+            self.count[token] += 1
+        else:
+            self.legomena.add(token)
+            self.count[token] = 1
+
+    def should_include(self, token):
+        if len(token) < 2:
+            return False
+        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
+            # B., '', (), ...
+            return False
+        if ALL_NON_ALPHA.match(token):
+            # 1,000, <<>>, ...
+            return False
+        if token.startswith('/'):
+            # eg //en.wikipedia.org/wiki, /doc
+            return False
+        if token.endswith('='):
+            # id=, title=, ...
+            return False
+        return True
+
+    def normalize(self, token):
+        return token.lower()
+
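+    # called every SENTENCES_PER_BATCH lines: drop hapax legomena (tokens
+    # seen exactly once so far) so the count dict doesn't grow without bound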
+    def batch_prune(self):
+        for token in self.legomena:
+            del self.count[token]
+        self.legomena = set()
+
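+    # called once before the final sort: drop every token whose count is
+    # below PRE_SORT_CUTOFF so sorting and output stay small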
+    def pre_sort_prune(self):
+        under_cutoff = set()
+        for token, count in self.count.iteritems():
+            if count < PRE_SORT_CUTOFF:
+                under_cutoff.add(token)
+        for token in under_cutoff:
+            del self.count[token]
+        self.legomena = set()
+
+    def get_sorted_pairs(self):
+        return sorted(self.count.items(), key=operator.itemgetter(1), reverse=True)
+
+    def get_ts(self):
+        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
+
+    def get_stats(self):
+        ts = self.get_ts()
+        return "%s keys(count): %d" % (ts, len(self.count))
+
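+# Walk input_dir_str recursively, tokenize every line of every file, and write
+# the sorted unigram counts to output_filename.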
+def main(input_dir_str, output_filename):
+    counter = TopTokenCounter()
+    print counter.get_ts(), 'starting...'
+    lines = 0
+    for root, dirs, files in os.walk(input_dir_str, topdown=True):
+        if not files:
+            continue
+        for fname in files:
+            path = os.path.join(root, fname)
+            for line in codecs.open(path, 'r', 'utf8'):
+                with warnings.catch_warnings():
+                    # unidecode() occasionally (rarely, but enough to clog terminal output)
+                    # complains about surrogate characters in some wikipedia sentences.
+                    # ignore those warnings.
+                    warnings.simplefilter('ignore')
+                    line = unidecode(line)
+                tokens = nltk.word_tokenize(line)
+                counter.add_tokens(tokens)
+                lines += 1
+                if lines % SENTENCES_PER_BATCH == 0:
+                    counter.batch_prune()
+                    print counter.get_stats()
+            print 'processing: %s' % path
+    print counter.get_stats()
+    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
+    counter.pre_sort_prune()
+    print 'done'
+    print counter.get_stats()
+    print counter.get_ts(), 'sorting...'
+    sorted_pairs = counter.get_sorted_pairs()
+    print counter.get_ts(), 'done'
+    print 'writing...'
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for token, count in sorted_pairs:
+            f.write('%-18s %d\n' % (token, count))
+    sys.exit(0)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        usage()
+        sys.exit(0)
+    else:
+        main(*sys.argv[1:])
+