From 68e192f281100e02c24ecefc25ab3b9a70a17714 Mon Sep 17 00:00:00 2001
From: Sean Whitton
Date: Tue, 29 Nov 2016 16:22:23 -0700
Subject: add debian/upstream/scripts

---
 debian/README.source                        |  15 +--
 debian/changelog                            |   2 +-
 debian/copyright                            |   2 +-
 debian/upstream/scripts/count_wikipedia.py  | 173 ++++++++++++++++++++++++++++
 debian/upstream/scripts/count_wiktionary.py |  80 +++++++++++++
 5 files changed, 263 insertions(+), 9 deletions(-)
 create mode 100644 debian/upstream/scripts/count_wikipedia.py
 create mode 100644 debian/upstream/scripts/count_wiktionary.py

diff --git a/debian/README.source b/debian/README.source
index 6710099..64ab7d5 100644
--- a/debian/README.source
+++ b/debian/README.source
@@ -27,8 +27,8 @@ for this Debian package.
 USE_DICT_FILE is unset. Calls to ZxcvbnInit() and ZxcvbnUninit() are
 not required, and README.md has been patched accordingly.
 
-debian/missing-sources
-----------------------
+debian/missing-sources & debian/upstream/scripts
+------------------------------------------------
 
 words-female.txt, words-male.txt and words-surname.txt are not in
 their preferred format for modification. The raw US census data, and
@@ -39,11 +39,12 @@ Some of the other words-*.txt files were generated by scripts
 from HTML dumps of Wikipedia and Wiktionary articles. If someone
 wanted to modify these word lists, it would be appropriate to modify
 the words-*.txt files directly, rather than those HTML dumps.
-Consequently, the HTML files and scripts have not been included in
-this source package. For the curious, the scripts, and instructions
-on how to run them, are available online:
+Consequently, the HTML files have not been included.
 
-    https://github.com/dropbox/zxcvbn/tree/master/data-scripts
+In case they are useful to someone, a copy of the scripts has been
+included in debian/upstream/scripts. A separate directory is used to
+indicate that these scripts are included as a convenience, not in
+order to satisfy the DFSG.
 
 DFSG repacking
 --------------
@@ -70,4 +71,4 @@ source, I have replaced words-passwd.txt with the old
 words-10k-pass.txt, and restored references to words-10k-pass.txt in
 the code.
 
- -- Sean Whitton , Tue, 29 Nov 2016 16:07:52 -0700
+ -- Sean Whitton , Tue, 29 Nov 2016 16:21:59 -0700
diff --git a/debian/changelog b/debian/changelog
index c8fde22..5c08ebb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -11,7 +11,7 @@ zxcvbn-c (2.0+dfsg-1) UNRELEASED; urgency=medium
   * Update d/copyright for changes to words-*.txt files.
   * Add a d/copyright stanza for the upstream makefile.
   * Add Files-Excluded: field to d/copyright.
-  * Add debian/missing-sources
+  * Add debian/missing-sources & debian/upstream/scripts
     - Update d/copyright accordingly
     - Add explanatory text to README.source
   * Add explanatory text to README.source regarding DFSG repacking.
diff --git a/debian/copyright b/debian/copyright
index ea165cc..cf327a7 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -27,7 +27,7 @@
 Files: debian/*
 Copyright: (C) 2016 Sean Whitton
 License: BSD-3-clause
 
-Files: debian/missing-sources/*.py words-eng_wiki.txt
+Files: debian/missing-sources/*.py debian/upstream/scripts/*.py words-eng_wiki.txt
 Copyright: (C) Copyright (c) 2012-2016 Dan Wheeler and Dropbox, Inc.
 License: Expat
diff --git a/debian/upstream/scripts/count_wikipedia.py b/debian/upstream/scripts/count_wikipedia.py
new file mode 100644
index 0000000..aacf8d0
--- /dev/null
+++ b/debian/upstream/scripts/count_wikipedia.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+
+import sys
+import os
+import re
+import codecs
+import operator
+import datetime
+import nltk
+import warnings
+
+from unidecode import unidecode
+
+def usage():
+    print '''
+tokenize a directory of text and count unigrams.
+
+usage:
+%s input_dir ../data/english_wikipedia.txt
+
+input_dir is the root directory where sentence files live. Each file should contain
+one sentence per line, with punctuation. This script will walk the directory recursively,
+looking for text files. For each text file, it will tokenize each sentence into words and
+add them to a global unigram count, written to the output file in the form:
+
+word count
+word count
+...
+
+in descending order of count.
+
+For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:
+http://www.cis.upenn.edu/~treebank/tokenizer.sed
+http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank
+
+For input sentences, this script allows for the format output by WikiExtractor.py
+https://github.com/attardi/wikiextractor
+
+That is,
+- lines starting with <doc ...> or </doc> are ignored
+- blank lines are ignored
+
+To obtain wikipedia dumps, visit: https://dumps.wikimedia.org/enwiki
+and download the file ending in '-pages-articles.xml.bz2'. This includes wikipedia pages
+and articles but not previous revisions, edit history, and metadata.
+
+Then run:
+./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
+
+''' % sys.argv[0]
+
+SENTENCES_PER_BATCH = 500000 # after each batch, delete all counts with count == 1 (hapax legomena)
+PRE_SORT_CUTOFF = 300        # before sorting, discard all words with less than this count
+
+ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
+SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)
+
+class TopTokenCounter(object):
+    def __init__(self):
+        self.count = {}
+        self.legomena = set()
+        self.discarded = set()
+
+    def add_tokens(self, tokens, split_hyphens=True):
+        for token in tokens:
+            # add eg 'marxist-leninist' as two tokens instead of one
+            if split_hyphens and token.count('-') in [1, 2]:
+                for subtoken in token.split('-'):
+                    self.add_token(subtoken)
+            else:
+                self.add_token(token)
+
+    def add_token(self, token):
+        if not self.should_include(token):
+            self.discarded.add(token)
+            return
+        token = self.normalize(token)
+        if token in self.count:
+            self.legomena.discard(token)
+            self.count[token] += 1
+        else:
+            self.legomena.add(token)
+            self.count[token] = 1
+
+    def should_include(self, token):
+        if len(token) < 2:
+            return False
+        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
+            # B., '', (), ...
+            return False
+        if ALL_NON_ALPHA.match(token):
+            # 1,000, <<>>, ...
+            return False
+        if token.startswith('/'):
+            # eg //en.wikipedia.org/wiki, /doc
+            return False
+        if token.endswith('='):
+            # id=, title=, ...
+            return False
+        return True
+
+    def normalize(self, token):
+        return token.lower()
+
+    def batch_prune(self):
+        for token in self.legomena:
+            del self.count[token]
+        self.legomena = set()
+
+    def pre_sort_prune(self):
+        under_cutoff = set()
+        for token, count in self.count.iteritems():
+            if count < PRE_SORT_CUTOFF:
+                under_cutoff.add(token)
+        for token in under_cutoff:
+            del self.count[token]
+        self.legomena = set()
+
+    def get_sorted_pairs(self):
+        return sorted(self.count.items(), key=operator.itemgetter(1), reverse=True)
+
+    def get_ts(self):
+        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
+
+    def get_stats(self):
+        ts = self.get_ts()
+        return "%s keys(count): %d" % (ts, len(self.count))
+
+def main(input_dir_str, output_filename):
+    counter = TopTokenCounter()
+    print counter.get_ts(), 'starting...'
+    lines = 0
+    for root, dirs, files in os.walk(input_dir_str, topdown=True):
+        if not files:
+            continue
+        for fname in files:
+            path = os.path.join(root, fname)
+            for line in codecs.open(path, 'r', 'utf8'):
+                with warnings.catch_warnings():
+                    # unidecode() occasionally (rarely but enough to clog terminal output)
+                    # complains about surrogate characters in some wikipedia sentences.
+                    # ignore those warnings.
+                    warnings.simplefilter('ignore')
+                    line = unidecode(line)
+                tokens = nltk.word_tokenize(line)
+                counter.add_tokens(tokens)
+                lines += 1
+                if lines % SENTENCES_PER_BATCH == 0:
+                    counter.batch_prune()
+                    print counter.get_stats()
+            print 'processing: %s' % path
+    print counter.get_stats()
+    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
+    counter.pre_sort_prune()
+    print 'done'
+    print counter.get_stats()
+    print counter.get_ts(), 'sorting...'
+    sorted_pairs = counter.get_sorted_pairs()
+    print counter.get_ts(), 'done'
+    print 'writing...'
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for token, count in sorted_pairs:
+            f.write('%-18s %d\n' % (token, count))
+    sys.exit(0)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        usage()
+        sys.exit(0)
+    else:
+        main(*sys.argv[1:])
+
diff --git a/debian/upstream/scripts/count_wiktionary.py b/debian/upstream/scripts/count_wiktionary.py
new file mode 100644
index 0000000..b538d76
--- /dev/null
+++ b/debian/upstream/scripts/count_wiktionary.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+import os
+import sys
+import codecs
+import operator
+
+from unidecode import unidecode
+
+def usage():
+    return '''
+This script extracts words and counts from a 2006 wiktionary word frequency study over American
+television and movies. To use, first visit the study and download, as .html files, all 26 of the
+frequency lists:
+
+https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts
+
+Put those into a single directory and point this script at it:
+
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
+
+The output file will include one line per word in the study, ordered by rank, of the form:
+
+word1 count1
+word2 count2
+...
+    ''' % sys.argv[0]
+
+def parse_wiki_tokens(html_doc_str):
+    '''fragile hax, but checks the result at the end'''
+    results = []
+    last3 = ['', '', '']
+    header = True
+    skipped = 0
+    for line in html_doc_str.split('\n'):
+        last3.pop(0)
+        last3.append(line.strip())
+        if all(s.startswith('<td>') and not s == '<td></td>' for s in last3):
+            if header:
+                header = False
+                continue
+            last3 = [s.replace('<td>', '').replace('</td>', '').strip() for s in last3]
+            rank, token, count = last3
+            rank = int(rank.split()[0])
+            token = token.replace('</a>', '')
+            token = token[token.index('>')+1:]
+            token = normalize(token)
+            # wiktionary has thousands of words that end in 's
+            # keep the common ones (rank under 1000), discard the rest
+            #
+            # otherwise end up with a bunch of duplicates eg victor / victor's
+            if token.endswith("'s") and rank > 1000:
+                skipped += 1
+                continue
+            count = int(count)
+            results.append((rank, token, count))
+    # early docs have 1k entries, later 2k, last 1284
+    assert len(results) + skipped in [1000, 2000, 1284]
+    return results
+
+def normalize(token):
+    return unidecode(token).lower()
+
+def main(wiktionary_html_root, output_filename):
+    rank_token_count = [] # list of 3-tuples
+    for filename in os.listdir(wiktionary_html_root):
+        path = os.path.join(wiktionary_html_root, filename)
+        with codecs.open(path, 'r', 'utf8') as f:
+            rank_token_count.extend(parse_wiki_tokens(f.read()))
+    rank_token_count.sort(key=operator.itemgetter(0))
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for rank, token, count in rank_token_count:
+            f.write('%-18s %d\n' % (token, count))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print usage()
+    else:
+        main(*sys.argv[1:])
+    sys.exit(0)
-- 
cgit v1.2.3
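
Both scripts emit their word lists in the plain "word count" format described in
their usage text, one pair per line. For anyone regenerating a list with them, the
following is a minimal sketch of a format check, written in Python 2 to match the
scripts above. It is illustrative only and not part of the patch; the word-list
filename passed on the command line is whatever file you generated.

#!/usr/bin/python
# Illustrative sketch only, not part of the patch: sanity-check that a
# file written by count_wikipedia.py or count_wiktionary.py contains
# one "word count" pair per line, as their usage text describes.

import sys
import codecs

def check_counts_file(path):
    entries = 0
    with codecs.open(path, 'r', 'utf8') as f:
        for lineno, line in enumerate(f, 1):
            fields = line.split()
            # each line is a token followed by its integer count
            if len(fields) != 2 or not fields[1].isdigit():
                raise ValueError('%s:%d: expected "word count", got %r'
                                 % (path, lineno, line.rstrip('\n')))
            entries += 1
    return entries

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'usage: %s wordlist.txt' % sys.argv[0]
        sys.exit(1)
    print '%s: %d entries' % (sys.argv[1], check_counts_file(sys.argv[1]))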