author    Sean Whitton <spwhitton@spwhitton.name>    2016-11-29 16:22:23 -0700
committer Sean Whitton <spwhitton@spwhitton.name>    2016-11-29 16:22:23 -0700
commit    68e192f281100e02c24ecefc25ab3b9a70a17714 (patch)
tree      5ee64733ea29a4d8f4ec51e9a5abf39205c6deb3
parent    ad4ad38b6dfad6825b2948d5fb394aca3f5c94af (diff)
download  zxcvbn-c-68e192f281100e02c24ecefc25ab3b9a70a17714.tar.gz
add debian/upstream/scripts
-rw-r--r--  debian/README.source                          15
-rw-r--r--  debian/changelog                               2
-rw-r--r--  debian/copyright                               2
-rw-r--r--  debian/upstream/scripts/count_wikipedia.py   173
-rw-r--r--  debian/upstream/scripts/count_wiktionary.py   80
5 files changed, 263 insertions, 9 deletions
diff --git a/debian/README.source b/debian/README.source
index 6710099..64ab7d5 100644
--- a/debian/README.source
+++ b/debian/README.source
@@ -27,8 +27,8 @@ for this Debian package. USE_DICT_FILE is unset. Calls to
ZxcvbnInit() and ZxcvbnUninit() are not required, and README.md has
been patched accordingly.
-debian/missing-sources
-----------------------
+debian/missing-sources & debian/upstream/scripts
+------------------------------------------------
words-female.txt, words-male.txt and words-surname.txt are not in
their preferred format for modification. The raw US census data, and
@@ -39,11 +39,12 @@ Some of the other words-*.txt files were generated by scripts from
HTML dumps of Wikipedia and Wiktionary articles. If someone wanted to
modify these word lists, it would be appropriate to modify the
words-*.txt files directly, rather than those HTML dumps.
-Consequently, the HTML files and scripts have not been included in
-this source package. For the curious, the scripts, and instructions
-on how to run them, are available online:
+Consequently, the HTML files have not been included.
- https://github.com/dropbox/zxcvbn/tree/master/data-scripts
+In case they are useful to someone, a copy of the scripts has been
+included in debian/upstream/scripts. A separate directory is used to
+indicate that, unlike debian/missing-sources, these scripts are not
+included in order to satisfy the DFSG.
DFSG repacking
--------------
@@ -70,4 +71,4 @@ source, I have replaced words-passwd.txt with the old
words-10k-pass.txt, and restored references to words-10k-pass.txt in
the code.
- -- Sean Whitton <spwhitton@spwhitton.name>, Tue, 29 Nov 2016 16:07:52 -0700
+ -- Sean Whitton <spwhitton@spwhitton.name>, Tue, 29 Nov 2016 16:21:59 -0700
diff --git a/debian/changelog b/debian/changelog
index c8fde22..5c08ebb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -11,7 +11,7 @@ zxcvbn-c (2.0+dfsg-1) UNRELEASED; urgency=medium
* Update d/copyright for changes to words-*.txt files.
* Add a d/copyright stanza for the upstream makefile.
* Add Files-Excluded: field to d/copyright.
- * Add debian/missing-sources
+ * Add debian/missing-sources & debian/upstream/scripts
- Update d/copyright accordingly
- Add explanatory text to README.source
* Add explanatory text to README.source regarding DFSG repacking.
diff --git a/debian/copyright b/debian/copyright
index ea165cc..cf327a7 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -27,7 +27,7 @@ Files: debian/*
Copyright: (C) 2016 Sean Whitton <spwhitton@spwhitton.name>
License: BSD-3-clause
-Files: debian/missing-sources/*.py words-eng_wiki.txt
+Files: debian/missing-sources/*.py debian/upstream/scripts/*.py words-eng_wiki.txt
Copyright: (C) Copyright (c) 2012-2016 Dan Wheeler and Dropbox, Inc.
License: Expat
diff --git a/debian/upstream/scripts/count_wikipedia.py b/debian/upstream/scripts/count_wikipedia.py
new file mode 100644
index 0000000..aacf8d0
--- /dev/null
+++ b/debian/upstream/scripts/count_wikipedia.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+
+import sys
+import os
+import re
+import codecs
+import operator
+import datetime
+import nltk
+import warnings
+
+from unidecode import unidecode
+
+def usage():
+    print '''
+tokenize a directory of text and count unigrams.
+
+usage:
+%s input_dir ../data/english_wikipedia.txt
+
+input_dir is the root directory where sentence files live. Each file should contain
+one sentence per line, with punctuation. This script will walk the directory recursively,
+looking for text files. For each text file, it will tokenize each sentence into words and
+add them to a global unigram count, outputted to output.txt of the form:
+
+word count
+word count
+...
+
+in descending order of count.
+
+For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:
+http://www.cis.upenn.edu/~treebank/tokenizer.sed
+http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank
+
+For input sentences, this script allows for the format output by WikiExtractor.py
+https://github.com/attardi/wikiextractor
+
+That is,
+- lines starting with <doc... are ignored
+- lines starting with </doc> are ignored
+- blank lines are ignored
+
+To obtain wikipedia dumps, visit: https://dumps.wikimedia.org/enwiki
+And download the file ending in '-pages-articles.xml.bz2'. This includes wikipedia pages
+and articles but not previous revisions, edit history, and metadata.
+
+Then run:
+./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
+
+''' % sys.argv[0]
+
+SENTENCES_PER_BATCH = 500000 # after each batch, delete all counts with count == 1 (hapax legomena)
+PRE_SORT_CUTOFF = 300 # before sorting, discard all words with less than this count
+
+ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
+SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)
+
+class TopTokenCounter(object):
+    def __init__(self):
+        self.count = {}
+        self.legomena = set()
+        self.discarded = set()
+
+    def add_tokens(self, tokens, split_hyphens=True):
+        for token in tokens:
+            # add eg 'marxist-leninist' as two tokens instead of one
+            if split_hyphens and token.count('-') in [1, 2]:
+                for subtoken in token.split('-'):
+                    self.add_token(subtoken)
+            else:
+                self.add_token(token)
+
+    def add_token(self, token):
+        if not self.should_include(token):
+            self.discarded.add(token)
+            return
+        token = self.normalize(token)
+        if token in self.count:
+            self.legomena.discard(token)
+            self.count[token] += 1
+        else:
+            self.legomena.add(token)
+            self.count[token] = 1
+
+    def should_include(self, token):
+        if len(token) < 2:
+            return False
+        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
+            # B., '', (), ...
+            return False
+        if ALL_NON_ALPHA.match(token):
+            # 1,000, <<>>, ...
+            return False
+        if token.startswith('/'):
+            # eg //en.wikipedia.org/wiki, /doc
+            return False
+        if token.endswith('='):
+            # id=, title=, ...
+            return False
+        return True
+
+    def normalize(self, token):
+        return token.lower()
+
+    def batch_prune(self):
+        for token in self.legomena:
+            del self.count[token]
+        self.legomena = set()
+
+    def pre_sort_prune(self):
+        under_cutoff = set()
+        for token, count in self.count.iteritems():
+            if count < PRE_SORT_CUTOFF:
+                under_cutoff.add(token)
+        for token in under_cutoff:
+            del self.count[token]
+        self.legomena = set()
+
+    def get_sorted_pairs(self):
+        return sorted(self.count.items(), key=operator.itemgetter(1), reverse=True)
+
+    def get_ts(self):
+        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
+
+    def get_stats(self):
+        ts = self.get_ts()
+        return "%s keys(count): %d" % (ts, len(self.count))
+
+def main(input_dir_str, output_filename):
+    counter = TopTokenCounter()
+    print counter.get_ts(), 'starting...'
+    lines = 0
+    for root, dirs, files in os.walk(input_dir_str, topdown=True):
+        if not files:
+            continue
+        for fname in files:
+            path = os.path.join(root, fname)
+            for line in codecs.open(path, 'r', 'utf8'):
+                with warnings.catch_warnings():
+                    # unidecode() occasionally (rarely but enough to clog terminal output)
+                    # complains about surrogate characters in some wikipedia sentences.
+                    # ignore those warnings.
+                    warnings.simplefilter('ignore')
+                    line = unidecode(line)
+                tokens = nltk.word_tokenize(line)
+                counter.add_tokens(tokens)
+                lines += 1
+                if lines % SENTENCES_PER_BATCH == 0:
+                    counter.batch_prune()
+                    print counter.get_stats()
+            print 'processing: %s' % path
+    print counter.get_stats()
+    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
+    counter.pre_sort_prune()
+    print 'done'
+    print counter.get_stats()
+    print counter.get_ts(), 'sorting...'
+    sorted_pairs = counter.get_sorted_pairs()
+    print counter.get_ts(), 'done'
+    print 'writing...'
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for token, count in sorted_pairs:
+            f.write('%-18s %d\n' % (token, count))
+    sys.exit(0)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        usage()
+        sys.exit(0)
+    else:
+        main(*sys.argv[1:])
+
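The core of count_wikipedia.py above is the TopTokenCounter: a running unigram count that, after every batch of SENTENCES_PER_BATCH sentences, drops hapax legomena (tokens seen exactly once) so the count table stays bounded while walking a full Wikipedia dump. The following is a minimal, self-contained sketch of just that pruning idea; the simple regex tokenizer, the sample input and the count_unigrams name are placeholders for illustration, not part of the packaged script, which tokenizes with nltk and transliterates with unidecode.

    # Minimal sketch (not the packaged script): count unigrams and prune
    # hapax legomena after each batch, as count_wikipedia.py does.
    import re
    from collections import Counter

    WORD = re.compile(r"[A-Za-z][A-Za-z'-]+")   # simplified stand-in tokenizer
    BATCH = 500000                              # sentences per batch, as upstream

    def count_unigrams(lines):
        counts = Counter()
        hapax = set()
        for n, line in enumerate(lines, 1):
            for token in WORD.findall(line.lower()):
                if token in counts:
                    hapax.discard(token)
                else:
                    hapax.add(token)
                counts[token] += 1
            if n % BATCH == 0:
                for token in hapax:             # drop tokens seen only once
                    del counts[token]
                hapax = set()
        return counts.most_common()             # (token, count), descending

    if __name__ == '__main__':
        for token, count in count_unigrams(["The cat sat on the mat.",
                                            "The dog sat too."]):
            print('%-18s %d' % (token, count))

The batch prune is what keeps memory bounded across a whole dump; the PRE_SORT_CUTOFF pass in the script above is a final, stricter application of the same idea before sorting and writing.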
diff --git a/debian/upstream/scripts/count_wiktionary.py b/debian/upstream/scripts/count_wiktionary.py
new file mode 100644
index 0000000..b538d76
--- /dev/null
+++ b/debian/upstream/scripts/count_wiktionary.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+import os
+import sys
+import codecs
+import operator
+
+from unidecode import unidecode
+
+def usage():
+    return '''
+This script extracts words and counts from a 2006 wiktionary word frequency study over American
+television and movies. To use, first visit the study and download, as .html files, all 26 of the
+frequency lists:
+
+https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts
+
+Put those into a single directory and point this script at it:
+
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
+
+output.txt will include one line per word in the study, ordered by rank, of the form:
+
+word1 count1
+word2 count2
+...
+    ''' % sys.argv[0]
+
+def parse_wiki_tokens(html_doc_str):
+    '''fragile hax, but checks the result at the end'''
+    results = []
+    last3 = ['', '', '']
+    header = True
+    skipped = 0
+    for line in html_doc_str.split('\n'):
+        last3.pop(0)
+        last3.append(line.strip())
+        if all(s.startswith('<td>') and not s == '<td></td>' for s in last3):
+            if header:
+                header = False
+                continue
+            last3 = [s.replace('<td>', '').replace('</td>', '').strip() for s in last3]
+            rank, token, count = last3
+            rank = int(rank.split()[0])
+            token = token.replace('</a>', '')
+            token = token[token.index('>')+1:]
+            token = normalize(token)
+            # wiktionary has thousands of words that end in 's
+            # keep the common ones (rank under 1000), discard the rest
+            #
+            # otherwise end up with a bunch of duplicates eg victor / victor's
+            if token.endswith("'s") and rank > 1000:
+                skipped += 1
+                continue
+            count = int(count)
+            results.append((rank, token, count))
+    # early docs have 1k entries, later 2k, last 1284
+    assert len(results) + skipped in [1000, 2000, 1284]
+    return results
+
+def normalize(token):
+    return unidecode(token).lower()
+
+def main(wiktionary_html_root, output_filename):
+    rank_token_count = [] # list of 3-tuples
+    for filename in os.listdir(wiktionary_html_root):
+        path = os.path.join(wiktionary_html_root, filename)
+        with codecs.open(path, 'r', 'utf8') as f:
+            rank_token_count.extend(parse_wiki_tokens(f.read()))
+    rank_token_count.sort(key=operator.itemgetter(0))
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for rank, token, count in rank_token_count:
+            f.write('%-18s %d\n' % (token, count))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print usage()
+    else:
+        main(*sys.argv[1:])
+    sys.exit(0)
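The fragile part of count_wiktionary.py is parse_wiki_tokens(): it slides a three-line window over the saved HTML and treats three consecutive non-empty <td> lines as a (rank, word, count) row, then strips the link markup from the word cell. Below is a toy, self-contained sketch of that window trick; the sample rows and the parse_rows name are invented for illustration, and the header skipping, unidecode normalisation and low-rank "'s" filtering of the real script are left out.

    # Toy sketch of the three-line <td> window used by parse_wiki_tokens()
    # above; not the packaged script.
    import re

    TD = re.compile(r'^<td>(.+)</td>$')         # non-empty cell on its own line

    def parse_rows(html_lines):
        last3 = ['', '', '']
        rows = []
        for line in html_lines:
            last3.pop(0)
            last3.append(line.strip())
            cells = [TD.match(s) for s in last3]
            if all(cells):                      # rank, word, count cells in a row
                rank, word, count = (m.group(1) for m in cells)
                word = re.sub(r'<[^>]+>', '', word)   # drop the <a ...> wrapper
                rows.append((int(rank), word, int(count)))
        return rows

    if __name__ == '__main__':
        sample = ['<tr>', '<td>1</td>', '<td><a href="/wiki/you">you</a></td>',
                  '<td>1222421</td>', '</tr>',
                  '<tr>', '<td>2</td>', '<td><a href="/wiki/i">i</a></td>',
                  '<td>1052546</td>', '</tr>']
        print(parse_rows(sample))   # [(1, 'you', 1222421), (2, 'i', 1052546)]

Like the upstream script, this relies on the <tr>/</tr> separator lines between rows, which is why the final assert on the expected row counts matters: if the page layout changes, the parse fails loudly rather than silently misreading cells.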