From 68e192f281100e02c24ecefc25ab3b9a70a17714 Mon Sep 17 00:00:00 2001
From: Sean Whitton
Date: Tue, 29 Nov 2016 16:22:23 -0700
Subject: add debian/upstream/scripts

---
 debian/README.source                        |  15 +--
 debian/changelog                            |   2 +-
 debian/copyright                            |   2 +-
 debian/upstream/scripts/count_wikipedia.py  | 173 ++++++++++++++++++++++++++++
 debian/upstream/scripts/count_wiktionary.py |  80 +++++++++++++
 5 files changed, 263 insertions(+), 9 deletions(-)
 create mode 100644 debian/upstream/scripts/count_wikipedia.py
 create mode 100644 debian/upstream/scripts/count_wiktionary.py

diff --git a/debian/README.source b/debian/README.source
index 6710099..64ab7d5 100644
--- a/debian/README.source
+++ b/debian/README.source
@@ -27,8 +27,8 @@ for this Debian package.
 USE_DICT_FILE is unset. Calls to ZxcvbnInit() and ZxcvbnUninit() are
 not required, and README.md has been patched accordingly.
 
-debian/missing-sources
-----------------------
+debian/missing-sources & debian/upstream/scripts
+------------------------------------------------
 
 words-female.txt, words-male.txt and words-surname.txt are not in
 their preferred format for modification. The raw US census data, and
@@ -39,11 +39,12 @@ Some of the other words-*.txt files were generated by scripts
 from HTML dumps of Wikipedia and Wiktionary articles. If someone
 wanted to modify these word lists, it would be appropriate to modify
 the words-*.txt files directly, rather than those HTML dumps.
-Consequently, the HTML files and scripts have not been included in
-this source package. For the curious, the scripts, and instructions
-on how to run them, are available online:
+Consequently, the HTML files have not been included.
 
-    https://github.com/dropbox/zxcvbn/tree/master/data-scripts
+In case they are useful to someone, a copy of the scripts has been
+included in debian/upstream/scripts. A separate directory is used to
+indicate that these scripts are included as a convenience, not in
+order to satisfy the DFSG.
 
 DFSG repacking
 --------------
@@ -70,4 +71,4 @@ source, I have replaced words-passwd.txt with the old
 words-10k-pass.txt, and restored references to words-10k-pass.txt in
 the code.
 
- -- Sean Whitton , Tue, 29 Nov 2016 16:07:52 -0700
+ -- Sean Whitton , Tue, 29 Nov 2016 16:21:59 -0700
diff --git a/debian/changelog b/debian/changelog
index c8fde22..5c08ebb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -11,7 +11,7 @@ zxcvbn-c (2.0+dfsg-1) UNRELEASED; urgency=medium
   * Update d/copyright for changes to words-*.txt files.
   * Add a d/copyright stanza for the upstream makefile.
   * Add Files-Excluded: field to d/copyright.
-  * Add debian/missing-sources
+  * Add debian/missing-sources & debian/upstream/scripts
     - Update d/copyright accordingly
     - Add explanatory text to README.source
   * Add explanatory text to README.source regarding DFSG repacking.
diff --git a/debian/copyright b/debian/copyright
index ea165cc..cf327a7 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -27,7 +27,7 @@
 Files: debian/*
 Copyright: (C) 2016 Sean Whitton
 License: BSD-3-clause
 
-Files: debian/missing-sources/*.py words-eng_wiki.txt
+Files: debian/missing-sources/*.py debian/upstream/scripts/*.py words-eng_wiki.txt
 Copyright: (C) Copyright (c) 2012-2016 Dan Wheeler and Dropbox, Inc.
 License: Expat
diff --git a/debian/upstream/scripts/count_wikipedia.py b/debian/upstream/scripts/count_wikipedia.py
new file mode 100644
index 0000000..aacf8d0
--- /dev/null
+++ b/debian/upstream/scripts/count_wikipedia.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+
+import sys
+import os
+import re
+import codecs
+import operator
+import datetime
+import nltk
+import warnings
+
+from unidecode import unidecode
+
+def usage():
+    print '''
+tokenize a directory of text and count unigrams.
+
+usage:
+%s input_dir ../data/english_wikipedia.txt
+
+input_dir is the root directory where sentence files live. Each file should contain
+one sentence per line, with punctuation. This script will walk the directory recursively,
+looking for text files. For each text file, it will tokenize each sentence into words and
+add them to a global unigram count, written to the output file in the form:
+
+word count
+word count
+...
+
+in descending order of count.
+
+For speed, tokenization is done w/ Penn Treebank regexes via nltk's port:
+http://www.cis.upenn.edu/~treebank/tokenizer.sed
+http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank
+
+For input sentences, this script allows for the format output by WikiExtractor.py
+https://github.com/attardi/wikiextractor
+
+That is,
+- lines starting with <doc ...> or </doc> are ignored
+- blank lines are ignored
+
+To obtain wikipedia dumps, visit: https://dumps.wikimedia.org/enwiki
+and download the file ending in '-pages-articles.xml.bz2'. This includes wikipedia pages
+and articles but not previous revisions, edit history, and metadata.
+
+Then run:
+./WikiExtractor.py -o en_sents --no-templates enwiki-20151002-pages-articles.xml.bz2
+
+''' % sys.argv[0]
+
+SENTENCES_PER_BATCH = 500000 # after each batch, delete all counts with count == 1 (hapax legomena)
+PRE_SORT_CUTOFF = 300        # before sorting, discard all words with less than this count
+
+ALL_NON_ALPHA = re.compile(r'^[\W\d]*$', re.UNICODE)
+SOME_NON_ALPHA = re.compile(r'[\W\d]', re.UNICODE)
+
+class TopTokenCounter(object):
+    def __init__(self):
+        self.count = {}
+        self.legomena = set()
+        self.discarded = set()
+
+    def add_tokens(self, tokens, split_hyphens=True):
+        for token in tokens:
+            # add eg 'marxist-leninist' as two tokens instead of one
+            if split_hyphens and token.count('-') in [1, 2]:
+                for subtoken in token.split('-'):
+                    self.add_token(subtoken)
+            else:
+                self.add_token(token)
+
+    def add_token(self, token):
+        if not self.should_include(token):
+            self.discarded.add(token)
+            return
+        token = self.normalize(token)
+        if token in self.count:
+            self.legomena.discard(token)
+            self.count[token] += 1
+        else:
+            self.legomena.add(token)
+            self.count[token] = 1
+
+    def should_include(self, token):
+        if len(token) < 2:
+            return False
+        if len(token) <= 2 and SOME_NON_ALPHA.search(token):
+            # B., '', (), ...
+            return False
+        if ALL_NON_ALPHA.match(token):
+            # 1,000, <<>>, ...
+            return False
+        if token.startswith('/'):
+            # eg //en.wikipedia.org/wiki, /doc
+            return False
+        if token.endswith('='):
+            # id=, title=, ...
+            return False
+        return True
+
+    def normalize(self, token):
+        return token.lower()
+
+    def batch_prune(self):
+        for token in self.legomena:
+            del self.count[token]
+        self.legomena = set()
+
+    def pre_sort_prune(self):
+        under_cutoff = set()
+        for token, count in self.count.iteritems():
+            if count < PRE_SORT_CUTOFF:
+                under_cutoff.add(token)
+        for token in under_cutoff:
+            del self.count[token]
+        self.legomena = set()
+
+    def get_sorted_pairs(self):
+        return sorted(self.count.items(), key=operator.itemgetter(1), reverse=True)
+
+    def get_ts(self):
+        return datetime.datetime.now().strftime("%b %d %Y %H:%M:%S")
+
+    def get_stats(self):
+        ts = self.get_ts()
+        return "%s keys(count): %d" % (ts, len(self.count))
+
+def main(input_dir_str, output_filename):
+    counter = TopTokenCounter()
+    print counter.get_ts(), 'starting...'
+    lines = 0
+    for root, dirs, files in os.walk(input_dir_str, topdown=True):
+        if not files:
+            continue
+        for fname in files:
+            path = os.path.join(root, fname)
+            for line in codecs.open(path, 'r', 'utf8'):
+                with warnings.catch_warnings():
+                    # unidecode() occasionally (rarely but enough to clog terminal output)
+                    # complains about surrogate characters in some wikipedia sentences.
+                    # ignore those warnings.
+                    warnings.simplefilter('ignore')
+                    line = unidecode(line)
+                tokens = nltk.word_tokenize(line)
+                counter.add_tokens(tokens)
+                lines += 1
+                if lines % SENTENCES_PER_BATCH == 0:
+                    counter.batch_prune()
+                    print counter.get_stats()
+            print 'processing: %s' % path
+    print counter.get_stats()
+    print 'deleting tokens under cutoff of', PRE_SORT_CUTOFF
+    counter.pre_sort_prune()
+    print 'done'
+    print counter.get_stats()
+    print counter.get_ts(), 'sorting...'
+    sorted_pairs = counter.get_sorted_pairs()
+    print counter.get_ts(), 'done'
+    print 'writing...'
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for token, count in sorted_pairs:
+            f.write('%-18s %d\n' % (token, count))
+    sys.exit(0)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        usage()
+        sys.exit(0)
+    else:
+        main(*sys.argv[1:])
+
diff --git a/debian/upstream/scripts/count_wiktionary.py b/debian/upstream/scripts/count_wiktionary.py
new file mode 100644
index 0000000..b538d76
--- /dev/null
+++ b/debian/upstream/scripts/count_wiktionary.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+import os
+import sys
+import codecs
+import operator
+
+from unidecode import unidecode
+
+def usage():
+    return '''
+This script extracts words and counts from a 2006 wiktionary word frequency study over American
+television and movies. To use, first visit the study and download, as .html files, all 26 of the
+frequency lists:
+
+https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts
+
+Put those into a single directory and point this script at it:
+
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
+
+The output file will include one line per word in the study, ordered by rank, of the form:
+
+word1 count1
+word2 count2
+...
+    ''' % sys.argv[0]
+
+def parse_wiki_tokens(html_doc_str):
+    '''fragile hax, but checks the result at the end'''
+    results = []
+    last3 = ['', '', '']
+    header = True
+    skipped = 0
+    for line in html_doc_str.split('\n'):
+        last3.pop(0)
+        last3.append(line.strip())
+        if all(s.startswith('<td>') and not s == '<td></td>' for s in last3):
+            if header:
+                header = False
+                continue
+            last3 = [s.replace('<td>', '').replace('</td>', '').strip() for s in last3]
+            rank, token, count = last3
+            rank = int(rank.split()[0])
+            token = token.replace('</a>', '')
+            token = token[token.index('>')+1:]
+            token = normalize(token)
+            # wiktionary has thousands of words that end in 's
+            # keep the common ones (rank under 1000), discard the rest
+            #
+            # otherwise end up with a bunch of duplicates eg victor / victor's
+            if token.endswith("'s") and rank > 1000:
+                skipped += 1
+                continue
+            count = int(count)
+            results.append((rank, token, count))
+    # early docs have 1k entries, later 2k, last 1284
+    assert len(results) + skipped in [1000, 2000, 1284]
+    return results
+
+def normalize(token):
+    return unidecode(token).lower()
+
+def main(wiktionary_html_root, output_filename):
+    rank_token_count = [] # list of 3-tuples
+    for filename in os.listdir(wiktionary_html_root):
+        path = os.path.join(wiktionary_html_root, filename)
+        with codecs.open(path, 'r', 'utf8') as f:
+            rank_token_count.extend(parse_wiki_tokens(f.read()))
+    rank_token_count.sort(key=operator.itemgetter(0))
+    with codecs.open(output_filename, 'w', 'utf8') as f:
+        for rank, token, count in rank_token_count:
+            f.write('%-18s %d\n' % (token, count))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print usage()
+    else:
+        main(*sys.argv[1:])
+    sys.exit(0)
-- 
cgit v1.2.3
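
Both scripts emit their word lists in the plain "word count" format described in
their usage text, one pair per line. For anyone regenerating a list with them, the
following is a minimal sketch of a format check, written in Python 2 to match the
scripts above. It is illustrative only and not part of the patch; the word-list
filename passed on the command line is whatever file you generated.

#!/usr/bin/python
# Illustrative sketch only, not part of the patch: sanity-check that a
# file written by count_wikipedia.py or count_wiktionary.py contains
# one "word count" pair per line, as their usage text describes.

import sys
import codecs

def check_counts_file(path):
    entries = 0
    with codecs.open(path, 'r', 'utf8') as f:
        for lineno, line in enumerate(f, 1):
            fields = line.split()
            # each line is a token followed by its integer count
            if len(fields) != 2 or not fields[1].isdigit():
                raise ValueError('%s:%d: expected "word count", got %r'
                                 % (path, lineno, line.rstrip('\n')))
            entries += 1
    return entries

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print 'usage: %s wordlist.txt' % sys.argv[0]
        sys.exit(1)
    print '%s: %d entries' % (sys.argv[1], check_counts_file(sys.argv[1]))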