aboutsummaryrefslogtreecommitdiffhomepage
path: root/dict-generate.cpp
diff options
context:
space:
mode:
authortony <tsyrogit@users.noreply.github.com>2015-01-03 00:16:04 +0000
committertony <tsyrogit@users.noreply.github.com>2015-01-03 00:16:04 +0000
commit38a981df53d53cc96ef47e15d62b7c9f0151315f (patch)
tree7059c14f2e43d7c5d3cbb48ce5ec4004c6a752ca /dict-generate.cpp
parent572ce633e134e53bd6a72220613feb82a5ef0a16 (diff)
downloadzxcvbn-c-38a981df53d53cc96ef47e15d62b7c9f0151315f.tar.gz
Initial commit.
Diffstat (limited to 'dict-generate.cpp')
-rw-r--r--dict-generate.cpp1613
1 files changed, 1613 insertions, 0 deletions
diff --git a/dict-generate.cpp b/dict-generate.cpp
new file mode 100644
index 0000000..cd2fb05
--- /dev/null
+++ b/dict-generate.cpp
@@ -0,0 +1,1613 @@
+/**********************************************************************************
+ * Program to generate the dictionary for the C implementation of the zxcvbn password estimator.
+ * Copyright (c) 2015, Tony Evans
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are
+ * permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this list
+ * of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice, this
+ * list of conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ **********************************************************************************/
+
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <list>
+#include <set>
+#include <vector>
+#include <map>
+#include <memory>
+#include <limits>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+using namespace std;
+
+class Node;
+typedef std::shared_ptr<Node> NodeSPtr;
+typedef std::weak_ptr<Node> NodeWPtr;
+typedef std::map<char, NodeSPtr> NodeMap_t;
+
+typedef unsigned int Check_t;
+
+/**********************************************************************************
+ * Class to perform CRC checksum calculation.
+ */
+class TrieCheck
+{
+public:
+ typedef uint64_t Check_t;
+ static const Check_t CHK_INIT = 0xffffffffffffffff;
+ TrieCheck() { Init(); }
+ void Init() { mCrc = CHK_INIT; }
+ operator Check_t() const { return Result(); }
+ Check_t Result() const { return mCrc; }
+ bool operator ! () const { return mCrc == CHK_INIT; }
+ void operator () (const void *, unsigned int);
+protected:
+ Check_t mCrc;
+};
+
+/**********************************************************************************
+ * Class to hold a node within the trie
+ */
+class Node
+{
+public:
+ Node();
+ Node(const Node &);
+ ~Node();
+ Node & operator = (const Node &);
+ //bool operator == (const Node & r) const { return !IsEqual(r); }
+ //bool operator != (const Node & r) const { return !IsEqual(r); }
+ void SetEnd() { mEnd = true; }
+ bool IsEnd() const { return mEnd; }
+ int Height() const { return mHeight; }
+
+ // Scan the trie and count nodes
+ int NodeCount() { ClearCounted() ; return CountNodes(); }
+
+
+ int CalcAddress() { int a=0; ClearCounted(); a=CalcAddr(a, true); return CalcAddr(a, false); }
+ Node *GetParent() { return mParent; }
+ unsigned int GetAddr() const { return mAddr; }
+ NodeMap_t::iterator ChildBegin() { return mChild.begin(); }
+ NodeMap_t::iterator ChildEnd() { return mChild.end(); }
+ int GetNumEnds() const { return mEndings; }
+ NodeSPtr FindChild(char);
+ std::string GetChildChars();
+
+ TrieCheck::Check_t CalcCheck();
+ int CalcEndings();
+ int CalcHeight();
+ NodeSPtr AddChild(char);
+ void ChangeChild(NodeSPtr &, NodeSPtr &);
+// bool IsEqual(const Node &) const;
+ void ClearCounted();
+ void SetCounted() { mCounted = true; }
+ bool IsCounted() const { return mCounted; }
+protected:
+ int CountNodes();
+ int CalcAddr(int, bool);
+
+ NodeMap_t mChild;
+ Node *mParent;
+ int mEndings;
+ int mHeight;
+ unsigned int mAddr;
+ TrieCheck mCheck;
+ bool mEnd;
+ bool mCounted;
+};
+
+/**********************************************************************************
+ * Static table used for the crc implementation.
+ */
+static const TrieCheck::Check_t CrcTable[16] =
+{
+ 0x0000000000000000, 0x7d08ff3b88be6f81, 0xfa11fe77117cdf02, 0x8719014c99c2b083,
+ 0xdf7adabd7a6e2d6f, 0xa2722586f2d042ee, 0x256b24ca6b12f26d, 0x5863dbf1e3ac9dec,
+ 0x95ac9329ac4bc9b5, 0xe8a46c1224f5a634, 0x6fbd6d5ebd3716b7, 0x12b5926535897936,
+ 0x4ad64994d625e4da, 0x37deb6af5e9b8b5b, 0xb0c7b7e3c7593bd8, 0xcdcf48d84fe75459
+};
+
+// Update the crc value with new data.
+void TrieCheck::operator () (const void *v, unsigned int Len)
+{
+ Check_t Crc = mCrc;
+ const unsigned char *Data = reinterpret_cast<const unsigned char *>(v);
+ while(Len--)
+ {
+ Crc = CrcTable[(Crc ^ (*Data >> 0)) & 0x0f] ^ (Crc >> 4);
+ Crc = CrcTable[(Crc ^ (*Data >> 4)) & 0x0f] ^ (Crc >> 4);
+ ++Data;
+ }
+ mCrc = Crc;
+}
+
+Node::Node()
+{
+ mEndings = -1;
+ mHeight = -1;
+ mEnd = false;
+ mParent = 0;
+}
+
+Node::Node(const Node &r)
+{
+ *this = r;
+}
+
+Node::~Node()
+{
+
+}
+
+Node &Node::operator = (const Node & r)
+{
+ mChild = r.mChild;
+ mParent = r.mParent;
+ mEndings = r.mEndings;
+ mHeight = r.mHeight;
+ mCheck = r.mCheck;
+ mEnd = r.mEnd;
+ return *this;
+}
+
+
+/**********************************************************************************
+ * Generate a checksum for the current node. Value also depends of the
+ * checksum of any child nodes
+ */
+TrieCheck::Check_t Node::CalcCheck()
+{
+ if (!mCheck)
+ {
+ // Not done this node before
+ char c;
+ NodeMap_t::iterator It;
+ mCheck.Init();
+ // Include number of children
+ c = mChild.size();
+ mCheck(&c, sizeof c);
+ // For each child include its character and node checksum
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ {
+ Check_t n = It->second->CalcCheck();
+ c = It->first;
+ mCheck(&c, sizeof c);
+ mCheck(&n, sizeof n);
+ }
+ // Finally include whether this node is an ending in the chaecksum
+ c = mEnd;
+ mCheck(&c, sizeof c);
+ }
+ return mCheck;
+}
+
+/**********************************************************************************
+ * Get number of nodes for this which end/finish a word
+ */
+int Node::CalcEndings()
+{
+ if (mEndings < 0)
+ {
+ // Not already done this node,so calculate the ends
+ int n = 0;
+ NodeMap_t::iterator It;
+ // Number of endings is sum of the endings of the child nodes and plus this node if it ends a word
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ n += It->second->CalcEndings();
+ n += !!mEnd;
+ mEndings = n;
+ }
+ return mEndings;
+
+}
+
+/**********************************************************************************
+ * Calculate the height of the trie starting at current node
+ */
+int Node::CalcHeight()
+{
+ if (mHeight < 0)
+ {
+ // Not already done this node,so calculate the height
+ int Hi = 0;
+ NodeMap_t::iterator It;
+ // Get height of all child nodes, remember the highest
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ {
+ int i = It->second->CalcHeight();
+ if (i >= Hi)
+ Hi = i+1;
+ }
+ mHeight = Hi;
+ }
+ return mHeight;
+}
+
+/**********************************************************************************
+ * Clear indication that node has been counted
+ */
+void Node::ClearCounted()
+{
+ NodeMap_t::iterator It;
+ mCounted = false;
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ It->second->ClearCounted();
+}
+
+/**********************************************************************************
+ * Count this plus the number of child nodes. As part of the tree node count
+ * scan, make sure not to double count nodes
+ */
+int Node::CountNodes()
+{
+ // Count is 0 if already done
+ if (mCounted)
+ return 0;
+ mCounted = true;
+ NodeMap_t::iterator It;
+ int i = 1; // 1 for this node
+
+ // Add the child nodes
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ i += It->second->CountNodes();
+ return i;
+}
+
+/**********************************************************************************
+ * Calculate the final node address
+ */
+int Node::CalcAddr(int Start, bool ManyEnds)
+{
+ NodeMap_t::iterator It;
+
+ if (!(mCounted || (ManyEnds && (mEndings < 256))))
+ {
+ mCounted = true;
+ mAddr = Start++;
+ }
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ Start = It->second->CalcAddr(Start, ManyEnds);
+
+ return Start;
+}
+
+/**********************************************************************************
+ * Add the given character to the current node, return the next lower node
+ */
+NodeSPtr Node::AddChild(char c)
+{
+ NodeMap_t::iterator It;
+ // Find character in map of child nodes
+ It = mChild.find(c);
+ if (It == mChild.end())
+ {
+ // New character, create new child node
+ NodeSPtr a(new Node);
+ a->mParent = this;
+ std::pair<char, NodeSPtr> x(c, a);
+ std::pair<NodeMap_t::iterator, bool> y = mChild.insert(x);
+ It = y.first;
+ }
+ return It->second;
+}
+
+/**********************************************************************************
+ * Find the child node which corresponds to the given character.
+ */
+NodeSPtr Node::FindChild(char Ch)
+{
+ NodeMap_t::iterator It;
+ It = mChild.find(Ch);
+ if (It == mChild.end())
+ return NodeSPtr();
+ return It->second;
+}
+
+/**********************************************************************************
+ * Replace the current child node (old param) with a new child (Replace param),
+ * and update the new child parent.
+ */
+void Node::ChangeChild(NodeSPtr & Replace, NodeSPtr & Old)
+{
+ NodeMap_t::iterator It;
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ {
+ NodeSPtr p = It->second;
+ if (p == Old)
+ {
+ It->second = Replace;
+ Replace->mParent = this;
+ break;
+ }
+ }
+}
+
+/**********************************************************************************
+ * Find all the characters corresponding to the children of this node.
+ */
+std::string Node::GetChildChars()
+{
+ NodeMap_t::iterator It;
+ std::string Result;
+ for(It = mChild.begin(); It != mChild.end(); ++It)
+ {
+ char c = It->first;
+ Result += c;
+ }
+ return Result;
+}
+
+
+/**********************************************************************************
+ * struct to hold data read from input file (except for the word string)
+ */
+struct Entry
+{
+ Entry() : mRank(0), mDict(0), mOrder(0), mOccurs(0) {}
+ int mRank;
+ int mDict;
+ int mOrder;
+ int mOccurs;
+};
+
+/**********************************************************************************
+ * Struct to hold a string and an int. Also provide the compare operators for std::set class
+ */
+struct StringInt
+{
+ string s;
+ unsigned int i;
+ StringInt() { i=0; }
+ StringInt(const StringInt & r) : s(r.s), i(r.i) {}
+ StringInt & operator = (const StringInt & r) { i = r.i; s = r.s; return *this; }
+ bool operator < (const StringInt & r) const { return s < r.s; }
+ bool operator > (const StringInt & r) const { return s > r.s; }
+ bool operator == (const StringInt & r) const { return s == r.s; }
+ StringInt * Self() const { return const_cast<StringInt *>(this); }
+};
+
+typedef std::map<std::string, Entry> EntryMap_t;
+typedef std::list<NodeSPtr> NodeList_t;
+typedef set<StringInt> StringIntSet_t;
+typedef basic_string<int> StringOfInts;
+typedef vector<unsigned int> UintVect;
+typedef vector<StringInt *> StrIntPtrVect_t;
+typedef vector<StringInt> StringIntVect_t;
+
+// Variables holding 'interesting' information on the data
+unsigned int MaxLength, MinLength, NumChars, NumInWords, NumDuplicate;
+int MaxOccurReduce;
+string MaxOccurStr;
+struct FileInfo
+{
+ FileInfo() : Words(0), BruteIgnore(0), Accented(0), Dups(0), Used(0) { }
+ string Name;
+ int Words;
+ int BruteIgnore;
+ int Accented;
+ int Dups;
+ int Used;
+};
+
+/**********************************************************************************
+ * Read the file of words and add them to Entries.
+ */
+static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int DictNum, FileInfo &Info)
+{
+ ifstream f(FileName.c_str());
+ if (!f.is_open())
+ {
+ cerr << "Error opening " << FileName << endl;
+ return false;
+ }
+ Info.Name = FileName;
+
+ // Rank is the position of the work in the dictionary file. Rank==1 is lowest for a word (and
+ // indicates a very popular or bad password).
+ int Rank = 0;
+ string Line;
+ while(getline(f, Line))
+ {
+ // Truncate at first space or tab to leave just the word in case additional info on line
+ string::size_type y = Line.find_first_of("\t ");
+ if (y != string::npos)
+ Line.erase(y);
+
+ y = Line.length();
+ if (!y)
+ continue;
+
+ ++Info.Words;
+ // Only use words where all chars are ascii (no accents etc.)
+ string::size_type x;
+ double BruteForce = 1.0;
+ for(x = 0; x < y; ++x)
+ {
+ unsigned char c = Line[x];
+ if (c >= 128)
+ break;
+ c = tolower(c);
+ Line[x] = c;
+ BruteForce *= 26.0;
+ }
+ if (x < y)
+ {
+ ++Info.Accented;
+ continue;
+ }
+
+ // Don't use words where the brute force strength is less than the word's rank
+ if (BruteForce < (Rank+1))
+ {
+ ++Info.BruteIgnore;
+ continue;
+ }
+ // Remember some interesting info
+ if (y > MaxLength)
+ MaxLength = y;
+ if (y < MinLength)
+ MinLength = y;
+ NumChars += y;
+ ++NumInWords;
+ ++Rank;
+
+ EntryMap_t::iterator It = Entries.find(Line);
+ if (It != Entries.end())
+ {
+ // This is a repeat of a previous entry
+ int r = It->second.mRank;
+ if (r > Rank)
+ {
+ // Remember new lower rank
+ It->second.mRank = Rank;
+ It->second.mDict = DictNum;
+ It->second.mOccurs += 1;
+ r -= Rank;
+ if (r > MaxOccurReduce)
+ {
+ MaxOccurStr = Line;
+ MaxOccurReduce = r;
+ }
+ }
+ else
+ ++Info.Dups;
+ ++NumDuplicate;
+ }
+ else
+ {
+ // New word
+ Entry e;
+ e.mDict = DictNum;
+ e.mRank = Rank;
+ Entries.insert(std::pair<std::string, Entry>(Line, e));
+ ++Info.Used;
+ }
+ }
+ f.close();
+ return true;
+}
+
+/**********************************************************************************
+ * Use all words previously read from file(s) and add them to a Trie, which starts
+ * at Root. Also update a bool array indicating the chars used in the words.
+ */
+static void ProcessEntries(NodeSPtr Root, EntryMap_t & Entries, bool *InputCharSet)
+{
+ EntryMap_t::iterator It;
+ std::string Text;
+ for(It = Entries.begin(); It != Entries.end(); ++It)
+ {
+ Text = It->first;
+ // Add latest word to tree
+ string::size_type x;
+ NodeSPtr pNode = Root;
+ for(x = 0; x < Text.length(); ++x)
+ {
+ char c = Text[x];
+ pNode = pNode->AddChild(c);
+ // Add char to set of used character codes
+ InputCharSet[c & 0xFF] = true;
+ }
+ pNode->SetEnd();
+ }
+}
+
+/**********************************************************************************
+ * Add the passed node to the list if it has same height as value in Hi (= number
+ * of steps to get to a terminal node). If current node has height greater than Hi,
+ * recursivly call with each child node as one of these may be at the required height.
+ */
+static void AddToListAtHeight(NodeList_t & Lst, NodeSPtr Node, int Hi)
+{
+ if (Hi == Node->Height())
+ {
+ Lst.push_back(Node);
+ return;
+ }
+ if (Hi < Node->Height())
+ {
+ NodeMap_t::iterator It;
+ for(It = Node->ChildBegin(); It != Node->ChildEnd(); ++It)
+ {
+ AddToListAtHeight(Lst, It->second, Hi);
+ }
+ }
+}
+
+/**********************************************************************************
+ * Scan the trie and update the original word list with the alphabetical order
+ * (or 'index location') of the words
+ */
+static void ScanTrieForOrder(EntryMap_t & Entries, int & Ord, NodeSPtr Root, const string & Str)
+{
+ if (Root->IsEnd())
+ {
+ // Root is a word ending node, so store its index in the input word store
+ EntryMap_t::iterator Ite;
+ Ite = Entries.find(Str);
+ if (Ite == Entries.end())
+ throw "Trie string not in entries";
+
+ Ite->second.mOrder = ++Ord;
+ }
+ NodeMap_t::iterator It;
+ string Tmp;
+ // For each child, append its character to the current word string and do a recursive
+ // call to update their word indexes.
+ for(It = Root->ChildBegin(); It != Root->ChildEnd(); ++It)
+ {
+ Tmp = Str + It->first;
+ ScanTrieForOrder(Entries, Ord, It->second, Tmp);
+ }
+}
+
+/**********************************************************************************
+ * Reduce the trie by merging tails where possible. Starting at greatest height,
+ * get a list of all nodes with given height, then test for identical nodes. If
+ * found, change the parent of the second identical node to use the first node,
+ * and delete second node and its children. Reduce height by one and repeat
+ * until height is zero.
+ */
+static void ReduceTrie(NodeSPtr Root)
+{
+ int Height;
+ int cnt=0, del=0;
+ Root->CalcCheck();
+
+ NodeSPtr pNode = Root;
+ for(Height = Root->CalcHeight(); Height >= 0; --Height)
+ {
+ // Get a list of all nodes at given height
+ int x=0;
+ NodeList_t Lst;
+ AddToListAtHeight(Lst, Root, Height);
+ cnt += Lst.size();
+
+ NodeList_t::iterator Ita, Itb;
+ for(Ita = Lst.begin(); Ita != Lst.end(); ++Ita)
+ {
+ // Going to use a CRC to decide if two nodes are identical
+ TrieCheck::Check_t Chka = (*Ita)->CalcCheck();
+ Itb = Ita;
+ for(++Itb; Itb != Lst.end(); )
+ {
+ if (Chka == (*Itb)->CalcCheck())
+ {
+ // Found two identical nodes (with identical children)
+ Node * Parentb = (*Itb)->GetParent();
+ if (Parentb)
+ {
+ // Change the 2nd parent to use the current node as child
+ // Remove the 2nd node from the scanning list to as it will
+ // get deleted by the sharing (as using std::shared_ptr)
+ Parentb->ChangeChild(*Ita, *Itb);
+ ++x;++del;
+ Itb = Lst.erase(Itb);
+ }
+ else
+ {
+ cout << " orphan ";
+ ++Itb;
+ }
+ }
+ else
+ {
+ ++Itb;
+ }
+ }
+ }
+ }
+}
+
+/**********************************************************************************
+ * Scan the trie to match with the supplied word. Return the order of the
+ * word, or -1 if it is not in the trie.
+ */
+static int CheckWord(NodeSPtr Root, const string & Str)
+{
+ int i = 1;
+ bool e = false;
+ string::size_type x;
+ NodeSPtr p = Root;
+
+ for(x = 0; x < Str.length(); ++x)
+ {
+ int j;
+ NodeMap_t::iterator It;
+ // Scan children to find one that matches current character
+ char c = Str[x];
+ for(It = p->ChildBegin(); It != p->ChildEnd(); ++It)
+ {
+ if (It->first == c)
+ break;
+ // Add the number of endings at or below child to track the alphabetical index
+ j = It->second->CalcEndings();
+ i += j;
+ }
+ // Fail if no child matches the character
+ if (It == p->ChildEnd())
+ return -1;
+ // Allow for this node being a word ending
+ e = p->IsEnd();
+ if (e)
+ ++i;
+
+ p = It->second;
+ }
+
+ if (p && p->IsEnd())
+ {
+ if (x == Str.length())
+ return i;
+ }
+ return -1;
+}
+
+/**********************************************************************************
+ * Try to find every input word in the reduced trie. The order should also
+ * match, otherwise the reduction has corrupted the trie.
+ */
+static int CheckReduction(StringIntVect_t & Ranks, NodeSPtr Root, EntryMap_t & Entries)
+{
+ int i = 0;
+ int n = 0;
+ EntryMap_t::iterator It;
+ std::string Text;
+ int b;
+ Ranks.resize(Entries.size()+1);
+ for(It = Entries.begin(); (It != Entries.end()) && (i <= 200000); ++It)
+ {
+ Text = It->first;
+ b = CheckWord(Root, Text);
+ if (b < 0)
+ {
+ ++i;
+ cout << It->second.mOrder << ": Missing " << Text.c_str() << endl;
+ }
+ else if (It->second.mOrder != b)
+ {
+ ++i;
+ cout << It->second.mOrder << ": Bad order " << b << " for " << Text.c_str() << endl;
+ }
+ else
+ {
+ //if (Text == "fred")
+ // cout << Text.c_str() << "-> " << It->second.mOrder << ", " << It->second.mRank << endl;
+ ++n;
+ }
+ if (b >= int(Ranks.size()))
+ throw " Using Ranks beyond end";
+ if (b >= 0)
+ {
+ char Tmp[20];
+ Ranks[b].i = It->second.mRank;
+ sprintf(Tmp, "%d: ", n);
+ Ranks[b].s = string(Tmp) + Text;
+ }
+ // Try to find a non-existant word
+ Text.insert(0, "a");
+ Text += '#';
+ b = CheckWord(Root, Text);
+ if (b > 0)
+ throw string("Found non-existant word ") + Text;
+ }
+ if (i > 0)
+ throw "Missing words in reduction check = " + to_string(i);
+ return n;
+}
+
+struct ChkNum
+{
+ int Match;
+ int Err;
+ ChkNum() : Match(0), Err(0) {}
+ ChkNum(const ChkNum &r) : Match(r.Match), Err(r.Err) {}
+ ChkNum & operator = (const ChkNum & r) { Match = r.Match; Err = r.Err; return *this; }
+ ChkNum & operator += (const ChkNum & r) { Match += r.Match; Err += r.Err; return *this; }
+};
+
+/**********************************************************************************
+ * Find all possible words in the trie and make sure they are input words.
+ * Return number of words found. Done as a second trie check.
+ */
+static ChkNum CheckEntries(NodeSPtr Root, string Str, const EntryMap_t & Entries)
+{
+ ChkNum Ret;
+ if (Root->IsEnd())
+ {
+ // This is an end node, find the word in the input words
+ EntryMap_t::const_iterator It = Entries.find(Str);
+ if (It != Entries.end())
+ ++Ret.Match;
+ else
+ ++Ret.Err;
+ }
+ // Add each child character to the passed string and recursively check
+ NodeMap_t::iterator It;
+ for(It = Root->ChildBegin(); It != Root->ChildEnd(); ++It)
+ {
+ string Tmp = Str;
+ Tmp += It->first;
+ Ret += CheckEntries(It->second, Tmp, Entries);
+ }
+ return Ret;
+}
+
+/**********************************************************************************
+ * Convert the passed bool array of used chars into a character string
+ */
+string MakeCharSet(bool *InputCharSet)
+{
+ int i;
+ string s;
+ for(i = 1; i < 256; ++i)
+ {
+ if (InputCharSet[i])
+ s += char(i);
+ }
+ return s;
+}
+
+/**********************************************************************************
+ * Create a set of strings which contain the possible characters matched at
+ * a node when checking a word.
+ */
+void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
+{
+ // Skip if already done
+ if (Root->IsCounted())
+ return;
+
+ string::size_type x;
+ StringInt In;
+ NodeSPtr p = Root;
+ In.s = Root->GetChildChars();
+ if (StrSet.find(In) == StrSet.end())
+ {
+ // Not already in set of possible child chars for a node, so add it
+ In.i = Loc++; // Address in the final output array
+ StrSet.insert(In);
+ }
+ // Recursively do the child nodes
+ for(x = 0; x < In.s.length(); ++x)
+ {
+ char c = In.s[x];
+ NodeSPtr q = p->FindChild(c);
+ if (q)
+ MakeChildBitMap(StrSet, q, Loc);
+ }
+ Root->SetCounted();
+}
+
+/**********************************************************************************
+ * Create the arrays of data that will be output
+ */
+void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds)
+{
+ NodeMap_t::iterator Itc;
+ StringInt Tmp;
+ StringOfInts Chld;
+
+ // Find children in the child pattern array
+ Tmp.s= Root->GetChildChars();
+ StringIntSet_t::iterator Its = StrSet.find(Tmp);
+
+ // Make a 'string' of pointers to the children
+ for(Itc = Root->ChildBegin(); Itc != Root->ChildEnd(); ++Itc)
+ {
+ int i = Itc->second->GetAddr();
+ Chld += i;
+ }
+ // Find where in pointer array the child pointer string is
+ StringOfInts::size_type x = ChildAddrs.find(Chld);
+ if (x == StringOfInts::npos)
+ {
+ // Not found, add it
+ x = ChildAddrs.length();
+ ChildAddrs += Chld;
+ }
+ // Val will contain the final node data
+ // Bits 12:0 Index of the child char pattern in the final child bitmap array
+ // Bits 29:13 Index of where the child pointers start for this node in the Child map array
+ // Bit 30 Set if this nod is a word ending
+ // Bit 31 Set if the number of word endings for this + child nodes is >= 256
+ unsigned int Val = Its->i;
+ Val |= x << 13;
+ if (Root->IsEnd())
+ Val |= 1<<30;
+ if (Root->GetNumEnds() >= 256)
+ Val |= 1<<31;
+
+ // Make sure output arrays are big enough
+ if (Root->GetAddr() > NodeData.size())
+ {
+ NodeData.resize(Root->GetAddr()+1, 4000000000);
+ NodeEnds.resize(Root->GetAddr()+1, 4000000000);
+ }
+ // Save the node data and number of word endings for the node
+ NodeData[Root->GetAddr()] = Val;
+ NodeEnds[Root->GetAddr()] = Root->GetNumEnds();
+
+ // Now do the children
+ for(Itc = Root->ChildBegin(); Itc != Root->ChildEnd(); ++Itc)
+ {
+ CreateArrays(Itc->second, StrSet, ChildAddrs, NodeData, NodeEnds);
+ }
+}
+
+/**********************************************************************************
+ * Output the data as a binary file.
+ */
+static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root,
+ StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
+{
+ int OutputSize;
+ unsigned int FewEndStart = 2000000000;
+ unsigned int i;
+ unsigned int Index;
+ unsigned short u;
+ TrieCheck h;
+
+ for(Index = 0; Index < NodeData.size(); ++Index)
+ {
+ i = NodeData[Index];
+ if ((FewEndStart >= 2000000000) && !(i & (1<<31)))
+ {
+ FewEndStart = Index;
+ break;
+ }
+ }
+ // Header words
+ const unsigned int MAGIC = 'z' + ('x'<< 8) + ('c' << 16) + ('v' << 24);
+ Out->write((char *)&MAGIC, sizeof MAGIC); // Write magic
+ h(&MAGIC, sizeof MAGIC);
+ OutputSize = sizeof MAGIC;
+
+ i = NodeData.size();
+ Out->write((char *)&i, sizeof i); // Write number of nodes
+ h(&i, sizeof i);
+ OutputSize += sizeof i;
+
+ i = ChildAddrs.size();
+ if (NodeData.size() > numeric_limits<unsigned short>::max())
+ i |= 1<<31;
+ Out->write((char *)&i, sizeof i); // Write number of child location entries & size of each entry
+ h(&i, sizeof i);
+ OutputSize += sizeof i;
+
+ i = Ranks.size();
+ Out->write((char *)&i, sizeof i); // Write number of ranks
+ h(&i, sizeof i);
+ OutputSize += sizeof i;
+
+ i = StrSet.size();
+ Out->write((char *)&i, sizeof i); // Write size of of child bitmap data
+ h(&i, sizeof i);
+ OutputSize += sizeof i;
+
+ unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
+ Out->write((char *)&BytePerEntry, sizeof BytePerEntry); // Write size of each child bitmap
+ h(&BytePerEntry, sizeof BytePerEntry);
+ OutputSize += sizeof BytePerEntry;
+
+ Out->write((char *)&FewEndStart, sizeof FewEndStart); // Write number of large end counts
+ h(&FewEndStart, sizeof FewEndStart);
+ OutputSize += sizeof FewEndStart;
+
+ i = NodeData.size();
+ Out->write((char *)&i, sizeof i); // Write number of end counts
+ h(&i, sizeof i);
+ OutputSize += sizeof i;
+
+ i = CharSet.length();
+ Out->write((char *)&i, sizeof i); // Write size of character set
+ h(&i, sizeof i);
+ OutputSize += sizeof i;
+
+ // Output array of node data
+ for(Index = 0; Index < NodeData.size(); ++Index)
+ {
+ i = NodeData[Index];
+ Out->write((char *)&i, sizeof i);
+ h(&i, sizeof i);
+ }
+ OutputSize += Index * sizeof i;
+
+ // Output array of node pointers
+ if (NodeData.size() > numeric_limits<unsigned short>::max())
+ {
+ for(Index = 0; Index < ChildAddrs.size(); ++Index)
+ {
+ i = ChildAddrs[Index];
+ Out->write((char *)&i, sizeof i);
+ h(&i, sizeof i);
+ }
+ OutputSize += Index * sizeof i;
+ }
+ else
+ {
+ for(Index = 0; Index < ChildAddrs.size(); ++Index)
+ {
+ u = ChildAddrs[Index];
+ Out->write((char *)&u, sizeof u);
+ h(&u, sizeof u);
+ }
+ OutputSize += Index * sizeof u;
+ }
+ // Output ranks
+ for(Index = 0; Index < Ranks.size(); ++Index)
+ {
+ u = Ranks[Index].i;
+ Out->write((char *)&u, sizeof u);
+ h(&u, sizeof u);
+ }
+ OutputSize += Index * sizeof u;
+
+ StringIntSet_t::iterator Its;
+ string Str;
+ unsigned char Buf[8];
+
+ // Get the items from StrSet ordered by the index
+ StrIntPtrVect_t SetPtrs;
+ SetPtrs.resize(StrSet.size());
+ for(Its = StrSet.begin(); Its != StrSet.end(); ++Its)
+ {
+ StringInt *p = Its->Self();
+ if (p->i >= StrSet.size())
+ {
+ cout << "p->i=" << p->i << " " << p->s.c_str() << endl;
+ throw "Bad index";
+ }
+ SetPtrs[p->i] = p;
+ }
+ // Output child bitmap
+ for(Index = 0; Index < SetPtrs.size(); ++Index)
+ {
+ string::size_type z, y;
+ StringInt *p;
+ memset(Buf, 0, sizeof Buf);
+ p = SetPtrs[Index];
+ Str = p->s;
+ for(z = 0; z < Str.length(); ++z)
+ {
+ y = CharSet.find(Str[z]);
+ if (y != string::npos)
+ {
+ Buf[y/8] |= 1 << (y & 7);
+ }
+ }
+ Out->write((char *)Buf, BytePerEntry);
+ h(Buf, BytePerEntry);
+ }
+ OutputSize += Index * BytePerEntry;
+
+ unsigned char c;
+ for(Index = 0; Index < FewEndStart; ++Index)
+ {
+ i = NodeEnds[Index] >> 8;
+ if (i >= 256)
+ c = 0;
+ else
+ c = i;
+ Out->write((char *)&c, 1);
+ h(&c, 1);
+ }
+ OutputSize += Index * sizeof c;
+ for(Index = 0; Index < NodeEnds.size(); ++Index)
+ {
+ c = NodeEnds[Index];;
+ Out->write((char *)&c, 1);
+ h(&c, 1);
+ }
+ OutputSize += Index * sizeof c;
+
+ Out->write(CharSet.c_str(), CharSet.length());
+ h(CharSet.c_str(), CharSet.length());
+ OutputSize += CharSet.length();
+
+ if (!ChkFile.empty())
+ {
+ // Write the checksum file
+ TrieCheck::Check_t x = h.Result();
+ ofstream f(ChkFile);
+ f << "static const unsigned char WordCheck[" << sizeof x << "] =\n{\n ";
+ unsigned char *c = reinterpret_cast<unsigned char *>(&x);
+ for(Index = 0; Index < sizeof x; ++Index, ++c)
+ {
+ if (Index)
+ f << ',';
+ f << int(*c);
+ }
+ f << "\n};\n";
+ f << "#define WORD_FILE_SIZE " << OutputSize << endl;
+ f << "#define ROOT_NODE_LOC 0" << endl;
+ f.close();
+ }
+ return OutputSize;
+}
+
+int OutputTester(ostream *Out, bool /*Cmnts*/, StringIntVect_t & Ranks)
+{
+ unsigned int Index;
+ string Pwd;
+ for(Index = 01; Index < Ranks.size(); ++Index)
+ {
+ unsigned int v = Ranks[Index].i;
+ Pwd = Ranks[Index].s;
+ string::size_type x = Pwd.find(':');
+ if (x != string::npos)
+ Pwd.erase(0, x+1);
+
+ *Out << Pwd.c_str() << " ";
+ for(x = Pwd.length(); x < 16; ++x)
+ *Out << ' ';
+ *Out << log(v*1.0) / log(2.0) << '\n';
+ }
+ return Index;
+}
+/**********************************************************************************
+ * Output the data as C source.
+ */
+int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t & StrSet, NodeSPtr & Root,
+ StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
+{
+ unsigned int Index;
+ int OutputSize;
+
+ if (Cmnts)
+ *Out << "#define ND(e,c,b) (e<<30)|(c<<13)|b\n";
+
+ // Output array of node data
+ *Out << "#define ROOT_NODE_LOC 0" << endl;
+ *Out << "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
+ OutputSize = NodeData.size() * sizeof(unsigned int);
+ int x = 99;
+ unsigned int FewEndStart = 2000000000;
+ for(Index = 0; Index < NodeData.size(); ++Index)
+ {
+ unsigned int v;
+ if (++x >= 8)
+ {
+ *Out << "\n ";
+ x=0;
+ }
+ v = NodeData[Index];
+ if (Cmnts)
+ {
+ unsigned int i;
+ i = (v >> 30) & 3;
+ *Out << "ND(" << i << ',';
+ i= (v >> 13) & ((1<<17)-1);
+ *Out << i << ',';
+ if (i < 10000) *Out << ' ';
+ if (i < 1000) *Out << ' ';
+ if (i < 100) *Out << ' ';
+ if (i < 10) *Out << ' ';
+ i = v & ((1<<13)-1);
+ *Out << i << ")";
+ if (Index < (NodeData.size()-1))
+ {
+ *Out << ',';
+ if (i < 1000) *Out << ' ';
+ if (i < 100) *Out << ' ';
+ if (i < 10) *Out << ' ';
+ }
+ }
+ else
+ {
+ *Out << v;
+ if (Index < (NodeData.size()-1))
+ {
+ *Out << ',';
+ if (v < 1000000000) *Out << ' ';
+ if (v < 100000000) *Out << ' ';
+ if (v < 10000000) *Out << ' ';
+ if (v < 1000000) *Out << ' ';
+ if (v < 100000) *Out << ' ';
+ if (v < 10000) *Out << ' ';
+ if (v < 1000) *Out << ' ';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+ }
+ if ((FewEndStart >= 2000000000) && !(v & (1<<31)))
+ FewEndStart = Index;
+ }
+ *Out << "\n};\n";
+
+ // Output array of node pointers
+ *Out << "static const unsigned ";
+ if (NodeData.size() > numeric_limits<unsigned short>::max())
+ {
+ *Out << "int";
+ x = sizeof(unsigned int);
+ }
+ else
+ {
+ *Out << "short";
+ x = sizeof(unsigned short);
+ }
+ *Out << " ChildLocs[" << ChildAddrs.size() << "] =\n{";
+ OutputSize += x * ChildAddrs.size();
+ x = 99;
+ for(Index = 0; Index < ChildAddrs.size(); ++Index)
+ {
+ int v;
+ if (++x > 19)
+ {
+ *Out << "\n ";
+ x=0;
+ }
+ v = ChildAddrs[Index];
+ *Out << v;
+ if (Index < (ChildAddrs.size()-1))
+ {
+ *Out << ',';
+ if (v < 10000) *Out << ' ';
+ if (v < 1000) *Out << ' ';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+
+ }
+ *Out << "\n};\n";
+
+ // Output the rank of the words
+ *Out << "static const unsigned short Ranks[" << Ranks.size() << "] =\n{";
+ OutputSize += Ranks.size() * sizeof(unsigned short);
+ x=99;
+ if (Cmnts)
+ {
+ *Out << "\n";
+ for(Index = 0; Index < Ranks.size(); ++Index)
+ {
+ unsigned int v;
+ *Out << " ";
+ v = Ranks[Index].i;
+ *Out << v;
+ if (Index < (Ranks.size()-1))
+ {
+ *Out << ',';
+ if (v < 10000) *Out << ' ';
+ if (v < 1000) *Out << ' ';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+ *Out << " // " << Ranks[Index].s.c_str() << '\n';
+ }
+ }
+ else
+ {
+ for(Index = 0; Index < Ranks.size(); ++Index)
+ {
+ unsigned int v;
+ if (++x > 19)
+ {
+ *Out << "\n ";
+ x=0;
+ }
+ v = Ranks[Index].i;
+ *Out << v;
+ if (Index < (Ranks.size()-1))
+ {
+ *Out << ',';
+ if (v < 10000) *Out << ' ';
+ if (v < 1000) *Out << ' ';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+ }
+ }
+ *Out << "\n};\n";
+
+ unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
+ *Out << "#define SizeChildMapEntry " << BytePerEntry << '\n';
+ *Out << "static const unsigned char ChildMap[" << StrSet.size() << '*' << BytePerEntry << "] =\n{";
+ OutputSize += StrSet.size() * BytePerEntry * sizeof(unsigned char);
+
+ StringIntSet_t::iterator Its;
+ string Str;
+ unsigned char Buf[8];
+
+ // Get the items from StrSet ordered by the index
+ StrIntPtrVect_t SetPtrs;
+ SetPtrs.resize(StrSet.size());
+ for(Its = StrSet.begin(); Its != StrSet.end(); ++Its)
+ {
+ StringInt *p = Its->Self();
+ if (p->i >= StrSet.size())
+ {
+ cout << "p->i=" << p->i << " " << p->s.c_str() << endl;
+ throw "Bad index";
+ }
+ SetPtrs[p->i] = p;
+ }
+
+ x=99;
+ for(Index = 0; Index < SetPtrs.size(); ++Index)
+ {
+ string::size_type z, y;
+ StringInt *p;
+ memset(Buf, 0, sizeof Buf);
+ if (x > 4)
+ {
+ *Out << "\n ";
+ x = 0;
+ }
+ p = SetPtrs[Index];
+ Str = p->s;
+ for(z = 0; z < Str.length(); ++z)
+ {
+ y = CharSet.find(Str[z]);
+ if (y != string::npos)
+ {
+ Buf[y/8] |= 1 << (y & 7);
+ }
+ }
+ for(z = 0; z < BytePerEntry; ++z)
+ {
+ y = Buf[z] & 0xFF;
+ *Out << y;
+ if (z < (BytePerEntry-1))
+ *Out << ',';
+ else
+ {
+ if (Index < (SetPtrs.size() - 1))
+ *Out << ", ";
+ }
+ if (y < 100)
+ *Out << ' ';
+ if (y < 10)
+ *Out << ' ';
+ }
+ ++x;
+ if (Cmnts)
+ {
+ *Out << " // " << p->i << ": " << Str;
+ x = 99;
+ }
+
+
+ }
+ *Out << "\n};" << endl;
+
+ // Output the top 8 bits of the node word endings count. Since node with >255 endings have
+ // been placed at the begining, and ther are not too many of them the array is fairly small.
+ *Out << "static const unsigned char EndCountLge[" << FewEndStart << "] =\n{";
+ OutputSize += FewEndStart * sizeof(unsigned char);
+ x=99;
+ for(Index = 0; Index < FewEndStart; ++Index)
+ {
+ unsigned int v;
+ if (++x > 19)
+ {
+ *Out << "\n ";
+ x=0;
+ }
+ v = NodeEnds[Index] >> 8;
+ if (v >= 256)
+ v = 0;
+ *Out << v;
+ if (Index < (FewEndStart-1))
+ {
+ *Out << ',';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+ }
+ *Out << "\n};\n";
+
+ // Output all the word ending counts. For the first few nodes this is just the lower 8 bits of
+ // the value. For the rest each entry contains the whole count. The split between lower and
+ // upper halves of the value for the first few nodes allows bytes arrays to be used, so saving
+ // memory.
+ *Out << "static const unsigned char EndCountSml[" << NodeEnds.size() << "] =\n{";
+ OutputSize += NodeEnds.size() * sizeof(unsigned char);
+ x=99;
+ for(Index = 0; Index < NodeEnds.size(); ++Index)
+ {
+ unsigned int v;
+ if (++x > 19)
+ {
+ *Out << "\n ";
+ x=0;
+ }
+ v = NodeEnds[Index] & 255;
+ *Out << v;
+ if (Index < (NodeEnds.size()-1))
+ {
+ *Out << ',';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+ }
+ *Out << "\n};\n";
+
+ // Finally output the used characters.
+ *Out << "static const char CharSet[" << CharSet.length()+1 << "] = \"";
+ OutputSize += CharSet.length() * sizeof(char);
+ for(Index = 0; Index < CharSet.length(); ++Index)
+ {
+ char c = CharSet[Index];
+ if ((c == '\\') || (c == '"'))
+ *Out << '\\';
+ *Out << c;
+ }
+ *Out << "\";" << endl;
+ *Out << "#define ROOT_NODE_LOC " << Root->GetAddr() << "\n";
+ return OutputSize + sizeof(unsigned int);
+}
+enum { OUT_C_CODE, OUT_BINARY, OUT_TESTER };
+/**********************************************************************************
+ */
+int main(int argc, char *argv[])
+{
+ int i;
+ int OutType = OUT_C_CODE;
+ bool Verbose = false;
+ bool Comments = false;
+ string FileName, HashFile;
+ char *OutFile = 0;
+ EntryMap_t Entries;
+ FileInfo InInfo[10];
+ int NumFiles = 0;
+ MinLength = 999;
+
+ try
+ {
+ for(i = 1; i < argc; ++i)
+ {
+ FileName = argv[i];
+ if (FileName == "-b")
+ {
+ // Output a binary file to stdout or file
+ OutType = OUT_BINARY;
+ continue;
+ }
+ if (FileName == "-t")
+ {
+ // Output a tester file to stdout or file
+ OutType = OUT_TESTER;
+ continue;
+ }
+ if (FileName == "-c")
+ {
+ // Add comments to the output (if text)
+ Comments = true;
+ continue;
+ }
+ if (FileName == "-o")
+ {
+ // Give output file
+ if (++i < argc)
+ OutFile = argv[i];
+ continue;
+ }
+ if (FileName == "-h")
+ {
+ // Give crc header output file
+ if (++i < argc)
+ HashFile = argv[i];
+ continue;
+ }
+ if (FileName == "-v")
+ {
+ Verbose = true;
+ continue;
+ }
+ if (FileName[0] == '-')
+ {
+ cerr << "Usage: " << argv[0] << " [ -c ] [ -b | -t ] [ -o Ofile ] [ -h Hfile ] Files...\n"
+ "Where:\n"
+ " -b Generate a binary output file\n"
+ " -t Generate a test file for testing zxcvbn\n"
+ " -c Add comments to output file if C code mode\n"
+ " -v Additional information output\n"
+ " -h Hfile Write file checksum to file Hfile as C code (for -b mode)\n"
+ " -o Ofile Write output to file Ofile\n"
+ " Files The dictionary input files to read\n"
+ " If the -o option is not used, output is written to stdout\n"
+ " if the -b option is not used, output is in the form of C source code\n"
+ << endl;
+ return 1;
+ }
+ ReadInputFile(Entries, FileName, i, InInfo[NumFiles]);
+ if (NumFiles < int(sizeof InInfo / sizeof InInfo[0] - 1))
+ ++NumFiles;
+ }
+ if (Verbose)
+ {
+ if (!OutFile && (OutType == OUT_C_CODE))
+ cout << "/*\n";
+ for(i = 0; i < NumFiles; ++i)
+ {
+ FileInfo *Fi = InInfo + i;
+ cout << "Read input file " << Fi->Name << endl;
+ cout << " Input words " << Fi->Words << endl;
+ cout << " Used words " << Fi->Used << endl;
+ cout << /*" Unused " << Fi->BruteIgnore <<
+ " Bruteforce compare, " << Fi->Accented <<
+ " Accented char, " << Fi->Dups << " Duplicates" << */ endl;
+ }
+ }
+ bool InputCharSet[256];
+ NodeSPtr Root(new Node);
+ // Initially charset of used chracters is empty
+ memset(InputCharSet, 0, sizeof InputCharSet);
+
+ // Add words to the trie with root in Root
+ ProcessEntries(Root, Entries, InputCharSet);
+
+ // Get some interesting info
+ int NumEnds = Root->CalcEndings();
+ int Hi = Root->CalcHeight();
+ int NumNodes = Root->NodeCount();
+ if (Verbose)
+ {
+ cout << "Max word length = " << MaxLength << endl;
+ cout << "Min word length = " << MinLength << endl;
+ cout << "Num input chars = " << NumChars << endl;
+ cout << "Num input words = " << NumInWords << endl;
+ cout << "Duplicate words = " << NumDuplicate;
+ cout << " (Rank most reduced for \"" << MaxOccurStr.c_str() << "\")"<< endl;
+ cout << "Number of Ends = " << NumEnds << endl;
+ cout << "Number of Nodes = " << NumNodes << endl;
+ cout << "Trie height = " << Hi << endl;
+ }
+ // Store the alphabetical ordering of the input words
+ i = 0;
+ ScanTrieForOrder(Entries, i, Root, string());
+ if (Verbose)
+ cout << "Trie Order = " << i << endl;
+ int InputOrder = i;
+ // Reduce the Trie
+ ReduceTrie(Root);
+
+ // Output some interesting information
+ NumNodes = Root->NodeCount();
+ int ReduceEnds = Root->CalcEndings();
+ if (Verbose)
+ {
+ cout << "After reduce:\n";
+ cout << "Number of Ends = " << ReduceEnds << endl;
+ cout << "Number of Nodes = " << NumNodes << endl;
+ }
+ // Check reduction was OK
+ StringIntVect_t Ranks;
+ int CheckEnds = CheckReduction(Ranks, Root, Entries);
+ if (Verbose)
+ cout << "Number of Words = " << CheckEnds << endl;
+
+ ChkNum Tst = CheckEntries(Root, string(), Entries);
+ if (Verbose)
+ {
+ cout << "2nd check - Number of valid words = " << Tst.Match << endl;
+ cout << " Number of invalid words = " << Tst.Err << endl;
+ }
+
+ // Give up if there was an error
+ if (Tst.Err)
+ throw "Checks show invalid words after reduction";
+ if ((Tst.Match != InputOrder) || (ReduceEnds != InputOrder))
+ throw "Word count changed after reduce";
+
+ // Output more info
+ StringIntSet_t ChildBits;
+ string CharSet = MakeCharSet(InputCharSet);
+ if (Verbose)
+ cout << "Used characters (" << CharSet.length() << "): " << CharSet.c_str() << endl;
+
+ // Make a set of all unique child character patterns for the nodes
+ i=0;
+ Root->ClearCounted();
+ MakeChildBitMap(ChildBits, Root, i);
+ if (Verbose)
+ cout << "Number of child bitmaps = " << ChildBits.size() << endl;
+
+ // Get final node address
+ Root->CalcAddress();
+
+ UintVect NodeData;
+ UintVect NodeEnds;
+ StringOfInts ChildAddrs;
+
+ // Resize to save library adjusting allocation during data creation
+ NodeData.resize(NumNodes, 4000000000);
+ NodeEnds.resize(NumNodes, 4000000000);
+ CreateArrays(Root, ChildBits, ChildAddrs, NodeData, NodeEnds);
+ if (Verbose)
+ {
+ cout << "Node data array size " << NodeData.size() << endl;
+ cout << "Child pointer array size " << ChildAddrs.size() << endl;
+ }
+ shared_ptr<ofstream> fout;
+ ostream *Out = &cout;
+ if (OutFile)
+ {
+ fout = shared_ptr<ofstream>(new ofstream);
+ if (OutType == OUT_BINARY)
+ fout->open(OutFile, ios_base::trunc | ios_base::binary);
+ else
+ fout->open(OutFile, ios_base::trunc);
+ Out = fout.get();
+ }
+ if (!OutFile && (OutType == OUT_C_CODE))
+ cout << "*/\n";
+
+ if (OutType == OUT_BINARY)
+ i = OutputBinary(Out, HashFile, CharSet, ChildBits, ChildAddrs, NodeData, NodeEnds, Ranks);
+ else if (OutType == OUT_TESTER)
+ i = OutputTester(Out, Comments, Ranks);
+ else
+ i = OutputCode(Out, Comments, CharSet, ChildBits, Root, ChildAddrs, NodeData, NodeEnds, Ranks);
+
+ if (fout)
+ {
+ fout->close();
+ }
+ }
+ catch(const char *m)
+ {
+ cerr << m << endl;
+ return 1;
+ }
+ catch(string m)
+ {
+ cerr << m.c_str() << endl;
+ return 1;
+ }
+ catch(...)
+ {
+ cerr << "Unhandled exception" << endl;
+ return 1;
+ }
+ return 0;
+}
+
+/**********************************************************************************/