1 files changed, 300 insertions, 151 deletions
diff --git a/dict-generate.cpp b/dict-generate.cpp
index cd2fb05..410182d 100644
--- a/dict-generate.cpp
+++ b/dict-generate.cpp
@@ -27,9 +27,9 @@
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
- * 
+ *
  **********************************************************************************/
- 
+
 #include <iostream>
 #include <string>
 #include <fstream>
@@ -85,11 +85,11 @@ public:
     void          SetEnd()           { mEnd = true; }
     bool          IsEnd() const      { return mEnd; }
     int           Height() const     { return mHeight; }
-    
+
     // Scan the trie and count nodes
     int           NodeCount()        { ClearCounted() ; return CountNodes(); }
-    
-    
+
+
     int           CalcAddress()      { int a=0; ClearCounted(); a=CalcAddr(a, true); return CalcAddr(a, false); }
     Node         *GetParent()        { return mParent; }
     unsigned int  GetAddr() const    { return mAddr; }
@@ -224,7 +224,6 @@ int Node::CalcEndings()
         mEndings = n;
     }
     return mEndings;
-    
 }
 
 /**********************************************************************************
@@ -363,7 +362,6 @@ std::string Node::GetChildChars()
     return Result;
 }
 
-
 /**********************************************************************************
  * struct to hold data read from input file (except for the word string)
  */
@@ -392,33 +390,35 @@ struct StringInt
     StringInt * Self() const { return const_cast<StringInt *>(this); }
 };
 
-typedef std::map<std::string, Entry> EntryMap_t;
-typedef std::list<NodeSPtr> NodeList_t;
+typedef map<string, Entry> EntryMap_t;
+typedef list<string> StringList_t;
+typedef list<NodeSPtr> NodeList_t;
 typedef set<StringInt> StringIntSet_t;
 typedef basic_string<int> StringOfInts;
 typedef vector<unsigned int> UintVect;
+typedef vector<uint64_t> Uint64Vect;
 typedef vector<StringInt *> StrIntPtrVect_t;
 typedef vector<StringInt> StringIntVect_t;
 
 // Variables holding 'interesting' information on the data
 unsigned int MaxLength, MinLength, NumChars, NumInWords, NumDuplicate;
-int MaxOccurReduce;
-string MaxOccurStr;
 struct FileInfo
 {
-    FileInfo() : Words(0), BruteIgnore(0), Accented(0), Dups(0), Used(0) { }
+    FileInfo() : Words(0), BruteIgnore(0), Accented(0), Dups(0), Used(0), Rank(0) { }
     string Name;
+    StringList_t Pwds;
     int Words;
     int BruteIgnore;
     int Accented;
     int Dups;
     int Used;
+    int Rank;
 };
 
 /**********************************************************************************
- * Read the file of words and add them to Entries.
+ * Read the file of words and add them to the file information.
  */
-static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int DictNum, FileInfo &Info)
+static bool ReadInputFile(const string & FileName, FileInfo &Info, int MaxRank)
 {
     ifstream f(FileName.c_str());
     if (!f.is_open())
@@ -427,12 +427,12 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
         return false;
     }
     Info.Name = FileName;
-    
+
     // Rank is the position of the work in the dictionary file. Rank==1 is lowest for a word (and
     // indicates a very popular or bad password).
     int Rank = 0;
     string Line;
-    while(getline(f, Line))
+    while(getline(f, Line) && (Rank < MaxRank))
     {
         // Truncate at first space or tab to leave just the word in case additional info on line
         string::size_type y = Line.find_first_of("\t ");
@@ -442,7 +442,7 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
         y = Line.length();
         if (!y)
             continue;
-        
+
         ++Info.Words;
         // Only use words where all chars are ascii (no accents etc.)
         string::size_type x;
@@ -461,7 +461,7 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
             ++Info.Accented;
             continue;
         }
-        
+
         // Don't use words where the brute force strength is less than the word's rank
         if (BruteForce < (Rank+1))
         {
@@ -474,43 +474,51 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
         if (y < MinLength)
             MinLength = y;
         NumChars += y;
-        ++NumInWords;
+
+        Info.Pwds.push_back(Line);
         ++Rank;
+    }
+    f.close();
+    return true;
+}
 
-        EntryMap_t::iterator It = Entries.find(Line);
-        if (It != Entries.end())
+static void CombineWordLists(EntryMap_t & Entries, FileInfo *Infos, int NumInfo)
+{
+    bool Done = false;
+    int Rank = 0;
+    while(!Done)
+    {
+        int i;
+        ++Rank;
+        Done = true;
+        for(i = 0; i < NumInfo; ++i)
         {
-            // This is a repeat of a previous entry
-            int r = It->second.mRank;
-            if (r > Rank)
+            FileInfo *p = Infos + i;
+            while(!p->Pwds.empty())
             {
-                // Remember new lower rank
-                It->second.mRank = Rank;
-                It->second.mDict = DictNum;
-                It->second.mOccurs += 1;
-                r -= Rank;
-                if (r > MaxOccurReduce)
+                Done = false;
+                string Word = p->Pwds.front();
+                p->Pwds.pop_front();
+                EntryMap_t::iterator It = Entries.find(Word);
+                if (It != Entries.end())
+                {
+                    // Word is repeat of one from another file
+                    p->Dups += 1;
+                    ++NumDuplicate;
+                }
+                else
                 {
-                    MaxOccurStr = Line;
-                    MaxOccurReduce = r;
+                    // New word, add it
+                    Entry e;
+                    e.mDict = i;
+                    e.mRank = Rank;
+                    Entries.insert(std::pair<std::string, Entry>(Word, e));
+                    p->Used += 1;
+                    break;
                 }
             }
-            else
-                ++Info.Dups;
-            ++NumDuplicate;
-        }
-        else
-        {
-            // New word
-            Entry e;
-            e.mDict = DictNum;
-            e.mRank = Rank;
-            Entries.insert(std::pair<std::string, Entry>(Line, e));
-            ++Info.Used;
         }
     }
-    f.close();
-    return true;
 }
 
 /**********************************************************************************
@@ -535,7 +543,7 @@ static void ProcessEntries(NodeSPtr Root, EntryMap_t & Entries, bool *InputCharS
             InputCharSet[c & 0xFF] = true;
         }
         pNode->SetEnd();
-    } 
+    }
 }
 
 /**********************************************************************************
@@ -599,7 +607,7 @@ static void ReduceTrie(NodeSPtr Root)
     int Height;
     int cnt=0, del=0;
     Root->CalcCheck();
-    
+
     NodeSPtr pNode = Root;
     for(Height = Root->CalcHeight(); Height >= 0; --Height)
     {
@@ -796,7 +804,7 @@ string MakeCharSet(bool *InputCharSet)
 
 /**********************************************************************************
  * Create a set of strings which contain the possible characters matched at
- * a node when checking a word. 
+ * a node when checking a word.
  */
 void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
 {
@@ -825,15 +833,27 @@ void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
     Root->SetCounted();
 }
 
+// Constants defining bit positions of node data
+// Number of bits to represent the index of the child char pattern in the final child bitmap array,
+const int BITS_CHILD_PATT_INDEX = 14;
+
+// Number of bits to represent index of where the child pointers start for this node in
+// the Child map array and its bit position
+const int BITS_CHILD_MAP_INDEX = 18;
+const int SHIFT_CHILD_MAP_INDEX = BITS_CHILD_PATT_INDEX;
+
+// Bit positions of word ending indicator and indicator for number of word endings for this + child nodes is >= 256
+const int SHIFT_WORD_ENDING_BIT = SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX;
+const int SHIFT_LARGE_ENDING_BIT = SHIFT_WORD_ENDING_BIT + 1;
 /**********************************************************************************
  * Create the arrays of data that will be output
  */
-void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds)
+void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds)
 {
     NodeMap_t::iterator Itc;
     StringInt Tmp;
     StringOfInts Chld;
-    
+
     // Find children in the child pattern array
     Tmp.s= Root->GetChildChars();
     StringIntSet_t::iterator Its = StrSet.find(Tmp);
@@ -853,17 +873,27 @@ void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAd
         ChildAddrs += Chld;
     }
     // Val will contain the final node data
-    // Bits 12:0    Index of the child char pattern in the final child bitmap array
-    // Bits 29:13   Index of where the child pointers start for this node in the Child map array
-    // Bit 30       Set if this nod is a word ending
-    // Bit 31       Set if the number of word endings for this + child nodes is >= 256
-    unsigned int Val = Its->i;
-    Val |= x << 13;
+    uint64_t Val = Its->i;
+    if (Val >= (1 << BITS_CHILD_PATT_INDEX))
+    {
+        char Tmp[20];
+        snprintf(Tmp, sizeof Tmp, "%u", Its->i);
+        throw string("Not enough bits for child pattern index value of ") + Tmp + " for " +
+                Its->s + " (BITS_CHILD_PATT_INDEX too small)";
+    }
+    if (x >= (1 << BITS_CHILD_MAP_INDEX))
+    {
+        char Tmp[20];
+        snprintf(Tmp, sizeof Tmp, "%lu", x);
+        throw string("Not enough bits for child map index value of ") + Tmp + " for " +
+                Its->s + " (BITS_CHILD_MAP_INDEX too small)";
+    }
+    Val |= x << SHIFT_CHILD_MAP_INDEX;
     if (Root->IsEnd())
-        Val |= 1<<30;
+        Val |= uint64_t(1) << SHIFT_WORD_ENDING_BIT;
     if (Root->GetNumEnds() >= 256)
-        Val |= 1<<31;
-    
+        Val |= uint64_t(1) << SHIFT_LARGE_ENDING_BIT;
+
     // Make sure output arrays are big enough
     if (Root->GetAddr() > NodeData.size())
     {
@@ -884,8 +914,8 @@ void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAd
 /**********************************************************************************
  * Output the data as a binary file.
  */
-static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root, 
-                        StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
+static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root,
+                        StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
 {
     int OutputSize;
     unsigned int FewEndStart = 2000000000;
@@ -896,26 +926,27 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
 
     for(Index = 0; Index < NodeData.size(); ++Index)
     {
-        i = NodeData[Index];
-        if ((FewEndStart >= 2000000000) && !(i & (1<<31)))
+        uint64_t v = NodeData[Index];
+        if ((FewEndStart >= 2000000000) && !(v & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
         {
             FewEndStart = Index;
             break;
         }
     }
     // Header words
+    unsigned int NumWordEnd;
     const unsigned int MAGIC = 'z' + ('x'<< 8) + ('c' << 16) + ('v' << 24);
     Out->write((char *)&MAGIC, sizeof MAGIC);   // Write magic
     h(&MAGIC, sizeof MAGIC);
     OutputSize = sizeof MAGIC;
-    
+
     i = NodeData.size();
     Out->write((char *)&i, sizeof i);   // Write number of nodes
     h(&i, sizeof i);
     OutputSize += sizeof i;
-    
+
     i = ChildAddrs.size();
-    if (NodeData.size() > numeric_limits<unsigned short>::max())
+    if (NodeData.size() > numeric_limits<unsigned int>::max())
         i |= 1<<31;
     Out->write((char *)&i, sizeof i);   // Write number of child location entries & size of each entry
     h(&i, sizeof i);
@@ -925,7 +956,12 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
     Out->write((char *)&i, sizeof i);   // Write number of ranks
     h(&i, sizeof i);
     OutputSize += sizeof i;
-    
+
+    NumWordEnd = (NodeData.size() + 7) / 8;
+    Out->write((char *)&NumWordEnd, sizeof NumWordEnd);   // Write number of word endings
+    h(&NumWordEnd, sizeof NumWordEnd);
+    OutputSize += sizeof NumWordEnd;
+
     i = StrSet.size();
     Out->write((char *)&i, sizeof i);   // Write size of of child bitmap data
     h(&i, sizeof i);
@@ -951,48 +987,71 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
     OutputSize += sizeof i;
 
     // Output array of node data
+    unsigned char *WordEnds = new unsigned char[NumWordEnd];
+    unsigned char v = 0;
+    unsigned int z = 0;
+    int y = 0;
     for(Index = 0; Index < NodeData.size(); ++Index)
     {
         i = NodeData[Index];
         Out->write((char *)&i, sizeof i);
         h(&i, sizeof i);
-    }    
-    OutputSize += Index * sizeof i;
 
-    // Output array of node pointers
-    if (NodeData.size() > numeric_limits<unsigned short>::max())
-    {
-        for(Index = 0; Index < ChildAddrs.size(); ++Index)
+        if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
+            v |= 1 << y;
+        if (++y >= 8)
         {
-            i = ChildAddrs[Index];
-            Out->write((char *)&i, sizeof i);
-            h(&i, sizeof i);
+            WordEnds[z++] = v;
+            y = 0;
+            v = 0;
         }
-        OutputSize += Index * sizeof i;
     }
-    else
+    while(z < NumWordEnd)
     {
-        for(Index = 0; Index < ChildAddrs.size(); ++Index)
-        {
-            u = ChildAddrs[Index];
-            Out->write((char *)&u, sizeof u);
-            h(&u, sizeof u);
-        }
-        OutputSize += Index * sizeof u;
+         WordEnds[z++] = v;
+         v = 0;
     }
+    OutputSize += Index * sizeof i;
+
+    // Output array of node pointers
+    for(Index = 0; Index < ChildAddrs.size(); ++Index)
+    {
+        i = ChildAddrs[Index];
+        Out->write((char *)&i, sizeof i);
+        h(&i, sizeof i);
+    }
+    OutputSize += Index * sizeof i;
+
     // Output ranks
     for(Index = 0; Index < Ranks.size(); ++Index)
     {
-        u = Ranks[Index].i;
+        i = Ranks[Index].i;
+        if (i >= (1 << 15))
+        {
+            i -= 1 << 15;
+            i /= 4;
+            if (i >= (1 << 15))
+                i = (1 << 15) - 1;
+            i |= 1 << 15;
+        }
+        if (i > numeric_limits<unsigned short>::max())
+            i = numeric_limits<unsigned short>::max();
+        u = i;
         Out->write((char *)&u, sizeof u);
         h(&u, sizeof u);
-    }    
+    }
     OutputSize += Index * sizeof u;
 
+    // Output word end bit markers
+    Out->write((char *)WordEnds, NumWordEnd);
+    h(WordEnds, NumWordEnd);
+    OutputSize += NumWordEnd;
+    delete WordEnds;
+
     StringIntSet_t::iterator Its;
     string Str;
     unsigned char Buf[8];
-    
+
     // Get the items from StrSet ordered by the index
     StrIntPtrVect_t SetPtrs;
     SetPtrs.resize(StrSet.size());
@@ -1000,10 +1059,7 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
     {
         StringInt *p = Its->Self();
         if (p->i >= StrSet.size())
-        {
-            cout << "p->i=" << p->i << "  " << p->s.c_str() << endl;
             throw "Bad index";
-        }
         SetPtrs[p->i] = p;
     }
     // Output child bitmap
@@ -1026,7 +1082,7 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
         h(Buf, BytePerEntry);
      }
     OutputSize += Index * BytePerEntry;
-    
+
     unsigned char c;
     for(Index = 0; Index < FewEndStart; ++Index)
     {
@@ -1039,9 +1095,10 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
         h(&c, 1);
     }
     OutputSize += Index * sizeof c;
+
     for(Index = 0; Index < NodeEnds.size(); ++Index)
     {
-        c = NodeEnds[Index];;
+        c = NodeEnds[Index];
         Out->write((char *)&c, 1);
         h(&c, 1);
     }
@@ -1050,7 +1107,7 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
     Out->write(CharSet.c_str(), CharSet.length());
     h(CharSet.c_str(), CharSet.length());
     OutputSize += CharSet.length();
-    
+
     if (!ChkFile.empty())
     {
         // Write the checksum file
@@ -1066,7 +1123,11 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
         }
         f << "\n};\n";
         f << "#define WORD_FILE_SIZE " << OutputSize << endl;
-        f << "#define ROOT_NODE_LOC  0" << endl;
+        f << "#define ROOT_NODE_LOC 0\n"
+             "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
+             "#define BITS_CHILD_MAP_INDEX  " << BITS_CHILD_MAP_INDEX << "\n" 
+             "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
+             "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)" << endl;
         f.close();
     }
     return OutputSize;
@@ -1083,53 +1144,60 @@ int OutputTester(ostream *Out, bool /*Cmnts*/, StringIntVect_t & Ranks)
         string::size_type x = Pwd.find(':');
         if (x != string::npos)
             Pwd.erase(0, x+1);
-        
+
         *Out << Pwd.c_str() << "  ";
         for(x = Pwd.length(); x < 16; ++x)
             *Out << ' ';
-        *Out << log(v*1.0) / log(2.0) << '\n';
+        *Out << log(v*1.0) / log(2.0) << "  " << v << '\n';
     }
     return Index;
 }
+const int LINE_OUT_LEN = 160;
 /**********************************************************************************
  * Output the data as C source.
  */
 int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t & StrSet, NodeSPtr & Root,
-               StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
+               StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
 {
     unsigned int Index;
     int OutputSize;
 
     if (Cmnts)
-        *Out << "#define ND(e,c,b) (e<<30)|(c<<13)|b\n";
-    
+        *Out << "#define ND(e,c,b) (c<<" << SHIFT_CHILD_MAP_INDEX << ")|b\n";
+
     // Output array of node data
-    *Out << "#define ROOT_NODE_LOC 0" << endl;
-    *Out << "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
+    *Out << "#define ROOT_NODE_LOC 0\n"
+            "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
+            "#define BITS_CHILD_MAP_INDEX  " << BITS_CHILD_MAP_INDEX << "\n" 
+            "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
+            "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)\n"
+            "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
     OutputSize = NodeData.size() * sizeof(unsigned int);
-    int x = 99;
+    int x = 999;
     unsigned int FewEndStart = 2000000000;
     for(Index = 0; Index < NodeData.size(); ++Index)
     {
-        unsigned int v;
-        if (++x >= 8)
+        uint64_t v;
+        x += 11;
+        if (x > LINE_OUT_LEN)
         {
             *Out << "\n    ";
             x=0;
         }
         v = NodeData[Index];
+        v &= (uint64_t(1) << SHIFT_WORD_ENDING_BIT) - 1;
         if (Cmnts)
         {
-            unsigned int i;
-            i = (v >> 30) & 3;
+            uint64_t i;
+            i = (v >> SHIFT_WORD_ENDING_BIT) & 3;
             *Out << "ND(" << i << ',';
-            i= (v >> 13) & ((1<<17)-1);
+            i= (v >> SHIFT_CHILD_MAP_INDEX) & ((1<<BITS_CHILD_MAP_INDEX)-1);
             *Out << i << ',';
             if (i < 10000) *Out << ' ';
             if (i < 1000)  *Out << ' ';
             if (i < 100)   *Out << ' ';
             if (i < 10)    *Out << ' ';
-            i = v & ((1<<13)-1);
+            i = v & ((1<<BITS_CHILD_PATT_INDEX)-1);
             *Out << i << ")";
             if (Index < (NodeData.size()-1))
             {
@@ -1140,7 +1208,7 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
             }
         }
         else
-        {    
+        {
             *Out << v;
             if (Index < (NodeData.size()-1))
             {
@@ -1156,17 +1224,52 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
                 if (v < 10)    *Out << ' ';
             }
         }
-        if ((FewEndStart >= 2000000000) && !(v & (1<<31)))
+        if ((FewEndStart >= 2000000000) && !(NodeData[Index] & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
             FewEndStart = Index;
     }
     *Out << "\n};\n";
-    
+    unsigned int Len = ((NodeData.size() + 7) / 8);
+    OutputSize += Len;
+    x = 999;
+    *Out << "static unsigned char WordEndBits[" << Len << "] =\n{";
+    Index = 0;
+    unsigned int v = 0;
+    unsigned int y = 0;
+    unsigned int z = 0;
+    while(z < Len)
+    {
+        if (Index < NodeData.size())
+        {
+            if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
+                v |= 1 << y;
+        }
+        if (++y >= 8)
+        {
+            x += 4;
+            if (x > LINE_OUT_LEN)
+            {
+                *Out << "\n    ";
+                x = 0;
+            }
+            *Out << v;
+            if (++z < Len)
+            {
+                *Out << ',';
+                if (v < 100) *Out << ' ';
+                if (v < 10) *Out << ' ';
+            }
+            y = 0;
+            v = 0;
+        }
+        ++Index;
+    }
+    *Out << "\n};\n";
     // Output array of node pointers
     *Out << "static const unsigned ";
     if (NodeData.size() > numeric_limits<unsigned short>::max())
     {
         *Out << "int";
-        x = sizeof(unsigned int); 
+        x = sizeof(unsigned int);
     }
     else
     {
@@ -1175,11 +1278,12 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
     }
     *Out << " ChildLocs[" << ChildAddrs.size() << "] =\n{";
     OutputSize += x * ChildAddrs.size();
-    x = 99;
+    x = 999;
     for(Index = 0; Index < ChildAddrs.size(); ++Index)
     {
         int v;
-        if (++x > 19)
+        x += 6;
+        if (x > LINE_OUT_LEN)
         {
             *Out << "\n    ";
             x=0;
@@ -1194,14 +1298,14 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
             if (v < 100)   *Out << ' ';
             if (v < 10)    *Out << ' ';
         }
-        
     }
     *Out << "\n};\n";
 
     // Output the rank of the words
     *Out << "static const unsigned short Ranks[" << Ranks.size() << "] =\n{";
     OutputSize += Ranks.size() * sizeof(unsigned short);
-    x=99;
+    x = 999;
+    bool TooBig = false;
     if (Cmnts)
     {
         *Out << "\n";
@@ -1210,6 +1314,19 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
             unsigned int v;
             *Out << "    ";
             v = Ranks[Index].i;
+            if (v >= (1 << 15))
+            {
+                v -= 1 << 15;
+                v /= 4;
+                if (v >= (1 << 15))
+                {
+                    TooBig = true;
+                    v = (1 << 15) - 1;
+                }
+                v |= 1 << 15;
+            }
+            if (v > numeric_limits<unsigned short>::max())
+                v = numeric_limits<unsigned short>::max();
             *Out << v;
             if (Index < (Ranks.size()-1))
             {
@@ -1227,12 +1344,26 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
         for(Index = 0; Index < Ranks.size(); ++Index)
         {
             unsigned int v;
-            if (++x > 19)
+            x += 6;
+            if (x > LINE_OUT_LEN)
             {
                 *Out << "\n    ";
                 x=0;
             }
             v = Ranks[Index].i;
+            if (v >= (1 << 15))
+            {
+                v -= 1 << 15;
+                v /= 4;
+                if (v >= (1<<15))
+                {
+                    TooBig = true;
+                    v = (1 << 15) - 1;
+                }
+                v |= 1 << 15;
+            }
+            if (v > numeric_limits<unsigned short>::max())
+                v = numeric_limits<unsigned short>::max();
             *Out << v;
             if (Index < (Ranks.size()-1))
             {
@@ -1245,16 +1376,20 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
         }
     }
     *Out << "\n};\n";
-
+    if (TooBig)
+    {
+        unsigned int v  = ((1<<15) - 1) * 4 + (1<<15);
+        cout << "// Word ranks too large, value restricted to " << v << endl;
+    }
     unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
     *Out << "#define SizeChildMapEntry " << BytePerEntry << '\n';
     *Out << "static const unsigned char ChildMap[" << StrSet.size() << '*' << BytePerEntry << "] =\n{";
     OutputSize += StrSet.size() * BytePerEntry * sizeof(unsigned char);
-        
+
     StringIntSet_t::iterator Its;
     string Str;
     unsigned char Buf[8];
-    
+
     // Get the items from StrSet ordered by the index
     StrIntPtrVect_t SetPtrs;
     SetPtrs.resize(StrSet.size());
@@ -1268,17 +1403,16 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
         }
         SetPtrs[p->i] = p;
     }
-
-    x=99;
+    x = 999;
     for(Index = 0; Index < SetPtrs.size(); ++Index)
     {
         string::size_type z, y;
         StringInt *p;
         memset(Buf, 0, sizeof Buf);
-        if (x > 4)
+        if (x > LINE_OUT_LEN)
         {
             *Out << "\n    ";
-            x = 0;
+            x = 4*BytePerEntry;
         }
         p = SetPtrs[Index];
         Str = p->s;
@@ -1305,27 +1439,27 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
                 *Out << ' ';
             if (y < 10)
                 *Out << ' ';
+            x += 4;
         }
-        ++x;
         if (Cmnts)
         {
             *Out << " // " << p->i << ": " << Str;
-            x = 99;
+            x = 999;
         }
-
-        
     }
     *Out << "\n};" << endl;
 
     // Output the top 8 bits of the node word endings count. Since node with >255 endings have
     // been placed at the begining, and ther are not too many of them the array is fairly small.
+    *Out << "#define NumLargeCounts " << FewEndStart << "\n";
     *Out << "static const unsigned char EndCountLge[" << FewEndStart << "] =\n{";
     OutputSize += FewEndStart * sizeof(unsigned char);
-    x=99;
+    x = 999;
     for(Index = 0; Index < FewEndStart; ++Index)
     {
         unsigned int v;
-        if (++x > 19)
+        x += 4;
+        if (x > LINE_OUT_LEN)
         {
             *Out << "\n    ";
             x=0;
@@ -1339,21 +1473,22 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
             *Out << ',';
             if (v < 100)   *Out << ' ';
             if (v < 10)    *Out << ' ';
-        }      
+        }
     }
     *Out << "\n};\n";
-    
+
     // Output all the word ending counts. For the first few nodes this is just the lower 8 bits of
     // the value. For the rest each entry contains the whole count. The split between lower and
     // upper halves of the value for the first few nodes allows bytes arrays to be used, so saving
     // memory.
     *Out << "static const unsigned char EndCountSml[" << NodeEnds.size() << "] =\n{";
     OutputSize += NodeEnds.size() * sizeof(unsigned char);
-    x=99;
+    x = 999;
     for(Index = 0; Index < NodeEnds.size(); ++Index)
     {
         unsigned int v;
-        if (++x > 19)
+        x += 4;
+        if (x > LINE_OUT_LEN)
         {
             *Out << "\n    ";
             x=0;
@@ -1365,10 +1500,10 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
             *Out << ',';
             if (v < 100)   *Out << ' ';
             if (v < 10)    *Out << ' ';
-        }      
+        }
     }
     *Out << "\n};\n";
-    
+
     // Finally output the used characters.
     *Out << "static const char CharSet[" << CharSet.length()+1 << "] = \"";
     OutputSize += CharSet.length() * sizeof(char);
@@ -1389,6 +1524,7 @@ enum { OUT_C_CODE, OUT_BINARY, OUT_TESTER };
 int main(int argc, char *argv[])
 {
     int i;
+    int MaxRank = 999999999;
     int OutType = OUT_C_CODE;
     bool Verbose = false;
     bool Comments = false;
@@ -1398,7 +1534,7 @@ int main(int argc, char *argv[])
     FileInfo InInfo[10];
     int NumFiles = 0;
     MinLength = 999;
-    
+
     try
     {
         for(i = 1; i < argc; ++i)
@@ -1436,6 +1572,18 @@ int main(int argc, char *argv[])
                     HashFile = argv[i];
                 continue;
             }
+            if (FileName == "-r")
+            {
+                // Ignore words with too high rank
+                if (++i < argc)
+                {
+                    char *p=0;
+                    MaxRank = strtol(argv[i], &p, 0);
+                    if ((MaxRank < 1000) || *p)
+                        MaxRank = 999999999;
+                    continue;
+                }
+            }
             if (FileName == "-v")
             {
                 Verbose = true;
@@ -1448,6 +1596,7 @@ int main(int argc, char *argv[])
                         "     -b        Generate a binary output file\n"
                         "     -t        Generate a test file for testing zxcvbn\n"
                         "     -c        Add comments to output file if C code mode\n"
+                        "     -r number Ignore words with rank greater than number (must be >=1000)\n"
                         "     -v        Additional information output\n"
                         "     -h Hfile  Write file checksum to file Hfile as C code (for -b mode)\n"
                         "     -o Ofile  Write output to file Ofile\n"
@@ -1457,10 +1606,11 @@ int main(int argc, char *argv[])
                         << endl;
                 return 1;
             }
-            ReadInputFile(Entries, FileName, i, InInfo[NumFiles]);
+            ReadInputFile(FileName, InInfo[NumFiles], MaxRank);
             if (NumFiles < int(sizeof InInfo / sizeof InInfo[0] - 1))
                 ++NumFiles;
         }
+        CombineWordLists(Entries, InInfo, NumFiles);
         if (Verbose)
         {
             if (!OutFile && (OutType == OUT_C_CODE))
@@ -1471,19 +1621,19 @@ int main(int argc, char *argv[])
                 cout << "Read input file " << Fi->Name << endl;
                 cout << "   Input words  " << Fi->Words << endl;
                 cout << "   Used words   " << Fi->Used << endl;
-                cout << /*"        Unused  " << Fi->BruteIgnore <<
+                cout << "        Unused  " << Fi->BruteIgnore <<
                         " Bruteforce compare, " << Fi->Accented <<
-                        " Accented char, " << Fi->Dups << " Duplicates" << */ endl;
+                        " Accented char, " << Fi->Dups << " Duplicates" << endl;
             }
         }
         bool InputCharSet[256];
         NodeSPtr Root(new Node);
         // Initially charset of used chracters is empty
         memset(InputCharSet, 0, sizeof InputCharSet);
-        
+
         // Add words to the trie with root in Root
         ProcessEntries(Root, Entries, InputCharSet);
-        
+
         // Get some interesting info
         int NumEnds = Root->CalcEndings();
         int Hi = Root->CalcHeight();
@@ -1495,7 +1645,6 @@ int main(int argc, char *argv[])
             cout << "Num input chars = " << NumChars << endl;
             cout << "Num input words = " << NumInWords << endl;
             cout << "Duplicate words = " << NumDuplicate;
-            cout << "  (Rank most reduced for \"" << MaxOccurStr.c_str() << "\")"<< endl;
             cout << "Number of Ends  = " << NumEnds << endl;
             cout << "Number of Nodes = " << NumNodes << endl;
             cout << "Trie height = " << Hi << endl;
@@ -1523,20 +1672,20 @@ int main(int argc, char *argv[])
         int CheckEnds = CheckReduction(Ranks, Root, Entries);
         if (Verbose)
             cout << "Number of Words = " << CheckEnds << endl;
-       
+
         ChkNum Tst = CheckEntries(Root, string(), Entries);
         if (Verbose)
         {
             cout << "2nd check - Number of valid words = " << Tst.Match << endl;
             cout << "          Number of invalid words = " << Tst.Err << endl;
         }
-        
+
         // Give up if there was an error
         if (Tst.Err)
             throw "Checks show invalid words after reduction";
         if ((Tst.Match != InputOrder) || (ReduceEnds != InputOrder))
             throw "Word count changed after reduce";
-        
+
         // Output more info
         StringIntSet_t ChildBits;
         string CharSet = MakeCharSet(InputCharSet);
@@ -1549,11 +1698,11 @@ int main(int argc, char *argv[])
         MakeChildBitMap(ChildBits, Root, i);
         if (Verbose)
             cout << "Number of child bitmaps = " << ChildBits.size() << endl;
-        
+
         // Get final node address
         Root->CalcAddress();
 
-        UintVect NodeData;
+        Uint64Vect NodeData;
         UintVect NodeEnds;
         StringOfInts ChildAddrs;
 
@@ -1579,7 +1728,7 @@ int main(int argc, char *argv[])
         }
         if (!OutFile && (OutType == OUT_C_CODE))
             cout << "*/\n";
- 
+
         if (OutType == OUT_BINARY)
             i = OutputBinary(Out, HashFile, CharSet, ChildBits, ChildAddrs, NodeData, NodeEnds, Ranks);
         else if (OutType == OUT_TESTER)