aboutsummaryrefslogtreecommitdiffhomepage
path: root/dict-generate.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'dict-generate.cpp')
-rw-r--r--dict-generate.cpp451
1 files changed, 300 insertions, 151 deletions
diff --git a/dict-generate.cpp b/dict-generate.cpp
index cd2fb05..410182d 100644
--- a/dict-generate.cpp
+++ b/dict-generate.cpp
@@ -27,9 +27,9 @@
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
- *
+ *
**********************************************************************************/
-
+
#include <iostream>
#include <string>
#include <fstream>
@@ -85,11 +85,11 @@ public:
void SetEnd() { mEnd = true; }
bool IsEnd() const { return mEnd; }
int Height() const { return mHeight; }
-
+
// Scan the trie and count nodes
int NodeCount() { ClearCounted() ; return CountNodes(); }
-
-
+
+
int CalcAddress() { int a=0; ClearCounted(); a=CalcAddr(a, true); return CalcAddr(a, false); }
Node *GetParent() { return mParent; }
unsigned int GetAddr() const { return mAddr; }
@@ -224,7 +224,6 @@ int Node::CalcEndings()
mEndings = n;
}
return mEndings;
-
}
/**********************************************************************************
@@ -363,7 +362,6 @@ std::string Node::GetChildChars()
return Result;
}
-
/**********************************************************************************
* struct to hold data read from input file (except for the word string)
*/
@@ -392,33 +390,35 @@ struct StringInt
StringInt * Self() const { return const_cast<StringInt *>(this); }
};
-typedef std::map<std::string, Entry> EntryMap_t;
-typedef std::list<NodeSPtr> NodeList_t;
+typedef map<string, Entry> EntryMap_t;
+typedef list<string> StringList_t;
+typedef list<NodeSPtr> NodeList_t;
typedef set<StringInt> StringIntSet_t;
typedef basic_string<int> StringOfInts;
typedef vector<unsigned int> UintVect;
+typedef vector<uint64_t> Uint64Vect;
typedef vector<StringInt *> StrIntPtrVect_t;
typedef vector<StringInt> StringIntVect_t;
// Variables holding 'interesting' information on the data
unsigned int MaxLength, MinLength, NumChars, NumInWords, NumDuplicate;
-int MaxOccurReduce;
-string MaxOccurStr;
struct FileInfo
{
- FileInfo() : Words(0), BruteIgnore(0), Accented(0), Dups(0), Used(0) { }
+ FileInfo() : Words(0), BruteIgnore(0), Accented(0), Dups(0), Used(0), Rank(0) { }
string Name;
+ StringList_t Pwds;
int Words;
int BruteIgnore;
int Accented;
int Dups;
int Used;
+ int Rank;
};
/**********************************************************************************
- * Read the file of words and add them to Entries.
+ * Read the file of words and add them to the file information.
*/
-static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int DictNum, FileInfo &Info)
+static bool ReadInputFile(const string & FileName, FileInfo &Info, int MaxRank)
{
ifstream f(FileName.c_str());
if (!f.is_open())
@@ -427,12 +427,12 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
return false;
}
Info.Name = FileName;
-
+
// Rank is the position of the work in the dictionary file. Rank==1 is lowest for a word (and
// indicates a very popular or bad password).
int Rank = 0;
string Line;
- while(getline(f, Line))
+ while(getline(f, Line) && (Rank < MaxRank))
{
// Truncate at first space or tab to leave just the word in case additional info on line
string::size_type y = Line.find_first_of("\t ");
@@ -442,7 +442,7 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
y = Line.length();
if (!y)
continue;
-
+
++Info.Words;
// Only use words where all chars are ascii (no accents etc.)
string::size_type x;
@@ -461,7 +461,7 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
++Info.Accented;
continue;
}
-
+
// Don't use words where the brute force strength is less than the word's rank
if (BruteForce < (Rank+1))
{
@@ -474,43 +474,51 @@ static bool ReadInputFile(EntryMap_t & Entries, const string & FileName, int Dic
if (y < MinLength)
MinLength = y;
NumChars += y;
- ++NumInWords;
+
+ Info.Pwds.push_back(Line);
++Rank;
+ }
+ f.close();
+ return true;
+}
- EntryMap_t::iterator It = Entries.find(Line);
- if (It != Entries.end())
+static void CombineWordLists(EntryMap_t & Entries, FileInfo *Infos, int NumInfo)
+{
+ bool Done = false;
+ int Rank = 0;
+ while(!Done)
+ {
+ int i;
+ ++Rank;
+ Done = true;
+ for(i = 0; i < NumInfo; ++i)
{
- // This is a repeat of a previous entry
- int r = It->second.mRank;
- if (r > Rank)
+ FileInfo *p = Infos + i;
+ while(!p->Pwds.empty())
{
- // Remember new lower rank
- It->second.mRank = Rank;
- It->second.mDict = DictNum;
- It->second.mOccurs += 1;
- r -= Rank;
- if (r > MaxOccurReduce)
+ Done = false;
+ string Word = p->Pwds.front();
+ p->Pwds.pop_front();
+ EntryMap_t::iterator It = Entries.find(Word);
+ if (It != Entries.end())
+ {
+ // Word is repeat of one from another file
+ p->Dups += 1;
+ ++NumDuplicate;
+ }
+ else
{
- MaxOccurStr = Line;
- MaxOccurReduce = r;
+ // New word, add it
+ Entry e;
+ e.mDict = i;
+ e.mRank = Rank;
+ Entries.insert(std::pair<std::string, Entry>(Word, e));
+ p->Used += 1;
+ break;
}
}
- else
- ++Info.Dups;
- ++NumDuplicate;
- }
- else
- {
- // New word
- Entry e;
- e.mDict = DictNum;
- e.mRank = Rank;
- Entries.insert(std::pair<std::string, Entry>(Line, e));
- ++Info.Used;
}
}
- f.close();
- return true;
}
/**********************************************************************************
@@ -535,7 +543,7 @@ static void ProcessEntries(NodeSPtr Root, EntryMap_t & Entries, bool *InputCharS
InputCharSet[c & 0xFF] = true;
}
pNode->SetEnd();
- }
+ }
}
/**********************************************************************************
@@ -599,7 +607,7 @@ static void ReduceTrie(NodeSPtr Root)
int Height;
int cnt=0, del=0;
Root->CalcCheck();
-
+
NodeSPtr pNode = Root;
for(Height = Root->CalcHeight(); Height >= 0; --Height)
{
@@ -796,7 +804,7 @@ string MakeCharSet(bool *InputCharSet)
/**********************************************************************************
* Create a set of strings which contain the possible characters matched at
- * a node when checking a word.
+ * a node when checking a word.
*/
void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
{
@@ -825,15 +833,27 @@ void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
Root->SetCounted();
}
+// Constants defining bit positions of node data
+// Number of bits to represent the index of the child char pattern in the final child bitmap array,
+const int BITS_CHILD_PATT_INDEX = 14;
+
+// Number of bits to represent index of where the child pointers start for this node in
+// the Child map array and its bit position
+const int BITS_CHILD_MAP_INDEX = 18;
+const int SHIFT_CHILD_MAP_INDEX = BITS_CHILD_PATT_INDEX;
+
+// Bit positions of word ending indicator and indicator for number of word endings for this + child nodes is >= 256
+const int SHIFT_WORD_ENDING_BIT = SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX;
+const int SHIFT_LARGE_ENDING_BIT = SHIFT_WORD_ENDING_BIT + 1;
/**********************************************************************************
* Create the arrays of data that will be output
*/
-void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds)
+void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds)
{
NodeMap_t::iterator Itc;
StringInt Tmp;
StringOfInts Chld;
-
+
// Find children in the child pattern array
Tmp.s= Root->GetChildChars();
StringIntSet_t::iterator Its = StrSet.find(Tmp);
@@ -853,17 +873,27 @@ void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAd
ChildAddrs += Chld;
}
// Val will contain the final node data
- // Bits 12:0 Index of the child char pattern in the final child bitmap array
- // Bits 29:13 Index of where the child pointers start for this node in the Child map array
- // Bit 30 Set if this nod is a word ending
- // Bit 31 Set if the number of word endings for this + child nodes is >= 256
- unsigned int Val = Its->i;
- Val |= x << 13;
+ uint64_t Val = Its->i;
+ if (Val >= (1 << BITS_CHILD_PATT_INDEX))
+ {
+ char Tmp[20];
+ snprintf(Tmp, sizeof Tmp, "%u", Its->i);
+ throw string("Not enough bits for child pattern index value of ") + Tmp + " for " +
+ Its->s + " (BITS_CHILD_PATT_INDEX too small)";
+ }
+ if (x >= (1 << BITS_CHILD_MAP_INDEX))
+ {
+ char Tmp[20];
+ snprintf(Tmp, sizeof Tmp, "%lu", x);
+ throw string("Not enough bits for child map index value of ") + Tmp + " for " +
+ Its->s + " (BITS_CHILD_MAP_INDEX too small)";
+ }
+ Val |= x << SHIFT_CHILD_MAP_INDEX;
if (Root->IsEnd())
- Val |= 1<<30;
+ Val |= uint64_t(1) << SHIFT_WORD_ENDING_BIT;
if (Root->GetNumEnds() >= 256)
- Val |= 1<<31;
-
+ Val |= uint64_t(1) << SHIFT_LARGE_ENDING_BIT;
+
// Make sure output arrays are big enough
if (Root->GetAddr() > NodeData.size())
{
@@ -884,8 +914,8 @@ void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAd
/**********************************************************************************
* Output the data as a binary file.
*/
-static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root,
- StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
+static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root,
+ StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
{
int OutputSize;
unsigned int FewEndStart = 2000000000;
@@ -896,26 +926,27 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
for(Index = 0; Index < NodeData.size(); ++Index)
{
- i = NodeData[Index];
- if ((FewEndStart >= 2000000000) && !(i & (1<<31)))
+ uint64_t v = NodeData[Index];
+ if ((FewEndStart >= 2000000000) && !(v & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
{
FewEndStart = Index;
break;
}
}
// Header words
+ unsigned int NumWordEnd;
const unsigned int MAGIC = 'z' + ('x'<< 8) + ('c' << 16) + ('v' << 24);
Out->write((char *)&MAGIC, sizeof MAGIC); // Write magic
h(&MAGIC, sizeof MAGIC);
OutputSize = sizeof MAGIC;
-
+
i = NodeData.size();
Out->write((char *)&i, sizeof i); // Write number of nodes
h(&i, sizeof i);
OutputSize += sizeof i;
-
+
i = ChildAddrs.size();
- if (NodeData.size() > numeric_limits<unsigned short>::max())
+ if (NodeData.size() > numeric_limits<unsigned int>::max())
i |= 1<<31;
Out->write((char *)&i, sizeof i); // Write number of child location entries & size of each entry
h(&i, sizeof i);
@@ -925,7 +956,12 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
Out->write((char *)&i, sizeof i); // Write number of ranks
h(&i, sizeof i);
OutputSize += sizeof i;
-
+
+ NumWordEnd = (NodeData.size() + 7) / 8;
+ Out->write((char *)&NumWordEnd, sizeof NumWordEnd); // Write number of word endings
+ h(&NumWordEnd, sizeof NumWordEnd);
+ OutputSize += sizeof NumWordEnd;
+
i = StrSet.size();
Out->write((char *)&i, sizeof i); // Write size of of child bitmap data
h(&i, sizeof i);
@@ -951,48 +987,71 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
OutputSize += sizeof i;
// Output array of node data
+ unsigned char *WordEnds = new unsigned char[NumWordEnd];
+ unsigned char v = 0;
+ unsigned int z = 0;
+ int y = 0;
for(Index = 0; Index < NodeData.size(); ++Index)
{
i = NodeData[Index];
Out->write((char *)&i, sizeof i);
h(&i, sizeof i);
- }
- OutputSize += Index * sizeof i;
- // Output array of node pointers
- if (NodeData.size() > numeric_limits<unsigned short>::max())
- {
- for(Index = 0; Index < ChildAddrs.size(); ++Index)
+ if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
+ v |= 1 << y;
+ if (++y >= 8)
{
- i = ChildAddrs[Index];
- Out->write((char *)&i, sizeof i);
- h(&i, sizeof i);
+ WordEnds[z++] = v;
+ y = 0;
+ v = 0;
}
- OutputSize += Index * sizeof i;
}
- else
+ while(z < NumWordEnd)
{
- for(Index = 0; Index < ChildAddrs.size(); ++Index)
- {
- u = ChildAddrs[Index];
- Out->write((char *)&u, sizeof u);
- h(&u, sizeof u);
- }
- OutputSize += Index * sizeof u;
+ WordEnds[z++] = v;
+ v = 0;
}
+ OutputSize += Index * sizeof i;
+
+ // Output array of node pointers
+ for(Index = 0; Index < ChildAddrs.size(); ++Index)
+ {
+ i = ChildAddrs[Index];
+ Out->write((char *)&i, sizeof i);
+ h(&i, sizeof i);
+ }
+ OutputSize += Index * sizeof i;
+
// Output ranks
for(Index = 0; Index < Ranks.size(); ++Index)
{
- u = Ranks[Index].i;
+ i = Ranks[Index].i;
+ if (i >= (1 << 15))
+ {
+ i -= 1 << 15;
+ i /= 4;
+ if (i >= (1 << 15))
+ i = (1 << 15) - 1;
+ i |= 1 << 15;
+ }
+ if (i > numeric_limits<unsigned short>::max())
+ i = numeric_limits<unsigned short>::max();
+ u = i;
Out->write((char *)&u, sizeof u);
h(&u, sizeof u);
- }
+ }
OutputSize += Index * sizeof u;
+ // Output word end bit markers
+ Out->write((char *)WordEnds, NumWordEnd);
+ h(WordEnds, NumWordEnd);
+ OutputSize += NumWordEnd;
+ delete WordEnds;
+
StringIntSet_t::iterator Its;
string Str;
unsigned char Buf[8];
-
+
// Get the items from StrSet ordered by the index
StrIntPtrVect_t SetPtrs;
SetPtrs.resize(StrSet.size());
@@ -1000,10 +1059,7 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
{
StringInt *p = Its->Self();
if (p->i >= StrSet.size())
- {
- cout << "p->i=" << p->i << " " << p->s.c_str() << endl;
throw "Bad index";
- }
SetPtrs[p->i] = p;
}
// Output child bitmap
@@ -1026,7 +1082,7 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
h(Buf, BytePerEntry);
}
OutputSize += Index * BytePerEntry;
-
+
unsigned char c;
for(Index = 0; Index < FewEndStart; ++Index)
{
@@ -1039,9 +1095,10 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
h(&c, 1);
}
OutputSize += Index * sizeof c;
+
for(Index = 0; Index < NodeEnds.size(); ++Index)
{
- c = NodeEnds[Index];;
+ c = NodeEnds[Index];
Out->write((char *)&c, 1);
h(&c, 1);
}
@@ -1050,7 +1107,7 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
Out->write(CharSet.c_str(), CharSet.length());
h(CharSet.c_str(), CharSet.length());
OutputSize += CharSet.length();
-
+
if (!ChkFile.empty())
{
// Write the checksum file
@@ -1066,7 +1123,11 @@ static int OutputBinary(ostream *Out, const string & ChkFile, const string & Cha
}
f << "\n};\n";
f << "#define WORD_FILE_SIZE " << OutputSize << endl;
- f << "#define ROOT_NODE_LOC 0" << endl;
+ f << "#define ROOT_NODE_LOC 0\n"
+ "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
+ "#define BITS_CHILD_MAP_INDEX " << BITS_CHILD_MAP_INDEX << "\n"
+ "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
+ "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)" << endl;
f.close();
}
return OutputSize;
@@ -1083,53 +1144,60 @@ int OutputTester(ostream *Out, bool /*Cmnts*/, StringIntVect_t & Ranks)
string::size_type x = Pwd.find(':');
if (x != string::npos)
Pwd.erase(0, x+1);
-
+
*Out << Pwd.c_str() << " ";
for(x = Pwd.length(); x < 16; ++x)
*Out << ' ';
- *Out << log(v*1.0) / log(2.0) << '\n';
+ *Out << log(v*1.0) / log(2.0) << " " << v << '\n';
}
return Index;
}
+const int LINE_OUT_LEN = 160;
/**********************************************************************************
* Output the data as C source.
*/
int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t & StrSet, NodeSPtr & Root,
- StringOfInts & ChildAddrs, UintVect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
+ StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
{
unsigned int Index;
int OutputSize;
if (Cmnts)
- *Out << "#define ND(e,c,b) (e<<30)|(c<<13)|b\n";
-
+ *Out << "#define ND(e,c,b) (c<<" << SHIFT_CHILD_MAP_INDEX << ")|b\n";
+
// Output array of node data
- *Out << "#define ROOT_NODE_LOC 0" << endl;
- *Out << "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
+ *Out << "#define ROOT_NODE_LOC 0\n"
+ "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
+ "#define BITS_CHILD_MAP_INDEX " << BITS_CHILD_MAP_INDEX << "\n"
+ "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
+ "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)\n"
+ "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
OutputSize = NodeData.size() * sizeof(unsigned int);
- int x = 99;
+ int x = 999;
unsigned int FewEndStart = 2000000000;
for(Index = 0; Index < NodeData.size(); ++Index)
{
- unsigned int v;
- if (++x >= 8)
+ uint64_t v;
+ x += 11;
+ if (x > LINE_OUT_LEN)
{
*Out << "\n ";
x=0;
}
v = NodeData[Index];
+ v &= (uint64_t(1) << SHIFT_WORD_ENDING_BIT) - 1;
if (Cmnts)
{
- unsigned int i;
- i = (v >> 30) & 3;
+ uint64_t i;
+ i = (v >> SHIFT_WORD_ENDING_BIT) & 3;
*Out << "ND(" << i << ',';
- i= (v >> 13) & ((1<<17)-1);
+ i= (v >> SHIFT_CHILD_MAP_INDEX) & ((1<<BITS_CHILD_MAP_INDEX)-1);
*Out << i << ',';
if (i < 10000) *Out << ' ';
if (i < 1000) *Out << ' ';
if (i < 100) *Out << ' ';
if (i < 10) *Out << ' ';
- i = v & ((1<<13)-1);
+ i = v & ((1<<BITS_CHILD_PATT_INDEX)-1);
*Out << i << ")";
if (Index < (NodeData.size()-1))
{
@@ -1140,7 +1208,7 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
}
}
else
- {
+ {
*Out << v;
if (Index < (NodeData.size()-1))
{
@@ -1156,17 +1224,52 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
if (v < 10) *Out << ' ';
}
}
- if ((FewEndStart >= 2000000000) && !(v & (1<<31)))
+ if ((FewEndStart >= 2000000000) && !(NodeData[Index] & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
FewEndStart = Index;
}
*Out << "\n};\n";
-
+ unsigned int Len = ((NodeData.size() + 7) / 8);
+ OutputSize += Len;
+ x = 999;
+ *Out << "static unsigned char WordEndBits[" << Len << "] =\n{";
+ Index = 0;
+ unsigned int v = 0;
+ unsigned int y = 0;
+ unsigned int z = 0;
+ while(z < Len)
+ {
+ if (Index < NodeData.size())
+ {
+ if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
+ v |= 1 << y;
+ }
+ if (++y >= 8)
+ {
+ x += 4;
+ if (x > LINE_OUT_LEN)
+ {
+ *Out << "\n ";
+ x = 0;
+ }
+ *Out << v;
+ if (++z < Len)
+ {
+ *Out << ',';
+ if (v < 100) *Out << ' ';
+ if (v < 10) *Out << ' ';
+ }
+ y = 0;
+ v = 0;
+ }
+ ++Index;
+ }
+ *Out << "\n};\n";
// Output array of node pointers
*Out << "static const unsigned ";
if (NodeData.size() > numeric_limits<unsigned short>::max())
{
*Out << "int";
- x = sizeof(unsigned int);
+ x = sizeof(unsigned int);
}
else
{
@@ -1175,11 +1278,12 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
}
*Out << " ChildLocs[" << ChildAddrs.size() << "] =\n{";
OutputSize += x * ChildAddrs.size();
- x = 99;
+ x = 999;
for(Index = 0; Index < ChildAddrs.size(); ++Index)
{
int v;
- if (++x > 19)
+ x += 6;
+ if (x > LINE_OUT_LEN)
{
*Out << "\n ";
x=0;
@@ -1194,14 +1298,14 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
if (v < 100) *Out << ' ';
if (v < 10) *Out << ' ';
}
-
}
*Out << "\n};\n";
// Output the rank of the words
*Out << "static const unsigned short Ranks[" << Ranks.size() << "] =\n{";
OutputSize += Ranks.size() * sizeof(unsigned short);
- x=99;
+ x = 999;
+ bool TooBig = false;
if (Cmnts)
{
*Out << "\n";
@@ -1210,6 +1314,19 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
unsigned int v;
*Out << " ";
v = Ranks[Index].i;
+ if (v >= (1 << 15))
+ {
+ v -= 1 << 15;
+ v /= 4;
+ if (v >= (1 << 15))
+ {
+ TooBig = true;
+ v = (1 << 15) - 1;
+ }
+ v |= 1 << 15;
+ }
+ if (v > numeric_limits<unsigned short>::max())
+ v = numeric_limits<unsigned short>::max();
*Out << v;
if (Index < (Ranks.size()-1))
{
@@ -1227,12 +1344,26 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
for(Index = 0; Index < Ranks.size(); ++Index)
{
unsigned int v;
- if (++x > 19)
+ x += 6;
+ if (x > LINE_OUT_LEN)
{
*Out << "\n ";
x=0;
}
v = Ranks[Index].i;
+ if (v >= (1 << 15))
+ {
+ v -= 1 << 15;
+ v /= 4;
+ if (v >= (1<<15))
+ {
+ TooBig = true;
+ v = (1 << 15) - 1;
+ }
+ v |= 1 << 15;
+ }
+ if (v > numeric_limits<unsigned short>::max())
+ v = numeric_limits<unsigned short>::max();
*Out << v;
if (Index < (Ranks.size()-1))
{
@@ -1245,16 +1376,20 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
}
}
*Out << "\n};\n";
-
+ if (TooBig)
+ {
+ unsigned int v = ((1<<15) - 1) * 4 + (1<<15);
+ cout << "// Word ranks too large, value restricted to " << v << endl;
+ }
unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
*Out << "#define SizeChildMapEntry " << BytePerEntry << '\n';
*Out << "static const unsigned char ChildMap[" << StrSet.size() << '*' << BytePerEntry << "] =\n{";
OutputSize += StrSet.size() * BytePerEntry * sizeof(unsigned char);
-
+
StringIntSet_t::iterator Its;
string Str;
unsigned char Buf[8];
-
+
// Get the items from StrSet ordered by the index
StrIntPtrVect_t SetPtrs;
SetPtrs.resize(StrSet.size());
@@ -1268,17 +1403,16 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
}
SetPtrs[p->i] = p;
}
-
- x=99;
+ x = 999;
for(Index = 0; Index < SetPtrs.size(); ++Index)
{
string::size_type z, y;
StringInt *p;
memset(Buf, 0, sizeof Buf);
- if (x > 4)
+ if (x > LINE_OUT_LEN)
{
*Out << "\n ";
- x = 0;
+ x = 4*BytePerEntry;
}
p = SetPtrs[Index];
Str = p->s;
@@ -1305,27 +1439,27 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
*Out << ' ';
if (y < 10)
*Out << ' ';
+ x += 4;
}
- ++x;
if (Cmnts)
{
*Out << " // " << p->i << ": " << Str;
- x = 99;
+ x = 999;
}
-
-
}
*Out << "\n};" << endl;
// Output the top 8 bits of the node word endings count. Since node with >255 endings have
// been placed at the begining, and ther are not too many of them the array is fairly small.
+ *Out << "#define NumLargeCounts " << FewEndStart << "\n";
*Out << "static const unsigned char EndCountLge[" << FewEndStart << "] =\n{";
OutputSize += FewEndStart * sizeof(unsigned char);
- x=99;
+ x = 999;
for(Index = 0; Index < FewEndStart; ++Index)
{
unsigned int v;
- if (++x > 19)
+ x += 4;
+ if (x > LINE_OUT_LEN)
{
*Out << "\n ";
x=0;
@@ -1339,21 +1473,22 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
*Out << ',';
if (v < 100) *Out << ' ';
if (v < 10) *Out << ' ';
- }
+ }
}
*Out << "\n};\n";
-
+
// Output all the word ending counts. For the first few nodes this is just the lower 8 bits of
// the value. For the rest each entry contains the whole count. The split between lower and
// upper halves of the value for the first few nodes allows bytes arrays to be used, so saving
// memory.
*Out << "static const unsigned char EndCountSml[" << NodeEnds.size() << "] =\n{";
OutputSize += NodeEnds.size() * sizeof(unsigned char);
- x=99;
+ x = 999;
for(Index = 0; Index < NodeEnds.size(); ++Index)
{
unsigned int v;
- if (++x > 19)
+ x += 4;
+ if (x > LINE_OUT_LEN)
{
*Out << "\n ";
x=0;
@@ -1365,10 +1500,10 @@ int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t
*Out << ',';
if (v < 100) *Out << ' ';
if (v < 10) *Out << ' ';
- }
+ }
}
*Out << "\n};\n";
-
+
// Finally output the used characters.
*Out << "static const char CharSet[" << CharSet.length()+1 << "] = \"";
OutputSize += CharSet.length() * sizeof(char);
@@ -1389,6 +1524,7 @@ enum { OUT_C_CODE, OUT_BINARY, OUT_TESTER };
int main(int argc, char *argv[])
{
int i;
+ int MaxRank = 999999999;
int OutType = OUT_C_CODE;
bool Verbose = false;
bool Comments = false;
@@ -1398,7 +1534,7 @@ int main(int argc, char *argv[])
FileInfo InInfo[10];
int NumFiles = 0;
MinLength = 999;
-
+
try
{
for(i = 1; i < argc; ++i)
@@ -1436,6 +1572,18 @@ int main(int argc, char *argv[])
HashFile = argv[i];
continue;
}
+ if (FileName == "-r")
+ {
+ // Ignore words with too high rank
+ if (++i < argc)
+ {
+ char *p=0;
+ MaxRank = strtol(argv[i], &p, 0);
+ if ((MaxRank < 1000) || *p)
+ MaxRank = 999999999;
+ continue;
+ }
+ }
if (FileName == "-v")
{
Verbose = true;
@@ -1448,6 +1596,7 @@ int main(int argc, char *argv[])
" -b Generate a binary output file\n"
" -t Generate a test file for testing zxcvbn\n"
" -c Add comments to output file if C code mode\n"
+ " -r number Ignore words with rank greater than number (must be >=1000)\n"
" -v Additional information output\n"
" -h Hfile Write file checksum to file Hfile as C code (for -b mode)\n"
" -o Ofile Write output to file Ofile\n"
@@ -1457,10 +1606,11 @@ int main(int argc, char *argv[])
<< endl;
return 1;
}
- ReadInputFile(Entries, FileName, i, InInfo[NumFiles]);
+ ReadInputFile(FileName, InInfo[NumFiles], MaxRank);
if (NumFiles < int(sizeof InInfo / sizeof InInfo[0] - 1))
++NumFiles;
}
+ CombineWordLists(Entries, InInfo, NumFiles);
if (Verbose)
{
if (!OutFile && (OutType == OUT_C_CODE))
@@ -1471,19 +1621,19 @@ int main(int argc, char *argv[])
cout << "Read input file " << Fi->Name << endl;
cout << " Input words " << Fi->Words << endl;
cout << " Used words " << Fi->Used << endl;
- cout << /*" Unused " << Fi->BruteIgnore <<
+ cout << " Unused " << Fi->BruteIgnore <<
" Bruteforce compare, " << Fi->Accented <<
- " Accented char, " << Fi->Dups << " Duplicates" << */ endl;
+ " Accented char, " << Fi->Dups << " Duplicates" << endl;
}
}
bool InputCharSet[256];
NodeSPtr Root(new Node);
// Initially charset of used chracters is empty
memset(InputCharSet, 0, sizeof InputCharSet);
-
+
// Add words to the trie with root in Root
ProcessEntries(Root, Entries, InputCharSet);
-
+
// Get some interesting info
int NumEnds = Root->CalcEndings();
int Hi = Root->CalcHeight();
@@ -1495,7 +1645,6 @@ int main(int argc, char *argv[])
cout << "Num input chars = " << NumChars << endl;
cout << "Num input words = " << NumInWords << endl;
cout << "Duplicate words = " << NumDuplicate;
- cout << " (Rank most reduced for \"" << MaxOccurStr.c_str() << "\")"<< endl;
cout << "Number of Ends = " << NumEnds << endl;
cout << "Number of Nodes = " << NumNodes << endl;
cout << "Trie height = " << Hi << endl;
@@ -1523,20 +1672,20 @@ int main(int argc, char *argv[])
int CheckEnds = CheckReduction(Ranks, Root, Entries);
if (Verbose)
cout << "Number of Words = " << CheckEnds << endl;
-
+
ChkNum Tst = CheckEntries(Root, string(), Entries);
if (Verbose)
{
cout << "2nd check - Number of valid words = " << Tst.Match << endl;
cout << " Number of invalid words = " << Tst.Err << endl;
}
-
+
// Give up if there was an error
if (Tst.Err)
throw "Checks show invalid words after reduction";
if ((Tst.Match != InputOrder) || (ReduceEnds != InputOrder))
throw "Word count changed after reduce";
-
+
// Output more info
StringIntSet_t ChildBits;
string CharSet = MakeCharSet(InputCharSet);
@@ -1549,11 +1698,11 @@ int main(int argc, char *argv[])
MakeChildBitMap(ChildBits, Root, i);
if (Verbose)
cout << "Number of child bitmaps = " << ChildBits.size() << endl;
-
+
// Get final node address
Root->CalcAddress();
- UintVect NodeData;
+ Uint64Vect NodeData;
UintVect NodeEnds;
StringOfInts ChildAddrs;
@@ -1579,7 +1728,7 @@ int main(int argc, char *argv[])
}
if (!OutFile && (OutType == OUT_C_CODE))
cout << "*/\n";
-
+
if (OutType == OUT_BINARY)
i = OutputBinary(Out, HashFile, CharSet, ChildBits, ChildAddrs, NodeData, NodeEnds, Ranks);
else if (OutType == OUT_TESTER)