00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "qdawg.h"
00021 #include <qintdict.h>
00022 #include <qvaluelist.h>
00023 #include <qtextstream.h>
00024 #include <qfile.h>
00025 #include <qtl.h>
00026
00027 #include <limits.h>
00028 #include <stdio.h>
00029
00030
00031 #include <sys/types.h>
00032 #include <sys/stat.h>
00033 #include <sys/mman.h>
00034 #include <fcntl.h>
00035 #include <errno.h>
00036 #include <unistd.h>
00037
00038 class QDawgPrivate;
00039 class QTrie;
00040
00041 typedef QValueList<QTrie*> TrieClub;
00042 typedef QIntDict<TrieClub> TrieClubDirectory;
00043
00044 class TriePtr {
00045 public:
00046 QChar letter;
00047 QTrie* p;
00048 int operator <(const TriePtr& o) const;
00049 int operator >(const TriePtr& o) const;
00050 int operator <=(const TriePtr& o) const;
00051 };
00052
00053 class TrieList : public QValueList<TriePtr> {
00054 bool sorted;
00055 public:
00056 TrieList()
00057 {
00058 sorted=TRUE;
00059 }
00060
00061 QTrie* findAdd(QChar c);
00062 bool equal(TrieList& l);
00063
00064 void sort()
00065 {
00066 if ( !sorted ) {
00067 qHeapSort(*this);
00068 sorted = TRUE;
00069 }
00070 }
00071 };
00072
00073
00074 class QTrie {
00075 public:
00076 QTrie();
00077 ~QTrie();
00078
00079 void insertWord(const QString& s, uint index=0);
00080 bool equal(QTrie* o);
00081 void dump(int indent=0);
00082
00083 private:
00084 TrieList children;
00085 bool isword;
00086
00087 friend class QDawgPrivate;
00088 int maxdepth;
00089 int decendants;
00090 int key;
00091 void distributeKeys(TrieClubDirectory& directory);
00092 QTrie* clubLeader(TrieClubDirectory& directory);
00093 int collectKeys();
00094 friend class TriePtr;
00095 friend class TrieList;
00096 };
00097
00098 QTrie::QTrie()
00099 {
00100 key = 0;
00101 isword = FALSE;
00102 }
00103
00104 QTrie::~QTrie()
00105 {
00106
00107
00108 }
00109
00110 void QTrie::insertWord(const QString& s, uint index)
00111 {
00112 if ( index == s.length() ) {
00113 isword = TRUE;
00114 } else {
00115 QTrie* t = children.findAdd(s[index]);
00116 t->insertWord(s,index+1);
00117 }
00118 }
00119
00120 bool QTrie::equal(QTrie* o)
00121 {
00122 if ( o == this ) return TRUE;
00123 if ( isword != o->isword )
00124 return FALSE;
00125 return children.equal(o->children);
00126 }
00127
00128 void QTrie::dump(int indent)
00129 {
00130 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00131 QTrie* s = (*it).p;
00132 for (int in=0; in<indent; in++)
00133 fputc(' ',stderr);
00134 fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(),
00135 s->key,s->isword?"word":"",s);
00136 s->dump(indent+2);
00137 }
00138 }
00139
00140 void QTrie::distributeKeys(TrieClubDirectory& directory)
00141 {
00142 maxdepth = INT_MIN;
00143 decendants = children.count();
00144 key = 0;
00145 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00146 QTrie* s = (*it).p;
00147 QChar l = (*it).letter;
00148 s->distributeKeys(directory);
00149 key = key*64+l.unicode()+s->key*5;
00150 decendants += s->decendants;
00151 if ( s->maxdepth+1 > maxdepth )
00152 maxdepth = s->maxdepth+1;
00153 }
00154 if ( decendants ) {
00155 key += decendants + maxdepth*256 + children.count() * 65536;
00156 if ( !key ) key++;
00157 }
00158 TrieClub* c = directory[key];
00159 if ( !c ) directory.insert(key, (c = new TrieClub) );
00160 c->prepend(this);
00161 }
00162
00163 QTrie* QTrie::clubLeader(TrieClubDirectory& directory)
00164 {
00165 if ( !key ) return directory[0]->first();
00166 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00167 QTrie* t= (*it).p->clubLeader(directory);
00168 (*it).p = t;
00169 }
00170 TrieClub *club = directory[key];
00171 for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00172 QTrie* o = *it;
00173 if ( o->equal(this) )
00174 return o;
00175 }
00176 return this;
00177 }
00178
00179 int QTrie::collectKeys()
00180 {
00181 int n=0;
00182 if ( key ) key=0,n+=children.count();
00183 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it)
00184 n += (*it).p->collectKeys();
00185 return n;
00186 }
00187
00188 int TriePtr::operator <(const TriePtr& o) const
00189 { return letter < o.letter; }
00190 int TriePtr::operator >(const TriePtr& o) const
00191 { return letter > o.letter; }
00192 int TriePtr::operator <=(const TriePtr& o) const
00193 { return letter <= o.letter; }
00194
00195 bool TrieList::equal(TrieList& l)
00196 {
00197 if ( count() != l.count() )
00198 return FALSE;
00199 sort(); l.sort();
00200 ConstIterator it2 = begin();
00201 ConstIterator it = l.begin();
00202 for( ; it != l.end(); ++it, ++it2 )
00203 if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) )
00204 return FALSE;
00205 return TRUE;
00206 }
00207 QTrie* TrieList::findAdd(QChar c)
00208 {
00209 for (Iterator it=begin(); it!=end(); ++it) {
00210 if ( (*it).letter == c )
00211 return (*it).p;
00212 }
00213 TriePtr p;
00214 p.p = new QTrie;
00215 p.letter = c;
00216 prepend(p);
00217 sorted=FALSE;
00218 sort();
00219 return p.p;
00220 }
00221
// Eight-byte signature that precedes the node data of a serialized dawg;
// it is written by QDawgPrivate::write() and checked when reading.
static const char* dawg_sig = "QDAWG100";
00223
00224 class QDawgPrivate {
00225 public:
00226 QDawgPrivate(QIODevice* dev)
00227 {
00228 QDataStream ds(dev);
00229 char sig[8];
00230 ds.readRawBytes(sig,8);
00231 if ( !strncmp(dawg_sig,sig,8) ) {
00232 uint n;
00233 char* nn;
00234 ds.readBytes(nn,n);
00235
00236
00237 node = (QDawg::Node*)nn;
00238 nodes = n / sizeof(QDawg::Node);
00239 } else {
00240 node = 0;
00241 }
00242 }
00243
00244 bool ok() const { return node; }
00245
00246 QDawgPrivate(uchar* mem)
00247 {
00248 if ( !strncmp(dawg_sig,(char*)mem,8) ) {
00249 mem += 8;
00250
00251 int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3];
00252 mem += 4;
00253
00254
00255 node = (QDawg::Node*)((char*)mem);
00256 nodes = n / sizeof(QDawg::Node);
00257 }
00258 }
00259
00260 QDawgPrivate(QTrie* t)
00261 {
00262 TrieClubDirectory directory(9973);
00263 t->distributeKeys(directory);
00264 QTrie* l = t->clubLeader(directory);
00265 ASSERT(l==t);
00266 generateArray(t);
00267
00268 TrieClub *club;
00269 for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit)
00270 {
00271 for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00272 delete *it;
00273 }
00274 delete club;
00275 }
00276 }
00277
00278 bool write(QIODevice* dev)
00279 {
00280 QDataStream ds(dev);
00281 ds.writeRawBytes(dawg_sig,8);
00282
00283 ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes);
00284 return dev->state() == IO_Ok;
00285 }
00286
00287 void dumpWords(int nid=0, int index=0)
00288 {
00289 static char word[256];
00290 int i=0;
00291 do {
00292 QDawg::Node& n = node[nid+i];
00293 word[index] = n.let;
00294 if ( n.isword )
00295 fprintf(stderr,"%.*s\n",index+1,word);
00296 if ( n.offset ) dumpWords(n.offset+nid+i,index+1);
00297 } while (!node[nid+i++].islast);
00298 }
00299
00300 void dump(int nid=0, int indent=0)
00301 {
00302 int i=0;
00303 do {
00304 QDawg::Node& n = node[nid+i];
00305 fprintf(stderr,"%d: ",nid+i);
00306 for (int in=0; in<indent; in++)
00307 fputc(' ',stderr);
00308 fprintf(stderr," %c %d %d %d\n",n.let,
00309 n.isword,n.islast,n.offset);
00310 if ( n.offset ) dump(n.offset+nid+i,indent+2);
00311 } while (!node[nid+i++].islast);
00312 }
00313
00314 int countWords(int nid=0)
00315 {
00316 int t=0;
00317 int i=0;
00318 do {
00319 QDawg::Node& n = node[nid+i];
00320 if ( n.isword )
00321 t++;
00322 if ( n.offset )
00323 t+=countWords(n.offset+nid+i);
00324 } while (!node[nid+i++].islast);
00325 return t;
00326 }
00327
00328 bool contains(const QString& s, int nid=0, int index=0) const
00329 {
00330 int i=0;
00331 do {
00332 QDawg::Node& n = node[nid+i];
00333 if ( s[index] == QChar((ushort)n.let) ) {
00334 if ( n.isword && index == (int)s.length()-1 )
00335 return TRUE;
00336 if ( n.offset )
00337 return contains(s,n.offset+nid+i,index+1);
00338 }
00339 } while (!node[nid+i++].islast);
00340 return FALSE;
00341 }
00342
00343 void appendAllWords(QStringList& list, int nid=0, QString s="") const
00344 {
00345 int i=0;
00346 int next = s.length();
00347 do {
00348 QDawg::Node& n = node[nid+i];
00349 s[next] = QChar((ushort)n.let);
00350 if ( n.isword )
00351 list.append(s);
00352 if ( n.offset )
00353 appendAllWords(list, n.offset+nid+i, s);
00354 } while (!node[nid+i++].islast);
00355 }
00356
00357 const QDawg::Node* root() { return node; }
00358
00359 private:
00360 void generateArray(QTrie* t)
00361 {
00362 nodes = 0;
00363 int n = t->collectKeys();
00364 node = new QDawg::Node[n];
00365 appendToArray(t);
00366 ASSERT(n == nodes);
00367 }
00368
00369 int appendToArray(QTrie* t)
00370 {
00371 if ( !t->key ) {
00372 if ( !t->children.count() )
00373 return 0;
00374 t->key = nodes;
00375 nodes += t->children.count();
00376 QDawg::Node* n = &node[t->key-1];
00377 int here = t->key;
00378 for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) {
00379 QTrie* s = (*it).p;
00380 ++n;
00381 n->let = (*it).letter.unicode();
00382 n->isword = s->isword;
00383 n->islast = 0;
00384 n->offset = appendToArray(s);
00385 if ( n->offset ) {
00386 int t = n->offset-here;
00387 n->offset=t;
00388 if ( n->offset != t )
00389 qWarning("Overflow: too many words");
00390 }
00391 here++;
00392 }
00393 n->islast = 1;
00394 }
00395 return t->key;
00396 }
00397
00398 private:
00399 int nodes;
00400 QDawg::Node *node;
00401 };
00402
00445 QDawg::QDawg()
00446 {
00447 d = 0;
00448 }
00449
00453 QDawg::~QDawg()
00454 {
00455 delete d;
00456 }
00457
00462 bool QDawg::createFromWords(QIODevice* dev)
00463 {
00464 delete d;
00465
00466 QTextStream i(dev);
00467 QTrie* trie = new QTrie;
00468 int n=0;
00469 while (!i.atEnd()) {
00470 trie->insertWord(QString::fromUtf8(i.readLine()));
00471 n++;
00472 }
00473 if ( n )
00474 d = new QDawgPrivate(trie);
00475 else
00476 d = 0;
00477 return TRUE;
00478 }
00479
00483 void QDawg::createFromWords(const QStringList& list)
00484 {
00485 delete d;
00486
00487 if ( list.count() ) {
00488 QTrie* trie = new QTrie;
00489 for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) {
00490 trie->insertWord(*it);
00491 }
00492 d = new QDawgPrivate(trie);
00493 } else {
00494 d = 0;
00495 }
00496 }
00497
00501 QStringList QDawg::allWords() const
00502 {
00503 QStringList result;
00504 if ( d ) d->appendAllWords(result);
00505 return result;
00506 }
00507
00508
00515 bool QDawg::readFile(const QString& filename)
00516 {
00517 delete d;
00518 d = 0;
00519 int f = ::open( QFile::encodeName(filename), O_RDONLY );
00520 if ( f < 0 )
00521 return FALSE;
00522 struct stat st;
00523 if ( !fstat( f, &st ) ) {
00524 char * tmp = (char*)mmap( 0, st.st_size,
00525 PROT_READ,
00526 MAP_FILE | MAP_PRIVATE,
00527 f, 0 );
00528 if ( tmp && tmp != (char*)MAP_FAILED )
00529 d = new QDawgPrivate((uchar*)tmp);
00530 }
00531 ::close( f );
00532 return d;
00533 }
00534
00541 bool QDawg::read(QIODevice* dev)
00542 {
00543 delete d;
00544 d = new QDawgPrivate(dev);
00545 if ( d->ok() )
00546 return TRUE;
00547 delete d;
00548 d = 0;
00549 return FALSE;
00550 }
00551
00555 bool QDawg::write(QIODevice* dev) const
00556 {
00557 return d ? d->write(dev) : TRUE;
00558 }
00559
00563 int QDawg::countWords() const
00564 {
00565 return d ? d->countWords() : 0;
00566 }
00567
00571 const QDawg::Node* QDawg::root() const
00572 {
00573 return d ? d->root() : 0;
00574 }
00575
00580 bool QDawg::contains(const QString& s) const
00581 {
00582 return d ? d->contains(s) : FALSE;
00583 }
00584
00590 void QDawg::dump() const
00591 {
00592 if ( d ) d->dump();
00593 }
00594