Qtopia library API Documentation

qdawg.cpp

00001 /**********************************************************************
00002 ** Copyright (C) 2000-2002 Trolltech AS.  All rights reserved.
00003 **
00004 ** This file is part of the Qtopia Environment.
00005 **
00006 ** This file may be distributed and/or modified under the terms of the
00007 ** GNU General Public License version 2 as published by the Free Software
00008 ** Foundation and appearing in the file LICENSE.GPL included in the
00009 ** packaging of this file.
00010 **
00011 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
00012 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
00013 **
00014 ** See http://www.trolltech.com/gpl/ for GPL licensing information.
00015 **
00016 ** Contact info@trolltech.com if any conditions of this licensing are
00017 ** not clear to you.
00018 **
00019 **********************************************************************/
00020 #include "qdawg.h"
00021 #include <qintdict.h>
00022 #include <qvaluelist.h>
00023 #include <qtextstream.h>
00024 #include <qfile.h>
00025 #include <qtl.h>
00026 
00027 #include <limits.h>
00028 #include <stdio.h>
00029 
00030 // for mmap
00031 #include <sys/types.h>
00032 #include <sys/stat.h>
00033 #include <sys/mman.h>
00034 #include <fcntl.h>
00035 #include <errno.h>
00036 #include <unistd.h>
00037 
00038 class QDawgPrivate;
00039 class QTrie;
00040 
00041 typedef QValueList<QTrie*> TrieClub;
00042 typedef QIntDict<TrieClub> TrieClubDirectory;
00043 
00044 class TriePtr {
00045 public:
00046     QChar letter;
00047     QTrie* p;
00048     int operator <(const TriePtr& o) const;
00049     int operator >(const TriePtr& o) const;
00050     int operator <=(const TriePtr& o) const;
00051 };
00052 
00053 class TrieList : public QValueList<TriePtr> {
00054     bool sorted;
00055 public:
00056     TrieList()
00057     {
00058     sorted=TRUE;
00059     }
00060 
00061     QTrie* findAdd(QChar c);
00062     bool equal(TrieList& l);
00063 
00064     void sort()
00065     {
00066     if ( !sorted ) {
00067         qHeapSort(*this);
00068         sorted = TRUE;
00069     }
00070     }
00071 };
00072 
00073 // A fast but memory-wasting temporary class.  The Dawg is the goal.
00074 class QTrie {
00075 public:
00076     QTrie();
00077     ~QTrie();
00078 
00079     void insertWord(const QString& s, uint index=0);
00080     bool equal(QTrie* o);
00081     void dump(int indent=0);
00082 
00083 private:
00084     TrieList children;
00085     bool isword;
00086 
00087     friend class QDawgPrivate;
00088     int maxdepth;
00089     int decendants;
00090     int key;
00091     void distributeKeys(TrieClubDirectory& directory);
00092     QTrie* clubLeader(TrieClubDirectory& directory);
00093     int collectKeys();
00094     friend class TriePtr;
00095     friend class TrieList;
00096 };
00097 
00098 QTrie::QTrie()
00099 {
00100     key = 0;
00101     isword = FALSE;
00102 }
00103 
00104 QTrie::~QTrie()
00105 {
00106     // NOTE: we do not delete the children - after conversion to DAWG
00107     // it's too difficult.  The QTrie's are deleted via the directory.
00108 }
00109 
00110 void QTrie::insertWord(const QString& s, uint index)
00111 {
00112     if ( index == s.length() ) {
00113     isword = TRUE;
00114     } else {
00115     QTrie* t = children.findAdd(s[index]);
00116     t->insertWord(s,index+1);
00117     }
00118 }
00119 
00120 bool QTrie::equal(QTrie* o)
00121 {
00122     if ( o == this ) return TRUE;
00123     if ( isword != o->isword )
00124     return FALSE;
00125     return children.equal(o->children);
00126 }
00127 
00128 void QTrie::dump(int indent)
00129 {
00130     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00131     QTrie* s = (*it).p;
00132     for (int in=0; in<indent; in++)
00133         fputc(' ',stderr);
00134     fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(),
00135         s->key,s->isword?"word":"",s);
00136     s->dump(indent+2);
00137     }
00138 }
00139 
00140 void QTrie::distributeKeys(TrieClubDirectory& directory)
00141 {
00142     maxdepth = INT_MIN;
00143     decendants = children.count();
00144     key = 0;
00145     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00146     QTrie* s = (*it).p;
00147     QChar l = (*it).letter;
00148     s->distributeKeys(directory);
00149     key = key*64+l.unicode()+s->key*5;
00150     decendants += s->decendants;
00151     if ( s->maxdepth+1 > maxdepth )
00152         maxdepth = s->maxdepth+1;
00153     }
00154     if ( decendants ) {
00155     key += decendants + maxdepth*256 + children.count() * 65536;
00156     if ( !key ) key++; // unlikely
00157     }
00158     TrieClub* c = directory[key];
00159     if ( !c ) directory.insert(key, (c = new TrieClub) );
00160     c->prepend(this);
00161 }
00162 
00163 QTrie* QTrie::clubLeader(TrieClubDirectory& directory)
00164 {
00165     if ( !key ) return directory[0]->first();
00166     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00167     QTrie* t= (*it).p->clubLeader(directory);
00168     (*it).p = t;
00169     }
00170     TrieClub *club = directory[key];
00171     for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00172     QTrie* o = *it;
00173     if ( o->equal(this) )
00174         return o;
00175     }
00176     return this;
00177 }
00178 
00179 int QTrie::collectKeys()
00180 {
00181     int n=0;
00182     if ( key ) key=0,n+=children.count();
00183     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it)
00184     n += (*it).p->collectKeys();
00185     return n;
00186 }
00187 
00188 int TriePtr::operator <(const TriePtr& o) const
00189     { return letter < o.letter; }
00190 int TriePtr::operator >(const TriePtr& o) const
00191     { return letter > o.letter; }
00192 int TriePtr::operator <=(const TriePtr& o) const
00193     { return letter <= o.letter; }
00194 
00195 bool TrieList::equal(TrieList& l)
00196 {
00197     if ( count() != l.count() )
00198     return FALSE;
00199     sort(); l.sort();
00200     ConstIterator it2 = begin();
00201     ConstIterator it = l.begin();
00202     for( ; it != l.end(); ++it, ++it2 )
00203     if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) )
00204         return FALSE;
00205     return TRUE;
00206 }
00207 QTrie* TrieList::findAdd(QChar c)
00208 {
00209     for (Iterator it=begin(); it!=end(); ++it) {
00210     if ( (*it).letter == c )
00211         return (*it).p;
00212     }
00213     TriePtr p;
00214     p.p = new QTrie;
00215     p.letter = c;
00216     prepend(p);
00217     sorted=FALSE;
00218     sort();
00219     return p.p;
00220 }
00221 
00222 static const char* dawg_sig = "QDAWG100";
00223 
00224 class QDawgPrivate {
00225 public:
00226     QDawgPrivate(QIODevice* dev)
00227     {
00228     QDataStream ds(dev);
00229     char sig[8];
00230     ds.readRawBytes(sig,8);
00231     if ( !strncmp(dawg_sig,sig,8) ) {
00232         uint n;
00233         char* nn;
00234         ds.readBytes(nn,n);
00235 
00236         // #### endianness problem ignored.
00237         node = (QDawg::Node*)nn;
00238         nodes = n / sizeof(QDawg::Node);
00239     } else {
00240         node = 0;
00241     }
00242     }
00243 
00244     bool ok() const { return node; }
00245 
00246     QDawgPrivate(uchar* mem)
00247     {
00248     if ( !strncmp(dawg_sig,(char*)mem,8) ) {
00249         mem += 8;
00250 
00251         int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3];
00252         mem += 4;
00253 
00254         // #### endianness problem ignored.
00255         node = (QDawg::Node*)((char*)mem);
00256         nodes = n / sizeof(QDawg::Node);
00257     }
00258     }
00259 
00260     QDawgPrivate(QTrie* t) // destroys the QTrie.
00261     {
00262     TrieClubDirectory directory(9973);
00263     t->distributeKeys(directory);
00264     QTrie* l = t->clubLeader(directory);
00265     ASSERT(l==t);
00266     generateArray(t);
00267 
00268     TrieClub *club;
00269     for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit)
00270     {
00271         for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00272         delete *it;
00273         }
00274         delete club;
00275     }
00276     }
00277 
00278     bool write(QIODevice* dev)
00279     {
00280     QDataStream ds(dev);
00281     ds.writeRawBytes(dawg_sig,8);
00282     // #### endianness problem ignored.
00283     ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes);
00284     return dev->state() == IO_Ok;
00285     }
00286 
00287     void dumpWords(int nid=0, int index=0)
00288     {
00289     static char word[256]; // ick latin1
00290     int i=0;
00291     do {
00292         QDawg::Node& n = node[nid+i];
00293         word[index] = n.let;
00294         if ( n.isword )
00295         fprintf(stderr,"%.*s\n",index+1,word);
00296         if ( n.offset ) dumpWords(n.offset+nid+i,index+1);
00297     } while (!node[nid+i++].islast);
00298     }
00299 
00300     void dump(int nid=0, int indent=0)
00301     {
00302     int i=0;
00303     do {
00304         QDawg::Node& n = node[nid+i];
00305         fprintf(stderr,"%d: ",nid+i);
00306         for (int in=0; in<indent; in++)
00307         fputc(' ',stderr);
00308         fprintf(stderr," %c %d %d %d\n",n.let,
00309         n.isword,n.islast,n.offset);
00310         if ( n.offset ) dump(n.offset+nid+i,indent+2);
00311     } while (!node[nid+i++].islast);
00312     }
00313 
00314     int countWords(int nid=0)
00315     {
00316     int t=0;
00317     int i=0;
00318     do {
00319         QDawg::Node& n = node[nid+i];
00320         if ( n.isword )
00321         t++;
00322         if ( n.offset )
00323         t+=countWords(n.offset+nid+i);
00324     } while (!node[nid+i++].islast);
00325     return t;
00326     }
00327 
00328     bool contains(const QString& s, int nid=0, int index=0) const
00329     {
00330     int i=0;
00331     do {
00332         QDawg::Node& n = node[nid+i];
00333         if ( s[index] == QChar((ushort)n.let) ) {
00334         if ( n.isword && index == (int)s.length()-1 )
00335             return TRUE;
00336         if ( n.offset )
00337             return contains(s,n.offset+nid+i,index+1);
00338         }
00339     } while (!node[nid+i++].islast);
00340     return FALSE;
00341     }
00342 
00343     void appendAllWords(QStringList& list, int nid=0, QString s="") const
00344     {
00345     int i=0;
00346     int next = s.length();
00347     do {
00348         QDawg::Node& n = node[nid+i];
00349         s[next] = QChar((ushort)n.let);
00350         if ( n.isword )
00351         list.append(s);
00352         if ( n.offset )
00353         appendAllWords(list, n.offset+nid+i, s);
00354     } while (!node[nid+i++].islast);
00355     }
00356 
00357     const QDawg::Node* root() { return node; }
00358 
00359 private:
00360     void generateArray(QTrie* t)
00361     {
00362     nodes = 0;
00363     int n = t->collectKeys();
00364     node = new QDawg::Node[n];
00365     appendToArray(t);
00366     ASSERT(n == nodes);
00367     }
00368 
00369     int appendToArray(QTrie* t)
00370     {
00371     if ( !t->key ) {
00372         if ( !t->children.count() )
00373         return 0;
00374         t->key = nodes;
00375         nodes += t->children.count();
00376         QDawg::Node* n = &node[t->key-1];
00377         int here = t->key;
00378         for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) {
00379         QTrie* s = (*it).p;
00380         ++n;
00381         n->let = (*it).letter.unicode();
00382         n->isword = s->isword;
00383         n->islast = 0;
00384         n->offset = appendToArray(s);
00385         if ( n->offset ) {
00386             int t = n->offset-here;
00387             n->offset=t;
00388             if ( n->offset != t )
00389             qWarning("Overflow: too many words");
00390         }
00391         here++;
00392         }
00393         n->islast = 1;
00394     }
00395     return t->key;
00396     }
00397 
00398 private:
00399     int nodes;
00400     QDawg::Node *node;
00401 };
00402 
00445 QDawg::QDawg()
00446 {
00447     d = 0;
00448 }
00449 
00453 QDawg::~QDawg()
00454 {
00455     delete d;
00456 }
00457 
00462 bool QDawg::createFromWords(QIODevice* dev)
00463 {
00464     delete d;
00465 
00466     QTextStream i(dev);
00467     QTrie* trie = new QTrie;
00468     int n=0;
00469     while (!i.atEnd()) {
00470     trie->insertWord(QString::fromUtf8(i.readLine()));
00471     n++;
00472     }
00473     if ( n )
00474     d = new QDawgPrivate(trie);
00475     else
00476     d = 0;
00477     return TRUE;
00478 }
00479 
00483 void QDawg::createFromWords(const QStringList& list)
00484 {
00485     delete d;
00486 
00487     if ( list.count() ) {
00488     QTrie* trie = new QTrie;
00489     for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) {
00490         trie->insertWord(*it);
00491     }
00492     d = new QDawgPrivate(trie);
00493     } else {
00494     d = 0;
00495     }
00496 }
00497 
00501 QStringList QDawg::allWords() const
00502 {
00503     QStringList result;
00504     if ( d ) d->appendAllWords(result);
00505     return result;
00506 }
00507 
00508 
00515 bool QDawg::readFile(const QString& filename)
00516 {
00517     delete d;
00518     d = 0;
00519     int f = ::open( QFile::encodeName(filename), O_RDONLY );
00520     if ( f < 0 )
00521     return FALSE;
00522     struct stat st;
00523     if ( !fstat( f, &st ) ) {
00524     char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file
00525                        PROT_READ, // read-only memory
00526                        MAP_FILE | MAP_PRIVATE, // swap-backed map from file
00527                        f, 0 ); // from offset 0 of f
00528     if ( tmp && tmp != (char*)MAP_FAILED )
00529         d = new QDawgPrivate((uchar*)tmp);
00530     }
00531     ::close( f );
00532     return d;
00533 }
00534 
00541 bool QDawg::read(QIODevice* dev)
00542 {
00543     delete d;
00544     d = new QDawgPrivate(dev);
00545     if ( d->ok() )
00546     return TRUE;
00547     delete d;
00548     d = 0;
00549     return FALSE;
00550 }
00551 
00555 bool QDawg::write(QIODevice* dev) const
00556 {
00557     return d ? d->write(dev) : TRUE;
00558 }
00559 
00563 int QDawg::countWords() const
00564 {
00565     return d ? d->countWords() : 0;
00566 }
00567 
00571 const QDawg::Node* QDawg::root() const
00572 {
00573     return d ? d->root() : 0;
00574 }
00575 
00580 bool QDawg::contains(const QString& s) const
00581 {
00582     return d ? d->contains(s) : FALSE;
00583 }
00584 
00590 void QDawg::dump() const
00591 {
00592     if ( d ) d->dump();
00593 }
00594 
KDE Logo
This file is part of the documentation for OPIE Version 1.5.5.
Documentation copyright © 1997-2003 the KDE developers. 2003 OPIE developers
Generated on Tue Feb 10 20:24:06 2004 by doxygen 1.3.5 written by Dimitri van Heesch, © 1997-2001