00001 // 00002 // WordDict.h 00003 // 00004 // NAME 00005 // 00006 // manage and use an inverted index dictionary. 00007 // 00008 // SYNOPSIS 00009 // 00010 // #include <mifluz.h> 00011 // 00012 // WordList* words = ...; 00013 // WordDict* dict = words->Dict(); 00014 // 00015 // DESCRIPTION 00016 // 00017 // WordDict maps strings to unique identifiers and frequency in the 00018 // inverted index. Whenever a new word is found, the WordDict class 00019 // can be asked to assign it a serial number. When doing so, an entry 00020 // is created in the dictionary with a frequency of zero. The application 00021 // may then increment or decrement the frequency to reflect the inverted 00022 // index content. 00023 // 00024 // The serial numbers range from 1 to 2^32 inclusive. 00025 // 00026 // A WordDict object is automatically created by the WordList object and 00027 // should not be created directly by the application. 00028 // 00029 // END 00030 // 00031 // Part of the ht://Dig package <http://www.htdig.org/> 00032 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group 00033 // For copyright details, see the file COPYING in your distribution 00034 // or the GNU General Public License version 2 or later 00035 // <http://www.gnu.org/copyleft/gpl.html> 00036 // 00037 // $Id: WordDict_8h-source.html,v 1.1 2008/06/08 10:13:09 sebdiaz Exp $ 00038 // 00039 00040 #ifndef _WordDict_h_ 00041 #define _WordDict_h_ 00042 00043 #include <stdio.h> 00044 00045 #ifndef SWIG 00046 #include "htString.h" 00047 #include "WordDB.h" 00048 00049 class WordList; 00050 class WordDictCursor; 00051 00052 #define WORD_DICT_SERIAL_INVALID 0 00053 00054 class WordDictRecord { 00055 public: 00056 inline WordDictRecord() { count = 0; id = WORD_DICT_SERIAL_INVALID; } 00057 00058 inline int Unpack(const String& coded) { 00059 int offset = 0; 00060 coded.ber_shift(offset, count); 00061 coded.ber_shift(offset, id); 00062 return OK; 00063 } 00064 00065 inline int Pack(String& coded) const { 00066 int offset = 0; 00067 coded.ber_push(offset, count); 00068 coded.ber_push(offset, id); 00069 return OK; 00070 } 00071 00072 inline int Get(WordDB* db, const String& word) { 00073 String tmp_word = word; 00074 String coded(BER_MAX_BYTES * 2); 00075 int ret; 00076 if((ret = db->Get(0, tmp_word, coded, 0)) != 0) return ret; 00077 00078 Unpack(coded); 00079 00080 return ret; 00081 } 00082 00083 inline int Put(WordDB* db, const String& word) { 00084 String coded(BER_MAX_BYTES * 2); 00085 Pack(coded); 00086 return db->Put(0, word, coded, 0); 00087 } 00088 00089 inline int Del(WordDB* db, const String& word) { 00090 return db->Del(0, word); 00091 } 00092 00093 inline unsigned int Count() { return count; } 00094 inline unsigned int Id() { return id; } 00095 00096 unsigned int count; 00097 unsigned int id; 00098 }; 00099 #endif /* SWIG */ 00100 00101 class WordDict 00102 { 00103 public: 00104 #ifndef SWIG 00105 //- 00106 // Private constructor. 00107 // 00108 WordDict() { words = 0; db = 0; } 00109 ~WordDict(); 00110 00111 //- 00112 // Bind the object a WordList inverted index. Return OK on success, 00113 // NOTOK otherwise. 00114 // 00115 int Initialize(WordList* words); 00116 00117 //- 00118 // Open the underlying Berkeley DB sub-database. The enclosing 00119 // file is given by the <i>words</i> data member. Return OK on success, 00120 // NOTOK otherwise. 00121 // 00122 int Open(); 00123 //- 00124 // Destroy the underlying Berkeley DB sub-database. Return OK on success, 00125 // NOTOK otherwise. 00126 // 00127 int Remove(); 00128 //- 00129 // Close the underlying Berkeley DB sub-database. Return OK on success, 00130 // NOTOK otherwise. 00131 // 00132 int Close(); 00133 00134 //- 00135 // If the <b>word</b> argument exists in the dictionnary, return its 00136 // serial number in the <b>serial</b> argument. If it does not already 00137 // exists, assign it a serial number, create an entry with a frequency 00138 // of zero and return the new serial in the <b>serial</b> argument. 00139 // Return OK on success, NOTOK otherwise. 00140 // 00141 int Serial(const String& word, unsigned int& serial); 00142 //- 00143 // If the <b>word</b> argument exists in the dictionnary, return its 00144 // serial number in the <b>serial</b> argument. If it does not exists 00145 // set the <b>serial</b> argument to WORD_DICT_SERIAL_INVALID. 00146 // Return OK on success, NOTOK otherwise. 00147 // 00148 int SerialExists(const String& word, unsigned int& serial); 00149 //- 00150 // Short hand for Serial() followed by Ref(). 00151 // Return OK on success, NOTOK otherwise. 00152 // 00153 int SerialRef(const String& word, unsigned int& serial); 00154 //- 00155 // Return the frequency of the <b>word</b> argument 00156 // in the <b>noccurrence</b> argument. 00157 // Return OK on success, NOTOK otherwise. 00158 // 00159 int Noccurrence(const String& word, unsigned int& noccurrence) const; 00160 #endif /* SWIG */ 00161 00162 //- 00163 // Short hand for words->GetContext()->GetType()->Normalize(word). 00164 // Return OK on success, NOTOK otherwise. 00165 // 00166 int Normalize(String& word) const; 00167 00168 //- 00169 // Short hand for Incr(word, 1) 00170 // 00171 int Ref(const String& word) { return Incr(word, 1); } 00172 //- 00173 // Add <b>incr</b> to the frequency of the <b>word</b>. 00174 // Return OK on success, NOTOK otherwise. 00175 // 00176 int Incr(const String& word, unsigned int incr); 00177 //- 00178 // Short hand for Decr(word, 1) 00179 // 00180 int Unref(const String& word) { return Decr(word, 1); } 00181 //- 00182 // Subtract <b>decr</b> to the frequency of the <b>word</b>. If 00183 // the frequency becomes lower or equal to zero, remove the entry 00184 // from the dictionnary and lose the association between the word and its 00185 // serial number. 00186 // Return OK on success, NOTOK otherwise. 00187 // 00188 int Decr(const String& word, unsigned int decr); 00189 //- 00190 // Set the frequency of <b>word</b> with the value of the <b>noccurrence</b> 00191 // argument. 00192 // 00193 int Put(const String& word, unsigned int noccurrence); 00194 00195 //- 00196 // Return true if <b>word</b> exists in the dictionnary, false otherwise. 00197 // 00198 int Exists(const String& word) const; 00199 00200 #ifndef SWIG 00201 //- 00202 // Return a pointer to the associated WordList object. 00203 // 00204 List* Words() const; 00205 00206 //- 00207 // Return a cursor to sequentially walk the dictionnary using the 00208 // <b>Next</b> method. 00209 // 00210 WordDictCursor* Cursor() const; 00211 //- 00212 // Return the next entry in the dictionnary. The <b>cursor</b> argument 00213 // must have been created using the <i>Cursor</i> method. The word is 00214 // returned in the <b>word</b> argument and the record is returned in 00215 // the <b>record</b> argument. 00216 // On success the function returns 0, at the end of the dictionnary it 00217 // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when 00218 // the function hits the end of the dictionnary or an error occurs. 00219 // 00220 int Next(WordDictCursor* cursor, String& word, WordDictRecord& record); 00221 00222 //- 00223 // Return a cursor to sequentially walk the entries of the dictionnary 00224 // that start with the <b>prefix</b> argument, using the 00225 // <b>NextPrefix</b> method. 00226 // 00227 WordDictCursor* CursorPrefix(const String& prefix) const; 00228 //- 00229 // Return the next prefix from the dictionnary. The <b>cursor</b> argument 00230 // must have been created using the <i>CursorPrefix</i> method. The word is 00231 // returned in the <b>word</b> argument and the record is returned in 00232 // the <b>record</b> argument. The <b>word</b> is guaranteed to start with 00233 // the prefix specified to the <b>CursorPrefix</b> method. 00234 // On success the function returns 0, at the end of the dictionnary it 00235 // returns DB_NOTFOUND. The <b>cursor</b> argument is deallocated when 00236 // the function hits the end of the dictionnary or an error occurs. 00237 // 00238 int NextPrefix(WordDictCursor* cursor, String& word, WordDictRecord& record); 00239 00240 //- 00241 // Dump the complete dictionary in the file descriptor <b>f.</b> The 00242 // format of the dictionary is <i>word serial frequency</i>, one by 00243 // line. 00244 // 00245 int Write(FILE* f); 00246 00247 private: 00248 WordList* words; 00249 WordDB* db; 00250 #endif /* SWIG */ 00251 }; 00252 #endif /* _WordDict_h_ */