//
// WordList.h
//
// NAME
//
// abstract class to manage and use an inverted index file.
//
// SYNOPSIS
//
// #include <mifluz.h>
//
// WordContext context;
//
// WordList* words = context->List();
//
// delete words;
//
// DESCRIPTION
//
// WordList is the <i>mifluz</i> equivalent of a database handler. Each
// WordList object is bound to an inverted index file and implements the
// operations to create it, fill it with word occurrences and search
// for an entry matching a given criterion.
//
// WordList is an abstract class and cannot be instantiated.
// The <b>List</b> method of the class WordContext will create
// an instance using the appropriate derived class, either WordListOne
// or WordListMulti. Refer to the corresponding manual pages for
// more information on their specific semantics.
//
// When doing bulk insertions, mifluz creates temporary files that
// contain the entries to be inserted in the index. Those files are
// typically named <i>indexC00000000</i>. The maximum size of the
// temporary file is <b>wordlist_cache_size</b> / 2. When the maximum
// size of the temporary file is reached, mifluz creates another temporary
// file named <i>indexC00000001</i>. The process continues until mifluz
// has created 50 temporary files. At this point it merges all temporary files
// into one that replaces the first <i>indexC00000000</i>. Then it continues
// to create temporary files again and keeps following this algorithm until
// the bulk insertion is finished. When the bulk insertion is finished,
// mifluz has one big file named <i>indexC00000000</i> that contains
// all the entries to be inserted in the index.
// mifluz inserts all the
// entries from <i>indexC00000000</i> into the index and deletes the
// temporary file when done. The insertion will be fast since all the
// entries in <i>indexC00000000</i> are already sorted.
//
// The parameter <b>wordlist_cache_max</b> can be used to prevent the
// temporary files from growing indefinitely. If the total cumulative size of
// the <i>indexC*</i> files grows beyond this parameter, they are merged
// into the main index and deleted. For instance setting this parameter
// value to 500Mb guarantees that the total size of the <i>indexC*</i>
// files will not grow above 500Mb.
//
// CONFIGURATION
//
// wordlist_extend {true|false} (default false)
// If <b>true</b> maintain reference count of unique
// words. The <b>Noccurrence</b> method gives access to this count.
//
// wordlist_verbose <number> (default 0)
// Set the verbosity level of the WordList class.
// <br>
// 1 walk logic
// <br>
// 2 walk logic details
// <br>
// 3 walk logic lots of details
//
// wordlist_page_size <bytes> (default 8192)
// Berkeley DB page size (see Berkeley DB documentation)
//
// wordlist_cache_size <bytes> (default 500K)
// Berkeley DB cache size (see Berkeley DB documentation)
// Cache makes a huge difference in performance. It must be at least 2%
// of the expected total data size. Note that if compression is activated
// the data size is eight times larger than the actual file size. In this
// case the cache must be scaled to 2% of the data size, not 2%
// of the file size. See <b>Cache tuning</b> in the mifluz guide for
// more hints.
// See WordList(3) for the rationale behind cache file handling.
//
// wordlist_cache_max <bytes> (default 0)
// Maximum cumulative size of the cache files generated when doing bulk
// insertion with the <b>BatchStart()</b> function. When this limit is
// reached, the cache files are all merged into the inverted index.
// The value 0 means infinite size allowed.
// See WordList(3) for the rationale behind cache file handling.
//
// wordlist_cache_inserts {true|false} (default false)
// If true all <b>Insert</b> calls are cached in memory. When the
// WordList object is closed or a different access method is called
// the cached entries are flushed to the inverted index.
//
// wordlist_compress {true|false} (default false)
// Activate compression of the index. The resulting index is eight times
// smaller than the uncompressed index.
//
//
// END
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1999, 2000, 2001 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU General Public License version 2 or later
// <http://www.gnu.org/copyleft/gpl.html>
//
// $Id: WordList_8h-source.html,v 1.1 2008/06/08 10:13:15 sebdiaz Exp $
//

#ifndef _WordList_h_
#define _WordList_h_

#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>

#ifndef SWIG
#include "Dictionary.h"
#include "List.h"
#include "htString.h"
#include "WordRecord.h"
#include "WordReference.h"
#include "WordType.h"
#include "WordDB.h"
#include "WordDBCompress.h"
#include "Configuration.h"
#include "WordCursor.h"
#include "WordDict.h"
#endif /* SWIG */

class List;
class WordList;
class WordDBCursor;
class WordContext;
class WordDBCaches;
class WordMeta;
class WordDead;
00138 00139 // 00140 // Inverted index interface 00141 // 00142 class WordList 00143 { 00144 public: 00145 virtual ~WordList() {} 00146 00147 //- 00148 // Return a pointer to the WordContext object used to create 00149 // this instance. 00150 // 00151 inline WordContext* GetContext() { return context; } 00152 #ifndef SWIG 00153 //- 00154 // Return a pointer to the WordContext object used to create 00155 // this instance as a const. 00156 // 00157 inline const WordContext* GetContext() const { return context; } 00158 #endif /* SWIG */ 00159 00160 //- 00161 // Insert <b>wordRef</b> in index. If the <i>Key()</i> part of 00162 // the <b>wordRef</b> exists in the index, override it. 00163 // Returns OK on success, NOTOK on error. 00164 // 00165 virtual inline int Override(const WordReference& wordRef) { NotImplemented(); return NOTOK; } 00166 00167 //- 00168 // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise. 00169 // 00170 virtual int Exists(const WordReference& wordRef) { NotImplemented(); return NOTOK; } 00171 #ifndef SWIG 00172 //- 00173 // Returns OK if <b>word</b> exists in the index, NOTOK otherwise. 00174 // 00175 inline int Exists(const String& word) { return Dict()->Exists(word) ? OK : NOTOK; } 00176 #endif /* SWIG */ 00177 00178 // 00179 // Delete permanently 00180 // 00181 //- 00182 // Delete all entries in the index whose key matches the 00183 // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i> 00184 // method. 00185 // Returns the number of entries successfully deleted. 00186 // 00187 virtual int WalkDelete(const WordReference& wordRef) { NotImplemented(); return NOTOK; } 00188 //- 00189 // Delete the entry in the index that exactly matches the 00190 // <i>Key()</i> part of <b>wordRef.</b> 00191 // Returns OK if deletion is successfull, NOTOK otherwise. 
00192 // 00193 virtual int Delete(const WordReference& wordRef) { NotImplemented(); return NOTOK; } 00194 00195 //- 00196 // Open inverted index <b>filename.</b> <b>mode</b> 00197 // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is 00198 // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset 00199 // the content of an existing inverted index. 00200 // Return OK on success, NOTOK otherwise. 00201 // 00202 virtual int Open(const String& filename, int mode) { NotImplemented(); return NOTOK; } 00203 //- 00204 // Close inverted index. 00205 // Return OK on success, NOTOK otherwise. 00206 // 00207 virtual int Close() { NotImplemented(); return NOTOK; } 00208 //- 00209 // Return the size of the index in pages. 00210 // 00211 virtual unsigned int Size() const { NotImplemented(); return 0; } 00212 //- 00213 // Return the page size 00214 // 00215 virtual int Pagesize() const { NotImplemented(); return 0; } 00216 //- 00217 // Return a pointer to the inverted index dictionnary. 00218 // 00219 virtual WordDict *Dict() { NotImplemented(); return 0; } 00220 virtual WordMeta *Meta() { NotImplemented(); return 0; } 00221 virtual WordDead *Dead() { NotImplemented(); return 0; } 00222 //- 00223 // Return the filename given to the last call to Open. 00224 // 00225 const String& Filename() const { return filename; } 00226 //- 00227 // Return the mode given to the last call to Open. 00228 // 00229 int Flags() const { return flags; } 00230 00231 // 00232 // These returns a list of all the WordReference * matching 00233 // the constraint. 00234 //- 00235 // Returns the list of word occurrences exactly matching the 00236 // <i>Key()</i> part of <b>wordRef.</b> The <i>List</i> returned 00237 // contains pointers to <i>WordReference</i> objects. It is 00238 // the responsibility of the caller to free the list. See List.h 00239 // header for usage. 
00240 // 00241 inline List *Find(const WordReference& wordRef) { return (*this)[wordRef]; } 00242 //- 00243 // Returns the list of word occurrences exactly matching the 00244 // <b>word.</b> The <i>List</i> returned 00245 // contains pointers to <i>WordReference</i> objects. It is 00246 // the responsibility of the caller to free the list. See List.h 00247 // header for usage. 00248 // 00249 inline List *FindWord(const String& word) { return (*this)[word]; } 00250 #ifndef SWIG 00251 //- 00252 // Alias to the <b>Find</b> method. 00253 // 00254 virtual List *operator [] (const WordReference& wordRef) { NotImplemented(); return 0; } 00255 //- 00256 // Alias to the <b>FindWord</b> method. 00257 // 00258 inline List *operator [] (const String& word) { 00259 WordReference wordRef(context, word); 00260 unsigned int wordid; 00261 Dict()->SerialExists(word, wordid); 00262 if(wordid != WORD_DICT_SERIAL_INVALID) { 00263 wordRef.Key().Set(WORD_KEY_WORD, wordid); 00264 return (*this)[wordRef]; 00265 } else { 00266 return new List; 00267 } 00268 } 00269 #endif /* SWIG */ 00270 //- 00271 // Returns the list of word occurrences matching the <i>Key()</i> 00272 // part of <b>wordRef.</b> In the <i>Key()</i>, the string 00273 // (accessed with <i>GetWord()</i>) matches any string that begins 00274 // with it. The <i>List</i> returned contains pointers to 00275 // <i>WordReference</i> objects. It is the responsibility of the 00276 // caller to free the list. 00277 // 00278 virtual List *Prefix (const WordReference& prefix) { NotImplemented(); return 0; } 00279 #ifndef SWIG 00280 //- 00281 // Returns the list of word occurrences matching the 00282 // <b>word.</b> In the <i>Key()</i>, the string (accessed with 00283 // <i>GetWord()</i>) matches any string that begins with it. The 00284 // <i>List</i> returned contains pointers to <i>WordReference</i> 00285 // objects. It is the responsibility of the caller to free the 00286 // list. 
00287 // 00288 inline List *Prefix (const String& prefix) { return this->Prefix(WordReference(context, prefix)); } 00289 #endif /* SWIG */ 00290 00291 // 00292 // Iterate over the complete database. 00293 // 00294 #ifndef SWIG 00295 //- 00296 // Returns a list of all unique words contained in the inverted 00297 // index. The <i>List</i> returned contains pointers to 00298 // <i>String</i> objects. It is the responsibility of the caller 00299 // to free the list. See List.h header for usage. 00300 // 00301 virtual List *Words() { NotImplemented(); return 0; } 00302 #endif /* SWIG */ 00303 //- 00304 // Returns a list of all entries contained in the 00305 // inverted index. The <i>List</i> returned contains pointers to 00306 // <i>WordReference</i> objects. It is the responsibility of 00307 // the caller to free the list. See List.h header for usage. 00308 // 00309 virtual List *WordRefs() { NotImplemented(); return 0; } 00310 00311 #ifndef SWIG 00312 //- 00313 // Create a cursor that searches all the occurrences in the 00314 // inverted index and call <b>ncallback</b> with 00315 // <b>ncallback_data</b> for every match. 00316 // 00317 virtual WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { NotImplemented(); return 0; } 00318 //- 00319 // Create a cursor that searches all the occurrences in the 00320 // inverted index and that match <b>nsearchKey.</b> If 00321 // <b>naction</b> is set to HTDIG_WORDLIST_WALKER calls 00322 // <b>searchKey.callback</b> with <b>searchKey.callback_data</b> 00323 // for every match. If <b>naction</b> is set to 00324 // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b> 00325 // data member as a <b>WordReference</b> object. It is the responsibility 00326 // of the caller to free the <b>searchKey.collectRes</b> list. 
00327 // 00328 virtual WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { NotImplemented(); return 0; } 00329 //- 00330 // Create a cursor that searches all the occurrences in the 00331 // inverted index and that match <b>nsearchKey</b> and calls 00332 // <b>ncallback</b> with <b>ncallback_data</b> for every match. 00333 // 00334 virtual WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { NotImplemented(); return 0; } 00335 #endif /* SWIG */ 00336 00337 //- 00338 // Create a WordKey object and return it. The <b>bufferin</b> argument 00339 // is used to initialize the key, as in the WordKey::Set method. 00340 // The first component of <b>bufferin</b> must be a word that is translated 00341 // to the corresponding numerical id using the WordDict::Serial 00342 // method. 00343 // 00344 virtual WordKey Key(const String& bufferin) { NotImplemented(); return WordKey(0); } 00345 //- 00346 // Create a WordReference object and return it. The 00347 // <b>bufferin</b> argument is used to initialize the structure, 00348 // as in the WordReference::Set method. The first component of 00349 // <b>bufferin</b> must be a word that is translated to the 00350 // corresponding numerical id using the WordDict::Serial method. 00351 // If the <b>exists</b> argument is set to 1, the method 00352 // WordDict::SerialExists is used instead, that is no serial is 00353 // assigned to the word if it does not already have one. 00354 // Before translation the word is normalized using the 00355 // WordType::Normalize method. The word is saved using the 00356 // WordReference::SetWord method. 00357 // 00358 virtual WordReference Word(const String& bufferin, int exists = 0) { NotImplemented(); return WordReference(0); } 00359 //- 00360 // Alias for Word(bufferin, 1). 
00361 // 00362 virtual WordReference WordExists(const String& bufferin) { return Word(bufferin, 1); } 00363 00364 //- 00365 // Accelerate bulk insertions in the inverted index. All 00366 // insertion done with the <b>Override</b> method are batched 00367 // instead of being updating the inverted index immediately. 00368 // No update of the inverted index file is done before the 00369 // <b>BatchEnd</b> method is called. 00370 // 00371 virtual void BatchStart(); 00372 //- 00373 // Terminate a bulk insertion started with a call to the 00374 // <b>BatchStart</b> method. When all insertions are done 00375 // the <b>AllRef</b> method is called to restore statistics. 00376 // 00377 virtual void BatchEnd(); 00378 00379 #ifndef SWIG 00380 //- 00381 // Return in <b>noccurrence</b> the number of occurrences of the 00382 // string contained in the <i>GetWord()</i> part of <b>key.</b> 00383 // Returns OK on success, NOTOK otherwise. 00384 // 00385 virtual int Noccurrence(const String& key, unsigned int& noccurrence) const { NotImplemented(); return NOTOK; } 00386 00387 // 00388 // Input/Output 00389 // 00390 //- 00391 // Write on file descriptor <b>f</b> an ASCII description of the 00392 // index. Each line of the file contains a <i>WordReference</i> 00393 // ASCII description. 00394 // Return OK on success, NOTOK otherwise. 00395 // 00396 virtual int Write(FILE* f) { NotImplemented(); return NOTOK; } 00397 //- 00398 // Write on file descriptor <b>f</b> the complete dictionnary 00399 // with statistics. 00400 // Return OK on success, NOTOK otherwise. 00401 // 00402 virtual int WriteDict(FILE* f) { NotImplemented(); return NOTOK; } 00403 // 00404 //- 00405 // Read <i>WordReference</i> ASCII descriptions from <b>f</b>, 00406 // returns the number of inserted WordReference or < 0 if an error 00407 // occurs. Invalid descriptions are ignored as well as empty 00408 // lines. 
00409 // 00410 virtual int Read(FILE* f) { NotImplemented(); return NOTOK; } 00411 00412 #endif /* SWIG */ 00413 // 00414 // Retrieve WordReferences from the database. 00415 // Backend of WordRefs, operator[], Prefix... 00416 // 00417 virtual List *Collect(const WordReference& word) { NotImplemented(); return 0; } 00418 #ifndef SWIG 00419 // 00420 // Compressor object accessors 00421 // 00422 inline WordDBCompress *GetCompressor() { return compressor; } 00423 inline void SetCompressor(WordDBCompress* compressor_arg) { compressor = compressor_arg; } 00424 00425 inline void NotImplemented() const { 00426 fprintf(stderr, "WordList::NotImplemented\n"); 00427 abort(); 00428 } 00429 00430 WordContext* context; 00431 00432 int isopen; 00433 int flags; 00434 String filename; 00435 00436 // 00437 // If true enable extended functionalities of WordList such 00438 // as per-word statistics. Read from wordlist_extended configuration 00439 // parameter. 00440 // 00441 int extended; 00442 00443 00444 WordDBCompress *compressor; 00445 int verbose; 00446 00447 WordDBCaches* caches; 00448 #endif /* SWIG */ 00449 }; 00450 00451 #endif /* _WordList_h_ */