//
// WordListMulti.h
//
// NAME
//
// manage and use an inverted index file.
//
// SYNOPSIS
//
// #include <mifluz.h>
//
// Configuration* config;
// WordReference wordRef;
// ...
// WordList* words = new WordList(config);
//
// delete words;
//
// DESCRIPTION
//
// WordList is the <i>mifluz</i> equivalent of a database handler. Each
// WordList object is bound to an inverted index file and implements the
// operations to create it, fill it with word occurrences and search
// for an entry matching a given criterion.
//
// CONFIGURATION
//
// wordlist_extend {true|false} (default false)
//   If <b>true</b> maintain reference count of unique
//   words. The <b>Noccurrence</b> method gives access to this count.
//
// wordlist_verbose <number> (default 0)
//   Set the verbosity level of the WordList class.
//   <br>
//   1 walk logic
//   <br>
//   2 walk logic details
//   <br>
//   3 walk logic lots of details
//
// wordlist_page_size <bytes> (default 8192)
//   Berkeley DB page size (see Berkeley DB documentation)
//
// wordlist_cache_size <bytes> (default 500K)
//   Berkeley DB cache size (see Berkeley DB documentation)
//   Cache makes a huge difference in performance. It must be at least 2%
//   of the expected total data size. Note that if compression is activated
//   the data size is eight times larger than the actual file size. In this
//   case the cache must be scaled to 2% of the data size, not 2%
//   of the file size. See <b>Cache tuning</b> in the mifluz guide for
//   more hints.
//
// wordlist_compress {true|false} (default false)
//   Activate compression of the index. The resulting index is eight times
//   smaller than the uncompressed index.
00056 // 00057 // 00058 // END 00059 // 00060 // Part of the ht://Dig package <http://www.htdig.org/> 00061 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group 00062 // For copyright details, see the file COPYING in your distribution 00063 // or the GNU General Public License version 2 or later 00064 // <http://www.gnu.org/copyleft/gpl.html> 00065 // 00066 // $Id: WordListMulti_8h-source.html,v 1.1 2008/06/08 10:13:14 sebdiaz Exp $ 00067 // 00068 00069 #ifndef _WordListMulti_h_ 00070 #define _WordListMulti_h_ 00071 00072 #include <fcntl.h> 00073 #include <stdio.h> 00074 00075 #ifndef SWIG 00076 #include "WordList.h" 00077 #include "WordCursorOne.h" 00078 //#include "WordCursorMulti.h" 00079 #endif /* SWIG */ 00080 00081 class WordContext; 00082 00083 // 00084 // Inverted index interface 00085 // 00086 class WordListMulti : public WordList 00087 { 00088 public: 00089 //- 00090 // Constructor. Build inverted index handling object using 00091 // run time configuration parameters listed in the <b>CONFIGURATION</b> 00092 // section. 00093 // 00094 WordListMulti(WordContext* ncontext); 00095 virtual ~WordListMulti(); 00096 00097 #ifndef SWIG 00098 virtual int Override(const WordReference& wordRef); 00099 #endif /* SWIG */ 00100 00101 //- 00102 // Returns OK if <b>wordRef</b> exists in the index, NOTOK otherwise. 00103 // 00104 virtual int Exists(const WordReference& wordRef); 00105 00106 // 00107 // Delete permanently 00108 // 00109 //- 00110 // Delete all entries in the index whose key matches the 00111 // <i>Key()</i> part of <b>wordRef</b>, using the <i>Walk</i> 00112 // method. 00113 // Returns the number of entries successfully deleted. 00114 // 00115 virtual int WalkDelete(const WordReference& wordRef); 00116 //- 00117 // Delete the entry in the index that exactly matches the 00118 // <i>Key()</i> part of <b>wordRef.</b> 00119 // Returns OK if deletion is successfull, NOTOK otherwise. 
00120 // 00121 virtual int Delete(const WordReference& wordRef); 00122 00123 //- 00124 // Open inverted index <b>filename.</b> <b>mode</b> 00125 // may be <i>O_RDONLY</i> or <i>O_RDWR.</i> If mode is 00126 // <i>O_RDWR</i> it can be or'ed with <i>O_TRUNC</i> to reset 00127 // the content of an existing inverted index. 00128 // Return OK on success, NOTOK otherwise. 00129 // 00130 virtual int Open(const String& filename, int mode); 00131 //- 00132 // Close inverted index. 00133 // Return OK on success, NOTOK otherwise. 00134 // 00135 virtual int Close(); 00136 //- 00137 // Return the size of the index in pages. 00138 // 00139 virtual unsigned int Size() const; 00140 int AddIndex(); 00141 int Merge(); 00142 00143 //- 00144 // Alias to the <b>Find</b> method. 00145 // 00146 virtual List *operator [] (const WordReference& wordRef); 00147 //- 00148 // Returns the list of word occurrences matching the <i>Key()</i> 00149 // part of <b>wordRef.</b> In the <i>Key()</i>, the string 00150 // (accessed with <i>GetWord()</i>) matches any string that begins 00151 // with it. The <i>List</i> returned contains pointers to 00152 // <i>WordReference</i> objects. It is the responsibility of the 00153 // caller to free the list. 00154 // 00155 virtual List *Prefix (const WordReference& prefix); 00156 00157 // 00158 // Iterate over the complete database. 00159 // 00160 #ifndef SWIG 00161 //- 00162 // Returns a list of all unique words contained in the inverted 00163 // index. The <i>List</i> returned contains pointers to 00164 // <i>String</i> objects. It is the responsibility of the caller 00165 // to free the list. See List.h header for usage. 00166 // 00167 virtual List *Words(); 00168 #endif /* SWIG */ 00169 //- 00170 // Returns a list of all entries contained in the 00171 // inverted index. The <i>List</i> returned contains pointers to 00172 // <i>WordReference</i> objects. It is the responsibility of 00173 // the caller to free the list. See List.h header for usage. 
00174 // 00175 virtual List *WordRefs(); 00176 00177 #ifndef SWIG 00178 //- 00179 // Create a cursor that searches all the occurrences in the 00180 // inverted index and call <b>ncallback</b> with 00181 // <b>ncallback_data</b> for every match. 00182 // 00183 virtual inline WordCursor *Cursor(wordlist_walk_callback_t callback, Object *callback_data) { return new WordCursorOne(this, callback, callback_data); } 00184 #endif /* SWIG */ 00185 //- 00186 // Create a cursor that searches all the occurrences in the 00187 // inverted index and that match <b>nsearchKey.</b> If 00188 // <b>naction</b> is set to HTDIG_WORDLIST_WALKER calls 00189 // <b>searchKey.callback</b> with <b>searchKey.callback_data</b> 00190 // for every match. If <b>naction</b> is set to 00191 // HTDIG_WORDLIST_COLLECT push each match in <b>searchKey.collectRes</b> 00192 // data member as a <b>WordReference</b> object. It is the responsibility 00193 // of the caller to free the <b>searchKey.collectRes</b> list. 00194 // 00195 virtual inline WordCursor *Cursor(const WordKey &searchKey, int action = HTDIG_WORDLIST_WALKER) { return new WordCursorOne(this, searchKey, action); } 00196 #ifndef SWIG 00197 //- 00198 // Create a cursor that searches all the occurrences in the 00199 // inverted index and that match <b>nsearchKey</b> and calls 00200 // <b>ncallback</b> with <b>ncallback_data</b> for every match. 00201 // 00202 virtual inline WordCursor *Cursor(const WordKey &searchKey, wordlist_walk_callback_t callback, Object * callback_data) { return new WordCursorOne(this, searchKey, callback, callback_data); } 00203 #endif /* SWIG */ 00204 00205 // 00206 // Update/get global word statistics statistics 00207 // 00208 //- 00209 // Add one to the reference count for the string contained 00210 // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> 00211 // Returns OK on success, NOTOK otherwise. 
00212 // 00213 virtual int Ref(const WordReference& wordRef); 00214 //- 00215 // Substract one to the reference count for the string contained 00216 // in the <i>Key().GetWord()</i> part of <b>wordRef.</b> 00217 // Returns OK on success, NOTOK otherwise. 00218 // 00219 virtual int Unref(const WordReference& wordRef); 00220 virtual int AllRef(); 00221 00222 #ifndef SWIG 00223 //- 00224 // Return in <b>noccurrence</b> the number of occurrences of the 00225 // string contained in the <i>GetWord()</i> part of <b>key.</b> 00226 // Returns OK on success, NOTOK otherwise. 00227 // 00228 virtual int Noccurrence(const String& key, unsigned int& noccurrence) const; 00229 virtual int Write(FILE* f) { return NOTOK; } 00230 virtual int Read(FILE* f) { return NOTOK; } 00231 00232 virtual WordKey Key(const String& bufferin) { abort(); return WordKey(0); } 00233 00234 virtual WordReference Word(const String& bufferin, int exists = 0) { abort(); return WordReference(0); } 00235 00236 #endif /* SWIG */ 00237 // 00238 // Retrieve WordReferences from the database. 00239 // Backend of WordRefs, operator[], Prefix... 00240 // 00241 virtual List *Collect(const WordReference& word); 00242 #ifndef SWIG 00243 List* dbs; 00244 int serial; 00245 int file_max; 00246 int file_min; 00247 unsigned int put_max; 00248 #endif /* SWIG */ 00249 }; 00250 00251 #endif /* _WordListMulti_h_ */ 00252