00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifdef HAVE_CONFIG_H
00014 #include "config.h"
00015 #endif
00016
00017 #include "WordListOne.h"
00018 #include "WordReference.h"
00019 #include "WordRecord.h"
00020 #include "WordType.h"
00021 #include "WordContext.h"
00022 #include "Configuration.h"
00023 #include "htString.h"
00024 #include "HtTime.h"
00025 #include "WordDBCompress.h"
00026 #include "WordDBCache.h"
00027 #include "WordDead.h"
00028 #include "WordMeta.h"
00029
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033 #include <ctype.h>
00034 #include <errno.h>
00035
00036
00037
00038 WordListOne::WordListOne(WordContext* ncontext)
00039 {
00040 context = ncontext;
00041 db = new WordDB(ncontext->GetDBInfo());
00042 dict = new WordDict();
00043 dict->Initialize(this);
00044 meta = new WordMeta();
00045 meta->Initialize(this);
00046 dead = new WordDead();
00047 dead->Initialize(this);
00048
00049
00050 isopen = 0;
00051 Configuration& config = context->GetConfiguration();
00052 extended = config.Boolean("wordlist_extend");
00053 verbose = config.Value("wordlist_verbose");
00054 compressor = 0;
00055 caches = 0;
00056 flags = 0;
00057 }
00058
00059
00060
00061 WordListOne::~WordListOne()
00062 {
00063 BatchEnd();
00064 Close();
00065 delete dead;
00066 delete meta;
00067 delete dict;
00068 delete db;
00069 }
00070
00071 static int word_db_qcmp(WordContext* context, const WordDBCacheEntry *a, const WordDBCacheEntry *b)
00072 {
00073 return WordKey::Compare(context, (const unsigned char*)a->key, a->key_size, (const unsigned char*)b->key, b->key_size);
00074 }
00075
00076
00077
00078 int WordListOne::Open(const String& nfilename, int mode)
00079 {
00080 filename = nfilename;
00081
00082 int usecompress = 0;
00083 Configuration& config = context->GetConfiguration();
00084
00085 if(config.Boolean("wordlist_compress") == 1) {
00086 usecompress = DB_COMPRESS;
00087 WordDBCompress* compressor = new WordDBCompress(context);
00088
00089 SetCompressor(compressor);
00090
00091 context->GetDBInfo().dbenv->mp_cmpr_info = compressor->CmprInfo();
00092 context->GetDBInfo().dbenv->flags |= DB_ENV_CMPR;
00093 }
00094
00095 flags = (mode & O_RDWR) ? DB_CREATE : DB_RDONLY;
00096 flags |= usecompress;
00097 if(mode & O_TRUNC) {
00098 if(mode & O_RDWR) {
00099 unlink((char*)filename);
00100 } else
00101 fprintf(stderr, "WordListOne::Open: O_TRUNC | O_RDONLY is meaningless\n");
00102 }
00103
00104 WordLock* lock;
00105 Meta()->Lock("open", lock);
00106
00107 db->set_bt_compare(word_db_cmp, (void*)context);
00108
00109 if(config.Boolean("wordlist_cache_inserts", 0)) {
00110 int size = config.Value("wordlist_cache_size", 0);
00111 if(size / 2 < WORD_DB_CACHE_MINIMUM)
00112 size = 0;
00113 else
00114 size /= 2;
00115
00116 db->CacheOn(context, size);
00117 db->CacheCompare(word_db_qcmp);
00118 }
00119
00120 db->set_pagesize(Pagesize());
00121
00122 int ret = db->Open(filename, "index", DB_BTREE, flags, 0666, WORD_DB_INDEX) == 0 ? OK : NOTOK;
00123 if(ret == NOTOK) return ret;
00124 if(dict->Open() != OK) return NOTOK;
00125 if(meta->Open() != OK) return NOTOK;
00126 if(dead->Open() != OK) return NOTOK;
00127
00128 isopen = 1;
00129
00130 Meta()->Unlock("open", lock);
00131
00132 return ret;
00133 }
00134
00135
00136
00137 int WordListOne::Close()
00138 {
00139 if(isopen) {
00140 if(db->Close() != 0) return NOTOK;
00141 if(dict->Close() != 0) return NOTOK;
00142 if(meta->Close() != 0) return NOTOK;
00143 if(dead->Close() != 0) return NOTOK;
00144 isopen = 0;
00145 }
00146
00147 {
00148 WordDBCompress* compressor = GetCompressor();
00149 if(compressor) {
00150 delete compressor;
00151 SetCompressor(0);
00152 }
00153 delete context->GetDBInfo().dbenv->mp_cmpr_info;
00154 context->GetDBInfo().dbenv->mp_cmpr_info = 0;
00155 context->GetDBInfo().dbenv->flags &= ~DB_ENV_CMPR;
00156 }
00157
00158 return OK;
00159 }
00160
00161
00162
00163 unsigned int WordListOne::Size() const
00164 {
00165 return db->Size();
00166 }
00167
00168
00169
00170 int WordListOne::Override(const WordReference& arg)
00171 {
00172 if (arg.GetWord().length() == 0) {
00173 fprintf(stderr, "WordListOne::Override(%s) word is zero length\n", (char*)arg.Get());
00174 return NOTOK;
00175 }
00176 if (!arg.Key().Filled()) {
00177 fprintf(stderr, "WordListOne::Override(%s) key is not fully defined\n", (char*)arg.Get());
00178 return NOTOK;
00179 }
00180
00181 WordType& wtype = context->GetType();
00182 WordReference wordRef(arg);
00183 String word = wordRef.GetWord();
00184 if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
00185 return NOTOK;
00186 wordRef.SetWord(word);
00187 unsigned int wordid = 0;
00188 if(dict->SerialRef(word, wordid) != OK) return NOTOK;
00189 wordRef.Key().Set(WORD_KEY_WORD, wordid);
00190
00191 int ret = NOTOK;
00192
00193 if(caches) {
00194 String key;
00195 String record;
00196 if(wordRef.Pack(key, record) != OK)
00197 return NOTOK;
00198 ret = caches->Add(key.get(), key.length(), record.get(), record.length()) == 0 ? OK : NOTOK;
00199 if(caches->Full()) caches->Merge(*db);
00200 } else {
00201 ret = db->Put(wordRef, 0) == 0 ? OK : NOTOK;
00202 }
00203
00204 return ret;
00205 }
00206
00207
00208
00209
00210 List *WordListOne::operator [] (const WordReference& wordRef)
00211 {
00212 return Collect(wordRef);
00213 }
00214
00215
00216
00217 List *WordListOne::Prefix (const WordReference& prefix)
00218 {
00219 List* result = new List();
00220 WordDictCursor* cursor = Dict()->CursorPrefix(prefix.GetWord());
00221 String word;
00222 WordDictRecord record;
00223 WordReference prefix2(prefix);
00224 while(Dict()->NextPrefix(cursor, word, record) == 0) {
00225 prefix2.Key().Set(WORD_KEY_WORD, record.Id());
00226 List* tmp_result = Collect(prefix2);
00227 while(tmp_result->Count() > 0) {
00228 WordReference* entry = (WordReference*)tmp_result->Shift(LIST_REMOVE_RELEASE);
00229 entry->SetWord(word);
00230 result->Push(entry);
00231 }
00232 delete tmp_result;
00233 }
00234 return result;
00235 }
00236
00237
00238
00239 List *WordListOne::WordRefs()
00240 {
00241 return Collect(WordReference(context));
00242 }
00243
00244
00245
00246 List *WordListOne::Collect(const WordReference& wordRef)
00247 {
00248 WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
00249 if(search->Walk() != OK) return 0;
00250 List* result = search->GetResults();
00251 delete search;
00252 return result;
00253 }
00254
00255
00256
00257 int
00258 WordListOne::Read(FILE* f)
00259 {
00260 WordReference wordRef(context);
00261 #define WORD_BUFFER_SIZE 1024
00262 char buffer[WORD_BUFFER_SIZE + 1];
00263 String line;
00264 int line_number = 0;
00265 int inserted = 0;
00266
00267 BatchStart();
00268
00269 String key;
00270 String record;
00271
00272 while(fgets(buffer, WORD_BUFFER_SIZE, f)) {
00273 line_number++;
00274 int buffer_length = strlen(buffer);
00275 int eol = buffer[buffer_length - 1] == '\n';
00276
00277 if(eol) buffer[--buffer_length] = '\0';
00278
00279 line.append(buffer, buffer_length);
00280
00281
00282
00283 if(!eol) continue;
00284
00285
00286
00287 if(line.last() == '\\') {
00288 line.chop(1);
00289 continue;
00290 }
00291
00292 if(!line.empty()) {
00293 StringList fields(line, "\t ");
00294
00295
00296
00297
00298 String* word = (String*)fields.Get_First();
00299 unsigned int wordid;
00300 if(dict->SerialRef(*word, wordid) != OK) return NOTOK;
00301 word->trunc();
00302 (*word) << wordid;
00303
00304 if(wordRef.SetList(fields) != OK) {
00305 fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
00306 fprintf(stderr, " cannot build WordReference (ignored)\n");
00307 } else {
00308 if(wordRef.Pack(key, record) != OK) {
00309 fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
00310 fprintf(stderr, " pack failed (ignored)\n");
00311 } else {
00312 caches->Add(key.get(), key.length(), record.get(), record.length());
00313 inserted++;
00314 }
00315 if(verbose && (inserted % 10000 == 0)) fprintf(stderr, "WordList::Read: inserted %d entries\n", inserted);
00316 if(verbose > 1) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)wordRef.Get());
00317 }
00318
00319 line.trunc();
00320 }
00321 }
00322
00323 BatchEnd();
00324
00325 return inserted;
00326 }
00327
00328
00329
00330
00331 class FileOutData : public Object
00332 {
00333 public:
00334 FILE* f;
00335 String word;
00336 FileOutData(FILE* f_arg) : f(f_arg) { }
00337 };
00338
00339
00340
00341 static int
00342 wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *wordRef, Object &ndata)
00343 {
00344 FileOutData& data = (FileOutData&)ndata;
00345 ((WordReference*)wordRef)->SetWord(data.word);
00346 fprintf(data.f, "%s\n", (char*)wordRef->Get());
00347 return OK;
00348 }
00349
00350 int WordListOne::Write(FILE* f)
00351 {
00352 FileOutData data(f);
00353 WordDictCursor* cursor = dict->Cursor();
00354 int ret;
00355 String word;
00356 WordDictRecord wordinfo;
00357 while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
00358 WordKey key(context);
00359 key.Set(WORD_KEY_WORD, wordinfo.Id());
00360 data.word = word;
00361 WordCursor *search = Cursor(key, wordlist_walk_callback_file_out, (Object *)&data);
00362 search->Walk();
00363 delete search;
00364 }
00365 return ret == DB_NOTFOUND ? OK : NOTOK;
00366 }
00367
00368
00369
00370
00371
00372
00373 class DeleteWordData : public Object
00374 {
00375 public:
00376 DeleteWordData() { count = 0; }
00377
00378 int count;
00379 };
00380
00381
00382
00383
00384 static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
00385 {
00386 WordListOne *words_one = (WordListOne*)words;
00387 if(words_one->DeleteCursor(cursor) == 0) {
00388 ((DeleteWordData&)data).count++;
00389 return OK;
00390 } else {
00391 fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
00392 return NOTOK;
00393 }
00394 }
00395
00396
00397
00398
00399
00400
00401 int WordListOne::WalkDelete(const WordReference& wordRef)
00402 {
00403 DeleteWordData data;
00404 WordKey key = wordRef.Key();
00405
00406 if(key.IsDefined(WORD_KEY_WORD)) {
00407 WordCursor *description = Cursor(key, delete_word, &data);
00408 description->Walk();
00409 delete description;
00410 dict->Decr(wordRef.GetWord(), data.count);
00411 } else {
00412 WordDictCursor* cursor = dict->Cursor();
00413 int ret;
00414 String word;
00415 WordDictRecord wordinfo;
00416 int total = 0;
00417 while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
00418 key.Set(WORD_KEY_WORD, wordinfo.Id());
00419 WordCursor *search = Cursor(key, delete_word, &data);
00420 search->Walk();
00421 delete search;
00422 dict->Decr(word, data.count);
00423 total += data.count;
00424 data.count = 0;
00425 }
00426 data.count = total;
00427 }
00428 return data.count;
00429 }
00430
00431
00432
00433
00434
00435 int WordListOne::Noccurrence(const String& word, unsigned int& noccurrence) const
00436 {
00437 return dict->Noccurrence(word, noccurrence);
00438 }
00439
00440 WordKey WordListOne::Key(const String& bufferin)
00441 {
00442 WordKey key(context);
00443 StringList fields(bufferin, "\t ");
00444 String* field = (String*)fields.Get_First();
00445 unsigned int wordid;
00446 Dict()->Serial(*field, wordid);
00447 field->trunc();
00448 (*field) << wordid;
00449 key.SetList(fields);
00450 return key;
00451 }
00452
00453 WordReference WordListOne::Word(const String& bufferin, int exists )
00454 {
00455 WordReference wordRef(context);
00456 StringList fields(bufferin, "\t ");
00457 String* field = (String*)fields.Get_First();
00458 if(context->GetType().Normalize(*field) & WORD_NORMALIZE_NOTOK) {
00459
00460
00461
00462
00463 if(!exists)
00464 fprintf(stderr, "WordListOne::Word: cannot normalize word %s\n", (char*)*field);
00465 }
00466 String word = *field;
00467 unsigned int wordid;
00468 if(exists)
00469 Dict()->SerialExists(word, wordid);
00470 else
00471 Dict()->Serial(word, wordid);
00472 field->trunc();
00473 (*field) << wordid;
00474 wordRef.SetList(fields);
00475 wordRef.SetWord(word);
00476 return wordRef;
00477 }
00478
00479 void
00480 WordListOne::BatchEnd()
00481 {
00482 if(caches) {
00483 caches->Merge(*db);
00484 WordList::BatchEnd();
00485 }
00486 }