00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifdef HAVE_CONFIG_H
00014 #include "config.h"
00015 #endif
00016
00017 #include "WordListMulti.h"
00018 #include "WordListOne.h"
00019 #include "myqsort.h"
00020
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <ctype.h>
00024 #include <errno.h>
00025 #include <sys/stat.h>
00026 #include <unistd.h>
00027
00028 class WordDBMulti : public Object
00029 {
00030 public:
00031 WordDBMulti() { words = 0; size = 0; mode = 0; }
00032
00033 WordListOne *words;
00034 String filename;
00035 int mode;
00036 unsigned int size;
00037 };
00038
00039
00040
00041 WordListMulti::WordListMulti(WordContext* ncontext)
00042 {
00043 dbs = new List;
00044 context = ncontext;
00045
00046 isopen = 0;
00047 Configuration& config = context->GetConfiguration();
00048 extended = config.Boolean("wordlist_extend");
00049 verbose = config.Value("wordlist_verbose");
00050
00051 file_max = config.Value("wordlist_multi_max", 50);
00052 if(file_max < 4) file_max = 4;
00053
00054 file_min = config.Value("wordlist_multi_min", 4);
00055 if(file_min < 2) file_min = 2;
00056
00057 if(file_max < file_min) file_max = file_min * 2;
00058
00059 put_max = config.Value("wordlist_multi_put_max", 1000);
00060 if(put_max < 50) put_max = 50;
00061
00062 compressor = 0;
00063 serial = 0;
00064 }
00065
00066
00067
00068 WordListMulti::~WordListMulti()
00069 {
00070 Close();
00071 }
00072
00073
00074
00075 int WordListMulti::Open(const String& nfilename, int mode)
00076 {
00077 filename = nfilename;
00078
00079 char tmp[32];
00080 struct stat stat_buf;
00081 int i;
00082
00083
00084
00085 for(i = 0; i < file_max; i++) {
00086 String filename_one(filename);
00087 sprintf(tmp, "%08d", i);
00088 filename_one << tmp;
00089 if(stat((char*)filename_one, &stat_buf) == 0) {
00090 WordDBMulti* db = new WordDBMulti();
00091 db->words = new WordListOne(context);
00092 db->filename = filename_one;
00093 db->mode = mode;
00094 dbs->Push(db);
00095 } else {
00096 break;
00097 }
00098 }
00099 serial = i;
00100
00101
00102
00103 if(i == 0 && (flags & DB_RDONLY)) {
00104 fprintf(stderr, "WordListMulti::Open(%s, O_RDONLY): no index found\n", (char*)filename);
00105 return NOTOK;
00106 }
00107
00108 isopen = 1;
00109
00110
00111
00112
00113 if(i == 0)
00114 if(AddIndex() != OK) return NOTOK;
00115
00116 WordDBMulti* db = (WordDBMulti*)dbs->Last();
00117 if(db->words->Open(db->filename, mode) != OK)
00118 return NOTOK;
00119
00120 return OK;
00121 }
00122
00123
00124
00125 int WordListMulti::Close()
00126 {
00127 if(isopen) {
00128 WordDBMulti* db;
00129 ListCursor cursor;
00130 for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) {
00131 delete db->words;
00132 }
00133 dbs->Destroy();
00134 isopen = 0;
00135 filename.trunc();
00136 }
00137 return OK;
00138 }
00139
00140
00141
00142 unsigned int WordListMulti::Size() const
00143 {
00144 unsigned int size = 0;
00145 if(isopen) {
00146 WordDBMulti* db;
00147 ListCursor cursor;
00148 for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) {
00149 if(!db->words->isopen) {
00150 if(db->words->Open(db->filename, O_RDONLY) != OK) return 0;
00151 size += db->words->Size();
00152 if(db->words->Close() != OK) return 0;
00153 } else {
00154 size += db->words->Size();
00155 }
00156 }
00157 }
00158 return size;
00159 }
00160
00161 int WordListMulti::AddIndex()
00162 {
00163 if(Flags() & O_RDONLY) return NOTOK;
00164
00165 if(serial >= file_max)
00166 Merge();
00167
00168 char tmp[32];
00169
00170 String filename_one(filename);
00171 sprintf(tmp, "%08d", serial);
00172 filename_one << tmp;
00173 serial++;
00174
00175 WordDBMulti* db = new WordDBMulti();
00176 db->words = new WordListOne(context);
00177 db->words->extended = extended;
00178 db->filename = filename_one;
00179 dbs->Push(db);
00180
00181 return OK;
00182 }
00183
00184 static int merge_cmp_size(WordListMulti*, WordDBMulti* a, WordDBMulti* b)
00185 {
00186 return b->size - a->size;
00187 }
00188
00189 static int merge_cmp_filename(WordListMulti*, WordDBMulti* a, WordDBMulti* b)
00190 {
00191 return a->filename.compare(b->filename);
00192 }
00193
00194 int WordListMulti::Merge()
00195 {
00196 if(Flags() & DB_RDONLY) return NOTOK;
00197
00198 Configuration& config = context->GetConfiguration();
00199 int use_compress = config.Boolean("wordlist_compress");
00200
00201 WordDBMulti* db = (WordDBMulti*)dbs->Last();
00202 if(db->words->Close() != OK) return NOTOK;
00203
00204
00205
00206
00207 WordDBMulti* heap = new WordDBMulti[serial];
00208 {
00209 int i;
00210 WordDBMulti* db;
00211 ListCursor cursor;
00212 for(i = 0, dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor)); i++) {
00213 if(db->words->Open(db->filename, O_RDONLY) != OK) return NOTOK;
00214 db->size = db->words->Size();
00215 if(db->words->Close() != OK) return NOTOK;
00216
00217 heap[i] = *db;
00218 }
00219 dbs->Destroy();
00220 myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_size, (void*)this);
00221 }
00222
00223 String tmpname = filename;
00224 tmpname << ".tmp";
00225
00226 while(serial > file_min) {
00227 WordDBMulti* a = &heap[serial - 1];
00228 WordDBMulti* b = &heap[serial - 2];
00229
00230 WordListOne tmp(context);
00231 tmp.extended = 0;
00232
00233 if(a->words->Open(a->filename, O_RDONLY) != OK) return NOTOK;
00234 if(b->words->Open(b->filename, O_RDONLY) != OK) return NOTOK;
00235 if(tmp.Open(tmpname, O_RDWR) != OK) return NOTOK;
00236 if(tmp.db->CacheP() && tmp.db->CacheOff() != 0) return OK;
00237
00238 WordDBCursor* cursora = a->words->db->Cursor();
00239 WordDBCursor* cursorb = b->words->db->Cursor();
00240
00241 if(cursora->Open() != 0) return NOTOK;
00242 String keya;
00243 String dataa;
00244
00245 if(cursorb->Open() != 0) return NOTOK;
00246 String keyb;
00247 String datab;
00248
00249 int reta;
00250 int retb;
00251
00252 reta = cursora->Get(keya, dataa, DB_NEXT);
00253 retb = cursorb->Get(keyb, datab, DB_NEXT);
00254
00255
00256
00257
00258 while(reta == 0 && retb == 0) {
00259
00260
00261
00262 if(WordKey::Compare(context, keya, keyb) < 0) {
00263 if(tmp.db->Put(0, keya, dataa, 0) != 0) return NOTOK;
00264 reta = cursora->Get(keya, dataa, DB_NEXT);
00265 } else {
00266 if(tmp.db->Put(0, keyb, datab, 0) != 0) return NOTOK;
00267 retb = cursorb->Get(keyb, datab, DB_NEXT);
00268 }
00269 }
00270
00271
00272
00273
00274 if((reta != 0 && reta != DB_NOTFOUND) ||
00275 (retb != 0 && retb != DB_NOTFOUND))
00276 return NOTOK;
00277
00278
00279
00280
00281
00282 if(reta != DB_NOTFOUND || retb != DB_NOTFOUND) {
00283 String key = reta == 0 ? keya : keyb;
00284 String data = reta == 0 ? data : datab;
00285 WordDBCursor* cursor = reta == 0 ? cursora : cursorb;
00286 int ret = 0;
00287 while(ret == 0) {
00288 if(tmp.db->Put(0, key, data, 0) != 0) return NOTOK;
00289 ret = cursor->Get(key, data, DB_NEXT);
00290 }
00291 if(ret != DB_NOTFOUND)
00292 return NOTOK;
00293 }
00294
00295 delete cursora;
00296 delete cursorb;
00297
00298 a->words->Close();
00299 b->words->Close();
00300 tmp.Close();
00301
00302
00303
00304
00305 if(unlink((char*)a->filename) != 0) {
00306 const String message = String("WordListMulti::Merge: unlink ") + a->filename;
00307 perror((const char*)message);
00308 return NOTOK;
00309 }
00310 if(use_compress) {
00311 if(unlink((char*)(a->filename + String("_weakcmpr"))) != 0) {
00312 const String message = String("WordListMulti::Merge: unlink ") + a->filename + String("_weakcmpr");
00313 perror((const char*)message);
00314 return NOTOK;
00315 }
00316 }
00317
00318
00319
00320
00321 if(unlink((char*)b->filename) != 0) {
00322 const String message = String("WordListMulti::Merge: unlink ") + b->filename;
00323 perror((const char*)message);
00324 return NOTOK;
00325 }
00326 if(use_compress) {
00327 if(unlink((char*)(b->filename + String("_weakcmpr"))) != 0) {
00328 const String message = String("WordListMulti::Merge: unlink ") + b->filename + String("_weakcmpr");
00329 perror((const char*)message);
00330 return NOTOK;
00331 }
00332 }
00333
00334
00335
00336
00337 if(rename((char*)tmpname, (char*)b->filename) != 0) {
00338 const String message = String("WordListMulti::Merge: rename ") + tmpname + String(" ") + b->filename;
00339 perror((const char*)message);
00340 return NOTOK;
00341 }
00342 if(use_compress) {
00343 if(rename((char*)(tmpname + String("_weakcmpr")), (char*)(b->filename + String("_weakcmpr"))) != 0) {
00344 const String message = String("WordListMulti::Merge: rename ") + tmpname + String("_weakcmpr ") + b->filename + String("_weakcmpr");
00345 perror((const char*)message);
00346 return NOTOK;
00347 }
00348 }
00349
00350
00351
00352
00353
00354 b->size += a->size;
00355
00356
00357
00358
00359 delete a->words;
00360
00361 serial--;
00362
00363
00364
00365 myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_size, (void*)this);
00366 }
00367
00368
00369
00370
00371
00372 myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_filename, (void*)this);
00373 int i;
00374 for(i = 0; i < serial; i++) {
00375 WordDBMulti* db = new WordDBMulti();
00376 *db = heap[i];
00377
00378 String newname(filename);
00379 char tmp[32];
00380 sprintf(tmp, "%08d", i);
00381 newname << tmp;
00382
00383
00384
00385
00386 if(db->filename.compare(newname)) {
00387
00388
00389
00390 if(rename((char*)db->filename, (char*)newname) != 0) {
00391 const String message = String("WordListMulti::Merge: rename ") + db->filename + String(" ") + newname;
00392 perror((const char*)message);
00393 return NOTOK;
00394 }
00395 if(use_compress) {
00396 if(rename((char*)(db->filename + String("_weakcmpr")), (char*)(newname + String("_weakcmpr"))) != 0) {
00397 const String message = String("WordListMulti::Merge: rename ") + db->filename + String("_weakcmpr ") + newname + String("_weakcmpr");
00398 perror((const char*)message);
00399 return NOTOK;
00400 }
00401 }
00402
00403 db->filename = newname;
00404 }
00405
00406 dbs->Push(db);
00407 }
00408
00409 return OK;
00410 }
00411
00412
00413
00414 int WordListMulti::Override(const WordReference& arg)
00415 {
00416 WordDBMulti* db = (WordDBMulti*)dbs->Last();
00417
00418 if(db->words->Size() > put_max) {
00419 if(db->words->Close() != OK) return NOTOK;
00420 if(AddIndex() != OK) return NOTOK;
00421 db = (WordDBMulti*)dbs->Last();
00422 if(db->words->Open(db->filename, db->mode) != OK) return NOTOK;
00423 }
00424
00425 return db->words->Override(arg);
00426 }
00427
00428
00429 int WordListMulti::Exists(const WordReference& )
00430 {
00431 return 0;
00432 }
00433
00434
00435
00436 List *WordListMulti::operator [] (const WordReference& )
00437 {
00438 return 0;
00439 #if 0
00440 return Collect(wordRef);
00441 #endif
00442 }
00443
00444
00445
00446 List *WordListMulti::Prefix (const WordReference& )
00447 {
00448 return 0;
00449 #if 0
00450 WordReference prefix2(prefix);
00451 prefix2.Key().UndefinedWordSuffix();
00452 return Collect(prefix2);
00453 #endif
00454 }
00455
00456
00457
00458 List *WordListMulti::WordRefs()
00459 {
00460 return 0;
00461 #if 0
00462 return Collect(WordReference(context));
00463 #endif
00464 }
00465
00466
00467
00468 List *WordListMulti::Collect(const WordReference&)
00469 {
00470 return 0;
00471 #if 0
00472 WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
00473 if(search->Walk() != OK) return 0;
00474 List* result = search->GetResults();
00475 delete search;
00476 return result;
00477 #endif
00478 }
00479
00480
00481
00482
00483
00484
00485 int WordListMulti::WalkDelete(const WordReference& )
00486 {
00487 return 0;
00488 #if 0
00489 DeleteWordData data;
00490 WordCursor *description = Cursor(wordRef.Key(), delete_word, &data);
00491 description->Walk();
00492 delete description;
00493 return data.count;
00494 #endif
00495 }
00496
00497 int WordListMulti::Delete(const WordReference& )
00498 {
00499 return NOTOK;
00500 }
00501
00502
00503
00504
00505 List *WordListMulti::Words()
00506 {
00507 return 0;
00508 #if 0
00509 List *list = 0;
00510 String key;
00511 String record;
00512 WordReference lastWord(context);
00513 WordDBCursor* cursor = db.Cursor();
00514
00515 if(!cursor) return 0;
00516
00517
00518
00519
00520 const WordReference& last = WordStat::Last(context);
00521 last.Pack(key, record);
00522 if(cursor->Get(key, record, DB_SET_RANGE) != 0)
00523 return 0;
00524 list = new List;
00525 do {
00526 WordReference wordRef(context, key, record);
00527 if(lastWord.Key().GetWord().empty() ||
00528 wordRef.Key().GetWord() != lastWord.Key().GetWord())
00529 {
00530 list->Add(new String(wordRef.Key().GetWord()));
00531 lastWord = wordRef;
00532 }
00533 } while (cursor->Get(key, record, DB_NEXT) == 0);
00534
00535 return list;
00536 #endif
00537 }
00538
00539
00540
00541
00542
00543 int WordListMulti::Noccurrence(const String& , unsigned int& ) const
00544 {
00545 return 0;
00546 #if 0
00547 noccurrence = 0;
00548 WordStat stat(context, key.GetWord());
00549 int ret;
00550 if((ret = db.Get(stat)) != 0) {
00551 if(ret != DB_NOTFOUND)
00552 return NOTOK;
00553 } else {
00554 noccurrence = stat.Noccurrence();
00555 }
00556
00557 return OK;
00558 #endif
00559 }
00560
00561
00562
00563
00564
00565 int WordListMulti::Ref(const WordReference& )
00566 {
00567 return NOTOK;
00568 }
00569
00570
00571
00572
00573
00574 int WordListMulti::Unref(const WordReference& )
00575 {
00576 return NOTOK;
00577 }
00578
00579
00580
00581 int WordListMulti::AllRef() {
00582 if(!extended) return OK;
00583
00584 Merge();
00585
00586 WordDBMulti* db;
00587 ListCursor cursor;
00588 for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) {
00589 if(!db->words->isopen) {
00590 if(db->words->Open(db->filename, O_RDWR) != OK) return NOTOK;
00591 if(db->words->Close() != OK) return NOTOK;
00592 }
00593 }
00594
00595 return OK;
00596 }