WordListMulti.cc

Go to the documentation of this file.
00001 //
00002 // WordListMulti.cc
00003 //
00004 // Part of the ht://Dig package   <http://www.htdig.org/>
00005 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00006 // For copyright details, see the file COPYING in your distribution
00007 // or the GNU General Public License version 2 or later
00008 // <http://www.gnu.org/copyleft/gpl.html>
00009 //
00010 // $Id: WordListMulti_8cc-source.html,v 1.1 2008/06/08 10:13:14 sebdiaz Exp $
00011 //
00012 
00013 #ifdef HAVE_CONFIG_H
00014 #include "config.h"
00015 #endif /* HAVE_CONFIG_H */
00016 
00017 #include "WordListMulti.h"
00018 #include "WordListOne.h"
00019 #include "myqsort.h"
00020 
00021 #include <stdio.h>
00022 #include <stdlib.h>
00023 #include <ctype.h>
00024 #include <errno.h>
00025 #include <sys/stat.h>
00026 #include <unistd.h>
00027 
00028 class WordDBMulti : public Object 
00029 {
00030 public:
00031   WordDBMulti() { words = 0; size = 0; mode = 0; }
00032 
00033   WordListOne *words;
00034   String filename;
00035   int mode;
00036   unsigned int size;
00037 };
00038 
00039 // *****************************************************************************
00040 //
00041 WordListMulti::WordListMulti(WordContext* ncontext)
00042 {
00043   dbs = new List;
00044   context = ncontext;
00045   // The database itself hasn't been opened yet
00046   isopen = 0;
00047   Configuration& config = context->GetConfiguration();
00048   extended = config.Boolean("wordlist_extend");
00049   verbose =  config.Value("wordlist_verbose");
00050 
00051   file_max =  config.Value("wordlist_multi_max", 50);
00052   if(file_max < 4) file_max = 4;
00053 
00054   file_min =  config.Value("wordlist_multi_min", 4);
00055   if(file_min < 2) file_min = 2;
00056 
00057   if(file_max < file_min) file_max = file_min * 2;
00058 
00059   put_max =  config.Value("wordlist_multi_put_max", 1000);
00060   if(put_max < 50) put_max = 50;
00061 
00062   compressor = 0;
00063   serial = 0;
00064 }
00065 
00066 // *****************************************************************************
00067 //
00068 WordListMulti::~WordListMulti()
00069 {
00070   Close();
00071 }
00072 
00073 // *****************************************************************************
00074 //
00075 int WordListMulti::Open(const String& nfilename, int mode)
00076 {
00077   filename = nfilename;
00078 
00079   char tmp[32];
00080   struct stat stat_buf;
00081   int i;
00082   //
00083   // Open existing indexes
00084   //
00085   for(i = 0; i < file_max; i++) {
00086     String filename_one(filename);
00087     sprintf(tmp, "%08d", i);
00088     filename_one << tmp;
00089     if(stat((char*)filename_one, &stat_buf) == 0) {
00090       WordDBMulti* db = new WordDBMulti();
00091       db->words = new WordListOne(context);
00092       db->filename = filename_one;
00093       db->mode = mode;
00094       dbs->Push(db);
00095     } else {
00096       break;
00097     }
00098   }
00099   serial = i;
00100   //
00101   // If no indexes exists and read-only, abort
00102   //
00103   if(i == 0 && (flags & DB_RDONLY)) {
00104     fprintf(stderr, "WordListMulti::Open(%s, O_RDONLY): no index found\n", (char*)filename);
00105     return NOTOK;
00106   }
00107 
00108   isopen = 1;
00109 
00110   //
00111   // If no indexes exists and read/write, create the first
00112   //
00113   if(i == 0)
00114     if(AddIndex() != OK) return NOTOK;
00115 
00116   WordDBMulti* db = (WordDBMulti*)dbs->Last();
00117   if(db->words->Open(db->filename, mode) != OK)
00118     return NOTOK;
00119 
00120   return OK;
00121 }
00122 
00123 // *****************************************************************************
00124 //
00125 int WordListMulti::Close()
00126 {
00127   if(isopen) {
00128     WordDBMulti* db;
00129     ListCursor cursor;
00130     for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) {
00131       delete db->words;
00132     }
00133     dbs->Destroy();
00134     isopen = 0;
00135     filename.trunc();
00136   }
00137   return OK;
00138 }
00139 
00140 // ****************************************************************************
00141 //
00142 unsigned int WordListMulti::Size() const 
00143 {
00144   unsigned int size = 0;
00145   if(isopen) {
00146     WordDBMulti* db;
00147     ListCursor cursor;
00148     for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) {
00149       if(!db->words->isopen) {
00150         if(db->words->Open(db->filename, O_RDONLY) != OK) return 0;
00151         size += db->words->Size();
00152         if(db->words->Close() != OK) return 0;
00153       } else {
00154         size += db->words->Size();
00155       }
00156     }
00157   }
00158   return size;
00159 }
00160 
00161 int WordListMulti::AddIndex()
00162 {
00163   if(Flags() & O_RDONLY) return NOTOK;
00164 
00165   if(serial >= file_max)
00166     Merge();
00167 
00168   char tmp[32];
00169 
00170   String filename_one(filename);
00171   sprintf(tmp, "%08d", serial);
00172   filename_one << tmp;
00173   serial++;
00174 
00175   WordDBMulti* db = new WordDBMulti();
00176   db->words = new WordListOne(context);
00177   db->words->extended = extended;
00178   db->filename = filename_one;
00179   dbs->Push(db);
00180 
00181   return OK;
00182 }
00183 
00184 static int merge_cmp_size(WordListMulti*, WordDBMulti* a, WordDBMulti* b)
00185 {
00186   return b->size - a->size;
00187 }
00188 
00189 static int merge_cmp_filename(WordListMulti*, WordDBMulti* a, WordDBMulti* b)
00190 {
00191   return a->filename.compare(b->filename);
00192 }
00193 
00194 int WordListMulti::Merge()
00195 {
00196   if(Flags() & DB_RDONLY) return NOTOK;
00197 
00198   Configuration& config = context->GetConfiguration();
00199   int use_compress = config.Boolean("wordlist_compress");
00200 
00201   WordDBMulti* db = (WordDBMulti*)dbs->Last();
00202   if(db->words->Close() != OK) return NOTOK;
00203 
00204   //
00205   // heap lists all the files in decreasing size order (biggest first)
00206   //
00207   WordDBMulti* heap = new WordDBMulti[serial];
00208   {
00209     int i;
00210     WordDBMulti* db;
00211     ListCursor cursor;
00212     for(i = 0, dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor)); i++) {
00213       if(db->words->Open(db->filename, O_RDONLY) != OK) return NOTOK;
00214       db->size = db->words->Size();
00215       if(db->words->Close() != OK) return NOTOK;
00216       
00217       heap[i] = *db;
00218     }
00219     dbs->Destroy();
00220     myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_size, (void*)this);
00221   }
00222   
00223   String tmpname = filename;
00224   tmpname << ".tmp";
00225 
00226   while(serial > file_min) {
00227     WordDBMulti* a = &heap[serial - 1];
00228     WordDBMulti* b = &heap[serial - 2];
00229 
00230     WordListOne tmp(context);
00231     tmp.extended = 0;
00232 
00233     if(a->words->Open(a->filename, O_RDONLY) != OK) return NOTOK;
00234     if(b->words->Open(b->filename, O_RDONLY) != OK) return NOTOK;
00235     if(tmp.Open(tmpname, O_RDWR) != OK) return NOTOK;
00236     if(tmp.db->CacheP() && tmp.db->CacheOff() != 0) return OK;
00237 
00238     WordDBCursor* cursora = a->words->db->Cursor();
00239     WordDBCursor* cursorb = b->words->db->Cursor();
00240 
00241     if(cursora->Open() != 0) return NOTOK;
00242     String keya;
00243     String dataa;
00244 
00245     if(cursorb->Open() != 0) return NOTOK;
00246     String keyb;
00247     String datab;
00248 
00249     int reta;
00250     int retb;
00251 
00252     reta = cursora->Get(keya, dataa, DB_NEXT);
00253     retb = cursorb->Get(keyb, datab, DB_NEXT);
00254       
00255       //
00256       // Merge while there are entries in both indexes
00257       //
00258     while(reta == 0 && retb == 0) {
00259       //
00260       // If keya lower than keyb
00261       //
00262       if(WordKey::Compare(context, keya, keyb) < 0) {
00263         if(tmp.db->Put(0, keya, dataa, 0) != 0) return NOTOK;
00264         reta = cursora->Get(keya, dataa, DB_NEXT);
00265       } else {
00266         if(tmp.db->Put(0, keyb, datab, 0) != 0) return NOTOK;
00267         retb = cursorb->Get(keyb, datab, DB_NEXT);
00268       }
00269     }
00270 
00271     //
00272     // Sanity check
00273     //
00274     if((reta != 0 && reta != DB_NOTFOUND) ||
00275        (retb != 0 && retb != DB_NOTFOUND))
00276       return NOTOK;
00277 
00278       //
00279       // Flush the remaining entries from the index that is
00280       // not yet empty.
00281       //
00282     if(reta != DB_NOTFOUND || retb != DB_NOTFOUND) {
00283       String key = reta == 0 ? keya : keyb;
00284       String data = reta == 0 ? data : datab;
00285       WordDBCursor* cursor = reta == 0 ? cursora : cursorb;
00286       int ret = 0;
00287       while(ret == 0) {
00288         if(tmp.db->Put(0, key, data, 0) != 0) return NOTOK;
00289         ret = cursor->Get(key, data, DB_NEXT);
00290       }
00291       if(ret != DB_NOTFOUND)
00292         return NOTOK;
00293     }
00294       
00295     delete cursora;
00296     delete cursorb;
00297 
00298     a->words->Close();
00299     b->words->Close();
00300     tmp.Close();
00301 
00302     //
00303     // Remove file a
00304     //
00305     if(unlink((char*)a->filename) != 0) {
00306       const String message = String("WordListMulti::Merge: unlink ") + a->filename;
00307       perror((const char*)message);
00308       return NOTOK;
00309     }
00310     if(use_compress) {
00311       if(unlink((char*)(a->filename + String("_weakcmpr"))) != 0) {
00312         const String message = String("WordListMulti::Merge: unlink ") + a->filename + String("_weakcmpr");
00313         perror((const char*)message);
00314         return NOTOK;
00315       }
00316     }
00317 
00318     //
00319     // Remove file b
00320     //
00321     if(unlink((char*)b->filename) != 0) {
00322       const String message = String("WordListMulti::Merge: unlink ") + b->filename;
00323       perror((const char*)message);
00324       return NOTOK;
00325     }
00326     if(use_compress) {
00327       if(unlink((char*)(b->filename + String("_weakcmpr"))) != 0) {
00328         const String message = String("WordListMulti::Merge: unlink ") + b->filename + String("_weakcmpr");
00329         perror((const char*)message);
00330         return NOTOK;
00331       }
00332     }
00333 
00334     //
00335     // Rename tmp file into file b
00336     //
00337     if(rename((char*)tmpname, (char*)b->filename) != 0) {
00338       const String message = String("WordListMulti::Merge: rename ") + tmpname + String(" ") + b->filename;
00339       perror((const char*)message);
00340       return NOTOK;
00341     }
00342     if(use_compress) {
00343       if(rename((char*)(tmpname + String("_weakcmpr")), (char*)(b->filename + String("_weakcmpr"))) != 0) {
00344         const String message = String("WordListMulti::Merge: rename ") + tmpname + String("_weakcmpr ") + b->filename + String("_weakcmpr");
00345         perror((const char*)message);
00346         return NOTOK;
00347       }
00348     }
00349 
00350     //
00351     // Update b file size. The size need not be accurate number as long
00352     // as it reflects the relative size of each file.
00353     //
00354     b->size += a->size;
00355 
00356     //
00357     // The 'a' index is no longer in use
00358     //
00359     delete a->words;
00360     
00361     serial--;
00362     //
00363     // update heap
00364     //
00365     myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_size, (void*)this);
00366   }
00367 
00368   //
00369   // Rename the indexes so that they are in increasing order
00370   // and push them in the list of active indexes.
00371   //
00372   myqsort((void*)heap, serial, sizeof(WordDBMulti), (myqsort_cmp)merge_cmp_filename, (void*)this);
00373   int i;
00374   for(i = 0; i < serial; i++) {
00375     WordDBMulti* db = new WordDBMulti();
00376     *db = heap[i];
00377 
00378     String newname(filename);
00379     char tmp[32];
00380     sprintf(tmp, "%08d", i);
00381     newname << tmp;
00382 
00383     //
00384     // Rename if not equal
00385     //
00386     if(db->filename.compare(newname)) {
00387       //
00388       // Rename db index into newname
00389       //
00390       if(rename((char*)db->filename, (char*)newname) != 0) {
00391         const String message = String("WordListMulti::Merge: rename ") + db->filename + String(" ") + newname;
00392         perror((const char*)message);
00393         return NOTOK;
00394       }
00395       if(use_compress) {
00396         if(rename((char*)(db->filename + String("_weakcmpr")), (char*)(newname + String("_weakcmpr"))) != 0) {
00397           const String message = String("WordListMulti::Merge: rename ") + db->filename + String("_weakcmpr ") + newname + String("_weakcmpr");
00398           perror((const char*)message);
00399           return NOTOK;
00400         }
00401       }
00402 
00403       db->filename = newname;
00404     }
00405 
00406     dbs->Push(db);
00407   }
00408 
00409   return OK;
00410 }
00411 
00412 // ****************************************************************************
00413 //
00414 int WordListMulti::Override(const WordReference& arg)
00415 {
00416   WordDBMulti* db = (WordDBMulti*)dbs->Last();
00417 
00418   if(db->words->Size() > put_max) {
00419     if(db->words->Close() != OK) return NOTOK;
00420     if(AddIndex() != OK) return NOTOK;
00421     db = (WordDBMulti*)dbs->Last();
00422     if(db->words->Open(db->filename, db->mode) != OK) return NOTOK;
00423   }
00424 
00425   return db->words->Override(arg);
00426 }
00427 
00428 // *****************************************************************************
00429 int WordListMulti::Exists(const WordReference& )
00430 {
00431   return 0;
00432 }
00433 
00434 // *****************************************************************************
00435 //
00436 List *WordListMulti::operator [] (const WordReference& )
00437 {
00438   return 0;
00439 #if 0
00440   return Collect(wordRef);
00441 #endif
00442 }
00443 
00444 // *****************************************************************************
00445 //
00446 List *WordListMulti::Prefix (const WordReference& )
00447 {
00448   return 0;
00449 #if 0
00450   WordReference prefix2(prefix);
00451   prefix2.Key().UndefinedWordSuffix();
00452   return Collect(prefix2);
00453 #endif
00454 }
00455 
00456 // *****************************************************************************
00457 //
00458 List *WordListMulti::WordRefs()
00459 {
00460   return 0;
00461 #if 0
00462   return Collect(WordReference(context));
00463 #endif
00464 }
00465 
00466 // *****************************************************************************
00467 //
00468 List *WordListMulti::Collect(const WordReference&)
00469 {
00470   return 0;
00471 #if 0
00472   WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
00473   if(search->Walk() != OK) return 0;
00474   List* result = search->GetResults();
00475   delete search;
00476   return result;
00477 #endif
00478 }
00479 
00480 // *****************************************************************************
00481 // 
00482 // Delete all records matching wordRef, return the number of 
00483 // deleted records.
00484 //
00485 int WordListMulti::WalkDelete(const WordReference& )
00486 {
00487   return 0;
00488 #if 0
00489   DeleteWordData data;
00490   WordCursor *description = Cursor(wordRef.Key(), delete_word, &data);
00491   description->Walk();
00492   delete description;
00493   return data.count;
00494 #endif
00495 }
00496 
00497 int WordListMulti::Delete(const WordReference& )
00498 {
00499   return NOTOK;
00500 }
00501 
00502 // *****************************************************************************
00503 //
00504 //
00505 List *WordListMulti::Words()
00506 {
00507   return 0;
00508 #if 0
00509   List          *list = 0;
00510   String                key;
00511   String                record;
00512   WordReference lastWord(context);
00513   WordDBCursor*         cursor = db.Cursor();
00514 
00515   if(!cursor) return 0;
00516 
00517   //
00518   // Move past the first word count record
00519   //
00520   const WordReference& last = WordStat::Last(context);
00521   last.Pack(key, record);
00522   if(cursor->Get(key, record, DB_SET_RANGE) != 0)
00523     return 0;
00524   list = new List;
00525   do {
00526     WordReference       wordRef(context, key, record);
00527     if(lastWord.Key().GetWord().empty() ||
00528        wordRef.Key().GetWord() != lastWord.Key().GetWord()) 
00529       {
00530         list->Add(new String(wordRef.Key().GetWord()));
00531         lastWord = wordRef;
00532       }
00533   } while (cursor->Get(key, record, DB_NEXT) == 0);
00534     
00535   return list;
00536 #endif
00537 }
00538 
00539 // *****************************************************************************
00540 //
00541 // Returns the reference count for word in <count> arg
00542 //
00543 int WordListMulti::Noccurrence(const String& , unsigned int& ) const
00544 {
00545   return 0;
00546 #if 0
00547   noccurrence = 0;
00548   WordStat stat(context, key.GetWord());
00549   int ret;
00550   if((ret = db.Get(stat)) != 0) {
00551     if(ret != DB_NOTFOUND)
00552       return NOTOK;
00553   } else {
00554     noccurrence = stat.Noccurrence();
00555   }
00556 
00557   return OK;
00558 #endif
00559 }
00560 
00561 // *****************************************************************************
00562 //
00563 // Increment reference count for wordRef
00564 //
00565 int WordListMulti::Ref(const WordReference& )
00566 {
00567   return NOTOK;
00568 }
00569 
00570 // *****************************************************************************
00571 //
00572 // Decrement reference count for wordRef
00573 //
00574 int WordListMulti::Unref(const WordReference& )
00575 {
00576   return NOTOK;
00577 }
00578 
00579 // *****************************************************************************
00580 //
00581 int WordListMulti::AllRef() {
00582   if(!extended) return OK;
00583 
00584   Merge();
00585 
00586   WordDBMulti* db;
00587   ListCursor cursor;
00588   for(dbs->Start_Get(cursor); (db = (WordDBMulti*)dbs->Get_Next(cursor));) {
00589     if(!db->words->isopen) {
00590       if(db->words->Open(db->filename, O_RDWR) != OK) return NOTOK;
00591       if(db->words->Close() != OK) return NOTOK;
00592     }
00593   }
00594 
00595   return OK;
00596 }

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5