WordKey.cc

Go to the documentation of this file.
00001 //
00002 // WordKey.cc
00003 //
00004 // WordKey: All the functions are implemented regardless of the actual
00005 //          structure of the key using word_key_info.
00006 //          WARNING: although it may seem that you can have two String 
00007 //          fields in the key, some code does not support that. This should
00008 //          not be a problem since the goal of the WordKey class is to
00009 //          implement the keys of an inverted index.
00010 //
00011 // Part of the ht://Dig package   <http://www.htdig.org/>
00012 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00013 // For copyright details, see the file COPYING in your distribution
00014 // or the GNU General Public License version 2 or later
00015 // <http://www.gnu.org/copyleft/gpl.html>
00016 //
00017 // $Id: WordKey_8cc-source.html,v 1.1 2008/06/08 10:13:12 sebdiaz Exp $
00018 //
00019 
00020 #ifdef HAVE_CONFIG_H
00021 #include "config.h"
00022 #endif /* HAVE_CONFIG_H */
00023 
00024 #include <stdlib.h>
00025 #include <ctype.h>
00026 
00027 #include "clib.h"
00028 #include "WordKey.h"
00029 #include "ber.h"
00030 #include "HtMaxMin.h"
00031 
00032 //
00033 // Returns OK if fields set in 'object' and 'other' are all equal.
00034 //
00035 // Fields not set in either 'object' or 'other' are ignored 
00036 // completely. If the prefix_length is > 0 the 'object' String
00037 // fields are compared to the prefix_length bytes of the 'other'
00038 // String fields only.
00039 //
00040 // This function is useful to compare existing keys with a search
00041 // criterion that may be incomplete. For instance if we look for keys
00042 // that contain words starting with a given prefix or keys that
00043 // are located in a specific document, regardless of their location
00044 // in the document.
00045 //
00046 int WordKey::Equal(const WordKey& other) const
00047 {
00048   const WordKeyInfo& info = context->GetKeyInfo();
00049   //
00050   // Walk the fields in sorting order. As soon as one of them
00051   // does not compare equal, return.
00052   //
00053   for(int j = 0; j < info.nfields; j++) 
00054   {
00055     //
00056     // Only compare fields that are set in both key
00057     //
00058     if(!IsDefined(j) || !other.IsDefined(j)) continue;
00059 
00060     if(Get(j) != other.Get(j)) return 0;
00061   }
00062   return 1;
00063 }
00064 
00065 int 
00066 WordKey::Cmp(const WordKey& other) const
00067 {
00068   const WordKeyInfo& info = context->GetKeyInfo();
00069 
00070   for(int j = 0; j < info.nfields; j++) {
00071     if(!IsDefined(j) || !other.IsDefined(j)) continue;
00072 
00073     if(Get(j) != other.Get(j)) {
00074       return Get(j) > other.Get(j) ? 1 : -1;
00075     }
00076   }
00077   return 0;
00078 }
00079 
00080 //
00081 // Compare <a> and <b> in the Berkeley DB fashion. 
00082 // <a> and <b> are packed keys.
00083 //
00084 int 
00085 WordKey::Compare(WordContext* context, const unsigned char *a, int a_length, const unsigned char *b, int b_length)
00086 {
00087   const WordKeyInfo& info = context->GetKeyInfo();
00088   int bytes;
00089   ber_t a_value;
00090   ber_t b_value;
00091 
00092   for(int j = 0; j < info.nfields; j++) {
00093     if((bytes = ber_buf2value(a, a_length, a_value)) < 1) {
00094       fprintf(stderr, "WordKey::Compare: failed to retrieve field %d for a\n", j);
00095       abort();
00096     }
00097     a += bytes;
00098     a_length -= bytes;
00099     if((bytes = ber_buf2value(b, b_length, b_value)) < 1) {
00100       fprintf(stderr, "WordKey::Compare: failed to retrieve field %d for b\n", j);
00101       abort();
00102     }
00103     b += bytes;
00104     b_length -= bytes;
00105 
00106     if(a_value != b_value)
00107       return a_value > b_value ? 1 : -1;
00108   }
00109 
00110   //
00111   // If we reach this point, everything compared equal
00112   //
00113   return 0;
00114 }
00115 
00116 //
00117 // Compare <a> and <b> in the Berkeley DB fashion. 
00118 // <a> and <b> are packed keys.
00119 //
00120 int 
00121 WordKey::Compare(WordContext* context, const String& a, const String& b)
00122 {
00123   return WordKey::Compare(context, (const unsigned char*)a, a.length(), (const unsigned char*)b, b.length());
00124 }
00125 
00126 //
00127 // C comparison function interface for Berkeley DB (bt_compare)
00128 // Just call the static Compare function of WordKey. It is *critical*
00129 // that this function is as fast as possible. See the Berkeley DB
00130 // documentation for more information on the return values.
00131 //
00132 int
00133 word_db_cmp(const DBT *a, const DBT *b)
00134 {
00135   WordContext* context = (WordContext*)a->app_private;
00136   if(context == 0) {
00137     fprintf(stderr, "word_db_cmp: null context\n");
00138     abort();
00139   }
00140   return WordKey::Compare(context, (const unsigned char*)a->data, a->size, (const unsigned char*)b->data, b->size);
00141 }
00142 
00143 //
00144 // Compare current key defined fields with other key defined fields only,
00145 // ignore fields that are not defined in key or other. Return 1 if different
00146 // 0 if equal. If different, position is set to the field number that differ,
00147 // lower is set to 1 if Get(position) is lower than other.Get(position) otherwise
00148 // lower is set to 0.
00149 //
00150 int WordKey::Diff(const WordKey& other, int& position, int& lower)
00151 {
00152   position = -1;
00153 
00154   int nfields=WordKey::NFields();
00155 
00156   int i;
00157   for(i = 0; i < nfields; i++) {
00158     if(IsDefined(i) && other.IsDefined(i) &&
00159        Get(i) != other.Get(i)) {
00160       lower = Get(i) < other.Get(i);
00161       break;
00162     }
00163   }
00164   if(i < nfields)
00165     position = i;
00166 
00167   return position >= 0;
00168 }
00169 
00170 //
00171 // Compare object and <other> using comparison of their packed form
00172 //
00173 int 
00174 WordKey::PackEqual(const WordKey& other) const
00175 {
00176   String this_pack;
00177   Pack(this_pack);
00178 
00179   String other_pack;
00180   other.Pack(other_pack);
00181 
00182   return this_pack == other_pack;
00183 }
00184 
00185 //
00186 // Implement ++ on a key.
00187 //
00188 // It behaves like arithmetic but follows these rules:
00189 // . Increment starts at field <position>
00190 // . If a field value overflows, increment field <position> - 1
00191 // . Undefined fields are ignored and their value untouched
00192 // . Incrementing the word field is done by appending \001
00193 // . When a field is incremented all fields to the left are set to 0
00194 // If position is not specified it is equivalent to NFields() - 1.
00195 // It returns OK if successfull, NOTOK if position out of range or
00196 // WORD_FOLLOWING_ATEND if the maximum possible value was reached.
00197 //
00198 // Examples assuming numerical fields are 8 bits wide:
00199 // 
00200 // 0                 1     2     3       OPERATION             RESULT
00201 // ---------------------------------------------------------------------------------------
00202 // foo     <DEF>     1     1     1 -> SetToFollowing(3) -> foo     <DEF>     1     1     2
00203 // foo     <DEF>     1     1     1 -> SetToFollowing(2) -> foo     <DEF>     1     2     0
00204 // foo     <DEF>     1     1   255 -> SetToFollowing(3) -> foo     <DEF>     1     2     0
00205 // foo     <DEF>   255   255   255 -> SetToFollowing(3) -> foo\001 <DEF>     0     0     0
00206 // foo     <DEF>   255     1     1 -> SetToFollowing(1) -> foo\001 <DEF>     0     0     0
00207 // <UNDEF><UNDEF>  255     1     1 -> SetToFollowing(1) -> WORD_FOLLOWING_ATEND
00208 // foo     <DEF>     1 <UNDEF> 255 -> SetToFollowing(3) -> foo     <DEF>     2 <UNDEF>   0
00209 // foo     <DEF><UNDEF><UNDEF> 255 -> SetToFollowing(3) -> foo\001 <DEF><UNDEF><UNDEF>   0
00210 //
00211 //
00212 int WordKey::SetToFollowing(int position /* = WORD_FOLLOWING_MAX */)
00213 {
00214   if(position == WORD_FOLLOWING_MAX)
00215     position = NFields() - 1;
00216   
00217   if(position < 0 || position >= NFields()) {
00218     fprintf(stderr, "WordKey::SetToFollowing invalid position = %d\n", position);
00219     return NOTOK;
00220   }
00221 
00222   int i = position;
00223   while(i >= 0) {
00224     if(IsDefined(i)) {
00225       if(Overflow(i, 1))
00226         Set(i, 0);
00227       else
00228         break;
00229     }
00230     i--;
00231   }
00232 
00233   if(i < 0) {
00234     fprintf(stderr, "WordKey::SetToFollowing cannot increment\n");
00235     return NOTOK;
00236   }
00237   
00238   Get(i)++;
00239 
00240   for(i = position + 1; i < NFields(); i++)
00241     if(IsDefined(i)) Set(i,0);
00242 
00243   return OK;
00244 }
00245 
00246 int 
00247 WordKey::Prefix() const
00248 {
00249   const WordKeyInfo& info = context->GetKeyInfo();
00250   //
00251   // If all fields are set, it can be considered as a prefix although
00252   // it really is a fully qualified key.
00253   //
00254   if(Filled()) return OK;
00255   //
00256   // If the first field is not set this cannot be a prefix
00257   //
00258   if(!IsDefined(0)) return NOTOK;
00259   
00260   int found_unset = 0;
00261 
00262   for(int j = 0; j < info.nfields; j++) {
00263     //
00264     // Fields set, then fields unset then field set -> not a prefix
00265     //
00266     if(IsDefined(j)) {
00267       if(found_unset) return NOTOK;
00268     } else {
00269       //
00270       // Found unset fields and this is fine as long as we do
00271       // not find a field set later on.
00272       //
00273       found_unset++;
00274     }
00275   }
00276 
00277   return OK;
00278 }
00279 
00280 //
00281 // Unset all fields past the first unset field
00282 // Return the number of fields in the prefix or 0 if
00283 // first field is not set, ie no possible prefix.
00284 //
00285 int 
00286 WordKey::PrefixOnly()
00287 {
00288   const WordKeyInfo& info = context->GetKeyInfo();
00289   //
00290   // If all fields are set, the whole key is the prefix.
00291   //
00292   if(Filled()) return OK;
00293   //
00294   // If the first field is not set there is no possible prefix
00295   //
00296   if(!IsDefined(0))
00297       return NOTOK;
00298   
00299   int found_unset = 0;
00300   //
00301   // Walk the fields in sorting order. 
00302   //
00303 
00304   for(int j = 0; j < info.nfields; j++) {
00305     //
00306     // Unset all fields after the first unset field
00307     //
00308     if(IsDefined(j)) {
00309       if(found_unset) {
00310         Set(j,0);
00311         Undefined(j);
00312       }
00313     } else {
00314       found_unset = 1;
00315     }
00316   }
00317 
00318   return OK;
00319 }
00320 
00321 //
00322 // Unpack from data and fill fields of object
00323 // 
00324 int 
00325 WordKey::Unpack(const char* string, int length)
00326 {
00327   const WordKeyInfo& info = context->GetKeyInfo();
00328 
00329   const unsigned char* p = (const unsigned char*)string;
00330   int p_length = length;
00331   ber_t value;
00332 
00333   for(int j = 0; j < info.nfields; j++) {
00334     int bytes = ber_buf2value(p, p_length, value);
00335     if(bytes < 1) {
00336       fprintf(stderr, "WordKey::Unpack: ber_buf2value failed at %d\n", j);
00337       return NOTOK;
00338     }
00339     p_length -= bytes;
00340     if(p_length < 0) {
00341       fprintf(stderr, "WordKey::Unpack: ber_buf2value overflow at %d\n", j);
00342       return NOTOK;
00343     }
00344     p += bytes;
00345     Set(j, value);
00346   }
00347 
00348   return OK;
00349 }
00350 
00351 //
00352 // Pack object into the <packed> string
00353 //
00354 int 
00355 WordKey::Pack(String& packed) const
00356 {
00357   const WordKeyInfo& info = context->GetKeyInfo();
00358 
00359   unsigned char* string;
00360   // 
00361   // + 1 : storage for the string length 
00362   //
00363   int length = BER_MAX_BYTES * info.nfields;
00364 
00365   if((string = (unsigned char*)malloc(length)) == 0) {
00366     fprintf(stderr, "WordKey::Pack: malloc returned 0\n");
00367     return NOTOK;
00368   }
00369 
00370   unsigned char* p = string;
00371   int p_length = length;
00372 
00373   for(int i = 0; i < info.nfields; i++) {
00374     int bytes = ber_value2buf(p, p_length, Get(i));
00375     if(bytes < 1) {
00376       fprintf(stderr, "WordKey::Pack: ber_value2buf failed at %d\n", i);
00377       return NOTOK;
00378     }
00379     p_length -= bytes;
00380     if(p_length < 0) {
00381       fprintf(stderr, "WordKey::Pack: ber_value2buf overflow at %d\n", i);
00382       return NOTOK;
00383     }
00384     p += bytes;
00385   }
00386 
00387   packed.set((const char*)string, p - string);
00388 
00389   free(string);
00390 
00391   return OK;
00392 }
00393 
00394 //
00395 // Copy all fields set in <other> to object, only if 
00396 // the field is not already set in <other>
00397 //
00398 int WordKey::Merge(const WordKey& other)
00399 {
00400   const WordKeyInfo& info = context->GetKeyInfo();
00401   
00402   for(int j = 0; j < info.nfields; j++) {
00403     if(!IsDefined(j) && other.IsDefined(j)) {
00404       Set(j,other.Get(j)); 
00405     }
00406   }
00407 
00408   return OK;
00409 }
00410 
00411 //
00412 // Convert the whole structure to an ascii string description
00413 //
00414 int
00415 WordKey::Get(String& buffer) const
00416 {
00417   buffer.trunc();
00418   const WordKeyInfo& info = context->GetKeyInfo();
00419 
00420   //
00421   // Walk the fields in sorting order. As soon as one of them
00422   // does not compare equal, return.
00423   //
00424   for(int j = 0; j < info.nfields; j++) {
00425     if(!IsDefined(j)) {
00426       buffer << "<UNDEF>";
00427     } else {
00428       buffer << Get(j);
00429     }
00430     buffer << "\t";
00431   }
00432   return OK;
00433 }
00434 
00435 String
00436 WordKey::Get() const
00437 {
00438   String tmp;
00439   Get(tmp);
00440   return tmp;
00441 }
00442 
00443 //
00444 // Set a key from an ascii representation
00445 //
00446 int
00447 WordKey::Set(const String& buffer)
00448 {
00449   StringList fields(buffer, "\t ");
00450   return SetList(fields);
00451 }
00452 
00453 //
00454 // Set a key from list of fields
00455 //
00456 int
00457 WordKey::SetList(StringList& fields)
00458 {
00459   const WordKeyInfo& info = context->GetKeyInfo();
00460   int length = fields.Count();
00461 
00462   if(length < info.nfields) {
00463     fprintf(stderr, "WordKey::SetList: expected at least %d fields and found %d (ignored)\n", info.nfields, length);
00464     return NOTOK;
00465   }
00466   if(length < 1) {
00467     fprintf(stderr, "WordKey::SetList: expected at least one field in line\n");
00468     return NOTOK;
00469   }
00470 
00471   Clear();
00472 
00473   //
00474   // Handle numerical fields
00475   //
00476   int i;
00477   for(i = 0; i < info.nfields; i++) {
00478     String* field = (String*)fields.Get_First();
00479 
00480     if(field == 0) {
00481       fprintf(stderr, "WordKey::Set: failed to retrieve field %d\n", i);
00482       return NOTOK;
00483     }
00484     
00485     if(field->nocase_compare("<undef>") == 0) {
00486       Undefined(i);
00487     } else {
00488       WordKeyNum value = strtoul(field->get(), 0, 10);
00489       Set(i, value);
00490     }
00491     fields.Remove(0);
00492   }
00493 
00494   return OK;
00495 }
00496 
00497 int WordKey::Write(FILE* f) const
00498 {
00499   String tmp;
00500   Get(tmp);
00501   fprintf(f, "%s", (char*)tmp);
00502   return 0;
00503 }
00504 
00505 void WordKey::Print() const
00506 {
00507   Write(stderr);
00508 }
00509 

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5