// WordKey.h
//
// NAME
// inverted index key.
//
// SYNOPSIS
//
// #include <WordKey.h>
//
// #define WORD_KEY_DOCID    1
// #define WORD_KEY_LOCATION 2
//
// WordList* words = ...;
// WordKey key = words->Key("word 100 20");
// WordKey searchKey;
// words->Dict()->SerialExists("dog", searchKey.Get(WORD_KEY_WORD));
// searchKey.Set(WORD_KEY_LOCATION, 5);
// WordCursor* cursor = words->Key(searchKey);
//
// DESCRIPTION
//
// Describes the key used to store an entry in the inverted index.
// Each field in the key has a bit in the <b>setbits</b>
// member that says if it is set or not. This bit makes it possible to
// say that a particular field is <i>undefined</i> regardless of
// the actual value stored. The methods
// <b>IsDefined, SetDefined</b> and <b>Undefined</b> are used to manipulate
// the <i>defined</i> status of a field. The <b>Pack</b> and <b>Unpack</b>
// methods are used to convert to and from the disk storage representation
// of the key.
//
// Although constructors may be used, the preferred way to create a
// WordKey object is by using the <b>WordContext::Key</b> method.
//
// The following constants are defined:
// <dl>
// <dt> WORD_KEY_WORD
// <dd> the index of the word identifier within the key for Set and Get
// methods.
// <dt> WORD_KEY_VALUE_INVALID
// <dd> a value that is invalid for any field of the key.
// </dl>
//
// ASCII FORMAT
//
// The ASCII description is a string with fields separated by tabs or
// white space.
00048 // <pre> 00049 // Example: 200 <UNDEF> 1 4 2 00050 // Field 1: The word identifier or <UNDEF> if not defined 00051 // Field 2 to the end: numerical value of the field or <UNDEF> if 00052 // not defined 00053 // 00054 // </pre> 00055 // 00056 // END 00057 // 00058 // Part of the ht://Dig package <http://www.htdig.org/> 00059 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group 00060 // For copyright details, see the file COPYING in your distribution 00061 // or the GNU General Public License version 2 or later 00062 // <http://www.gnu.org/copyleft/gpl.html> 00063 // 00064 // 00065 00066 #ifndef _WordKey_h_ 00067 #define _WordKey_h_ 00068 00069 #ifndef SWIG 00070 #include "db.h" 00071 #include "htString.h" 00072 #include "StringList.h" 00073 #include "WordContext.h" 00074 #endif /* SWIG */ 00075 00076 // 00077 // Possible return values of Outbound/Overflow/Underflow methods 00078 // 00079 #define WORD_INBOUND 0 00080 #define WORD_OVERFLOW 1 00081 #define WORD_UNDERFLOW 2 00082 00083 // 00084 // Possible return values of SetToFollowing 00085 // 00086 #define WORD_FOLLOWING_ATEND 0x0001 00087 // 00088 // Default value for position argument of SetToFollowing 00089 // meaning NFields() - 1 00090 // 00091 #define WORD_FOLLOWING_MAX -1 00092 00093 // 00094 // No value in a key may be 0 00095 // 00096 #define WORD_KEY_VALUE_INVALID 0 00097 00098 // 00099 // Unknown field position 00100 // 00101 #define WORD_KEY_UNKNOWN_POSITION -1 00102 00103 // 00104 // Index of the word identifier within the key 00105 // 00106 #define WORD_KEY_WORD 0 00107 00108 #ifndef SWIG 00109 // 00110 // C comparison function interface for Berkeley DB (bt_compare) 00111 // 00112 int word_db_cmp(const DBT *a, const DBT *b); 00113 #endif /* SWIG */ 00114 00115 #ifndef SWIG 00116 #include"WordKeyInfo.h" 00117 #endif /* SWIG */ 00118 00119 // 00120 // Describe a word occurrence 00121 // 00122 class WordKey 00123 { 00124 public: 00125 // 00126 // Constructors, destructors, copy and clear 00127 // 
00128 //- 00129 // Constructor. Build an empty key. 00130 // The <b>ncontext</b> argument must be a pointer to a valid 00131 // WordContext object. 00132 // 00133 WordKey(WordContext* ncontext) { 00134 context = ncontext; 00135 Clear(); 00136 } 00137 #ifndef SWIG 00138 //- 00139 // Constructor. Initialize from an ASCII description of a key. 00140 // See <i>ASCII FORMAT</i> section. 00141 // The <b>ncontext</b> argument must be a pointer to a valid 00142 // WordContext object. 00143 // 00144 WordKey(WordContext* ncontext, const String& desc) { 00145 context = ncontext; 00146 Set(desc); 00147 } 00148 public: 00149 #endif /* SWIG */ 00150 //- 00151 // Reset to empty key. 00152 // 00153 void Clear() { 00154 setbits = 0; 00155 for(int i = 0; i < NFields(); i++) { 00156 values[i] = 0; 00157 } 00158 } 00159 00160 //- 00161 // Convenience functions to access the total number of fields 00162 // in a key (see <i>WordKeyInfo(3)</i>). 00163 // 00164 inline int NFields() const { return context->GetKeyInfo().nfields; } 00165 //- 00166 // Convenience functions to access the 00167 // maximum possible value for field at <b>position.</b> 00168 // in a key (see <i>WordKeyInfo(3)</i>). 00169 // 00170 inline WordKeyNum MaxValue(int position) { return context->GetKeyInfo().MaxValue(position); } 00171 00172 // 00173 // Accessors 00174 // 00175 //- 00176 // Return a pointer to the WordContext object used to create 00177 // this instance. 00178 // 00179 inline WordContext* GetContext() { return context; } 00180 #ifndef SWIG 00181 //- 00182 // Return a pointer to the WordContext object used to create 00183 // this instance as a const. 00184 // 00185 inline const WordContext* GetContext() const { return context; } 00186 #endif /* SWIG */ 00187 00188 // 00189 // Get/Set fields 00190 // 00191 //- 00192 // Return value of numerical field at <b>position</b> as const. 
00193 // 00194 inline WordKeyNum Get(int position) const { 00195 return(values[position]); 00196 } 00197 #ifndef SWIG 00198 //- 00199 // Return value of numerical field at <b>position.</b> 00200 // 00201 inline WordKeyNum& Get(int position) { 00202 return(values[position]); 00203 } 00204 //- 00205 // Return value of numerical field at <b>position</b> as const. 00206 // 00207 inline const WordKeyNum & operator[] (int position) const { return(values[position]); } 00208 //- 00209 // Return value of numerical field at <b>position.</b> 00210 // 00211 inline WordKeyNum & operator[] (int position) { return(values[position]); } 00212 #endif /* SWIG */ 00213 //- 00214 // Set value of numerical field at <b>position</b> to <b>val.</b> 00215 // 00216 inline void Set(int position, WordKeyNum val) { 00217 SetDefined(position); 00218 values[position] = val; 00219 } 00220 00221 // 00222 // Key field value existenz. Defined means the value of the field contains 00223 // a valid value. Undefined means the value of the field is not valid. 00224 // 00225 //- 00226 // Returns true if field at <b>position</b> is <i>defined</i>, false 00227 // otherwise. 00228 // 00229 int IsDefined(int position) const { return setbits & (1 << position); } 00230 //- 00231 // Value in field <b>position</b> becomes <i>defined.</i> A bit 00232 // is set in the bit field describing the defined/undefined state 00233 // of the value and the actual value of the field is not modified. 00234 // 00235 void SetDefined(int position) { setbits |= (1 << position); } 00236 //- 00237 // Value in field <b>position</b> becomes <i>undefined.</i> A bit 00238 // is set in the bit field describing the defined/undefined state 00239 // of the value and the actual value of the field is not modified. 
00240 // 00241 void Undefined(int position) { setbits &= ~(1 << position); } 00242 00243 #ifndef SWIG 00244 // 00245 // Set and Get the whole structure from/to ASCII description 00246 //- 00247 // Set the whole structure from ASCII string in <b>bufferin.</b> 00248 // See <i>ASCII FORMAT</i> section. 00249 // Return OK if successfull, NOTOK otherwise. 00250 // 00251 int Set(const String& bufferin); 00252 int SetList(StringList& fields); 00253 //- 00254 // Convert the whole structure to an ASCII string description 00255 // in <b>bufferout.</b> 00256 // See <i>ASCII FORMAT</i> section. 00257 // Return OK if successfull, NOTOK otherwise. 00258 // 00259 int Get(String& bufferout) const; 00260 //- 00261 // Convert the whole structure to an ASCII string description 00262 // and return it. 00263 // See <i>ASCII FORMAT</i> section. 00264 // 00265 String Get() const; 00266 #endif /* SWIG */ 00267 00268 // 00269 // Storage format conversion 00270 // 00271 #ifndef SWIG 00272 //- 00273 // Set structure from disk storage format as found in 00274 // <b>string</b> buffer or length <b>length.</b> 00275 // Return OK if successfull, NOTOK otherwise. 00276 // 00277 int Unpack(const char* string, int length); 00278 // 00279 //- 00280 // Set structure from disk storage format as found in 00281 // <b>data</b> string. 00282 // Return OK if successfull, NOTOK otherwise. 00283 // 00284 inline int Unpack(const String& data) { return(Unpack(data,data.length())); } 00285 // 00286 //- 00287 // Convert object into disk storage format as found in 00288 // and place the result in <b>data</b> string. 00289 // Return OK if successfull, NOTOK otherwise. 00290 // 00291 int Pack(String& data) const; 00292 #endif /* SWIG */ 00293 00294 // 00295 // Transformations 00296 // 00297 //- 00298 // Copy each <i>defined</i> field from other into the object, if 00299 // the corresponding field of the object is not defined. 00300 // Return OK if successfull, NOTOK otherwise. 
00301 // 00302 int Merge(const WordKey& other); 00303 //- 00304 // Undefine all fields found after the first undefined field. The 00305 // resulting key has a set of defined fields followed by undefined fields. 00306 // Returns NOTOK if the word is not defined because the resulting key would 00307 // be empty and this is considered an error. Returns OK on success. 00308 // 00309 int PrefixOnly(); 00310 #ifndef SWIG 00311 //- 00312 // Implement ++ on a key. 00313 // 00314 // It behaves like arithmetic but follows these rules: 00315 // <pre> 00316 // . Increment starts at field <position> 00317 // . If a field value overflows, increment field <b>position</b> - 1 00318 // . Undefined fields are ignored and their value untouched 00319 // . When a field is incremented all fields to the left are set to 0 00320 // </pre> 00321 // If position is not specified it is equivalent to NFields() - 1. 00322 // It returns OK if successfull, NOTOK if <b>position</b> out of range or 00323 // WORD_FOLLOWING_ATEND if the maximum possible value was reached. 00324 // 00325 int SetToFollowing(int position = WORD_FOLLOWING_MAX); 00326 #endif /* SWIG */ 00327 00328 // 00329 // Predicates 00330 // 00331 //- 00332 // Return true if all the fields are <i>defined</i>, false otherwise. 00333 // 00334 int Filled() const { return setbits == (unsigned int) (((1 << NFields()) - 1)); } 00335 //- 00336 // Return true if no fields are <i>defined</i>, false otherwise. 00337 // 00338 int Empty() const { return setbits == 0; } 00339 //- 00340 // Return true if the object and <b>other</b> are equal. 00341 // Only fields defined in both keys are compared. 00342 // 00343 int Equal(const WordKey& other) const; 00344 //- 00345 // Return true if the object and <b>other</b> are equal. 00346 // All fields are compared. If a field is defined in <b>object</b> 00347 // and not defined in the object, the key are not considered 00348 // equal. 
00349 // 00350 int ExactEqual(const WordKey& other) const { return(Equal(other) && other.setbits == setbits); } 00351 //- 00352 // Compare <b>object</b> and <b>other</b> as in strcmp. Undefined 00353 // fields are ignored. Returns a positive number if <b>object</b> is 00354 // greater than <b>other</b>, zero if they are equal, a negative 00355 // number if <b>object</b> is lower than <b>other.</b> 00356 // 00357 int Cmp(const WordKey& other) const; 00358 #ifndef SWIG 00359 //- 00360 // Return true if the object and <b>other</b> are equal. 00361 // The packed string are compared. An <i>undefined</i> numerical field 00362 // will be 0 and therefore undistinguishable from a <i>defined</i> field 00363 // whose value is 0. 00364 // 00365 int PackEqual(const WordKey& other) const; 00366 //- 00367 // Return true if adding <b>increment</b> in field at <b>position</b> makes 00368 // it overflow or underflow, false if it fits. 00369 // 00370 int Outbound(int position, int increment) { 00371 if(increment < 0) return Underflow(position, increment); 00372 else if(increment > 0) return Overflow(position, increment); 00373 else return WORD_INBOUND; 00374 } 00375 //- 00376 // Return true if adding positive <b>increment</b> to field at 00377 // <b>position</b> makes it overflow, false if it fits. 00378 // 00379 int Overflow(int position, int increment) { 00380 return MaxValue(position) - Get(position) < (WordKeyNum)increment ? WORD_OVERFLOW : WORD_INBOUND; 00381 } 00382 //- 00383 // Return true if subtracting positive <b>increment</b> to field 00384 // at <b>position</b> makes it underflow, false if it fits. 00385 // 00386 int Underflow(int position, int increment) { 00387 return Get(position) < (WordKeyNum)(-increment) ? WORD_UNDERFLOW : WORD_INBOUND; 00388 } 00389 #endif /* SWIG */ 00390 //- 00391 // Return OK if the key may be used as a prefix for search. 00392 // In other words return OK if the fields set in the key 00393 // are all contiguous, starting from the first field. 
00394 // Otherwise returns NOTOK 00395 // 00396 int Prefix() const; 00397 00398 #ifndef SWIG 00399 //- 00400 // Compare <b>a</b> and <b>b</b> in the Berkeley DB fashion. 00401 // <b>a</b> and <b>b</b> are packed keys. The semantics of the 00402 // returned int is as of strcmp and is driven by the key description 00403 // found in <i>WordKeyInfo.</i> Returns a positive number if <b>a</b> is 00404 // greater than <b>b</b>, zero if they are equal, a negative number 00405 // if <b>a</b> is lower than <b>b.</b> 00406 // 00407 static int Compare(WordContext* context, const String& a, const String& b); 00408 //- 00409 // Compare <b>a</b> and <b>b</b> in the Berkeley DB fashion. 00410 // <b>a</b> and <b>b</b> are packed keys. The semantics of the 00411 // returned int is as of strcmp and is driven by the key description 00412 // found in <i>WordKeyInfo.</i> Returns a positive number if <b>a</b> is 00413 // greater than <b>b</b>, zero if they are equal, a negative number 00414 // if <b>a</b> is lower than <b>b.</b> 00415 // 00416 static int Compare(WordContext* context, const unsigned char *a, int a_length, const unsigned char *b, int b_length); 00417 //- 00418 // Compare object defined fields with <b>other</b> key defined fields only, 00419 // ignore fields that are not defined in object or <b>other.</b> 00420 // Return 1 if different 0 if equal. 00421 // If different, <b>position</b> is set to the field number that differ, 00422 // <b>lower</b> is set to 1 if Get(<b>position</b>) is lower than 00423 // other.Get(<b>position</b>) otherwise lower is set to 0. 00424 // 00425 int Diff(const WordKey& other, int& position, int& lower); 00426 00427 //- 00428 // Print object in ASCII form on <b>f</b> (uses <i>Get</i> method). 00429 // See <i>ASCII FORMAT</i> section. 00430 // 00431 int Write(FILE* f) const; 00432 #endif /* SWIG */ 00433 //- 00434 // Print object in ASCII form on <b>stdout</b> (uses <i>Get</i> method). 00435 // See <i>ASCII FORMAT</i> section. 
00436 // 00437 void Print() const; 00438 00439 // 00440 // Direct access to values array. Only use if you know what you're 00441 // doing. 00442 // 00443 WordKeyNum* Values() { return values; } 00444 const WordKeyNum* Values() const { return values; } 00445 #ifndef SWIG 00446 00447 private: 00448 00449 // 00450 // Data members 00451 // 00452 // 00453 // Bit field for defined/undefined status of each key field 00454 // 00455 unsigned int setbits; 00456 // 00457 // Holds the numerical values of the key fields 00458 // 00459 WordKeyNum values[WORD_KEY_MAX_NFIELDS]; 00460 00461 WordContext *context; 00462 #endif /* SWIG */ 00463 }; 00464 00465 #endif /* _WordKey_h */