WordKey.h

Go to the documentation of this file.
00001 // WordKey.h
00002 //
00003 // NAME
00004 // inverted index key.
00005 //
00006 // SYNOPSIS
00007 //
00008 // #include <WordKey.h>
00009 // 
00010 // #define WORD_KEY_DOCID    1
00011 // #define WORD_KEY_LOCATION 2
00012 //
00013 // WordList* words = ...;
00014 // WordKey key = words->Key("word 100 20");
00015 // WordKey searchKey;
00016 // words->Dict()->SerialExists("dog", searchKey.Get(WORD_KEY_WORD));
00017 // searchKey.Set(WORD_KEY_LOCATION, 5);
00018 // WordCursor* cursor = words->Key(searchKey);
00019 // 
00020 // DESCRIPTION
00021 //
00022 // Describes the key used to store an entry in the inverted index.
00023 // Each field in the key has a bit in the <b>setbits</b>
00024 // member that says if it is set or not. This bit makes it possible to
00025 // say that a particular field is <i>undefined</i> regardless of
00026 // the actual value stored. The methods
00027 // <b>IsDefined, SetDefined</b> and <b>Undefined</b> are used to manipulate
00028 // the <i>defined</i> status of a field. The <b>Pack</b> and <b>Unpack</b>
00029 // methods are used to convert to and from the disk storage representation
00030 // of the key. 
00031 // 
00032 // Although constructors may be used, the preferred way to create a 
00033 // WordKey object is by using the <b>WordContext::Key</b> method.
00034 //
00035 // The following constants are defined:
00036 // <dl>
00037 // <dt> WORD_KEY_WORD
00038 // <dd> the index of the word identifier within the key for Set and Get
00039 // methods.
00040 // <dt> WORD_KEY_VALUE_INVALID
00041 // <dd> a value that is invalid for any field of the key.
00042 // </dl>
00043 //
00044 // ASCII FORMAT
00045 //
00046 // The ASCII description is a string with fields separated by tabs or
00047 // white space.
00048 // <pre>
00049 // Example: 200 <UNDEF> 1 4 2
00050 // Field 1: The word identifier or <UNDEF> if not defined
00051 // Field 2 to the end: numerical value of the field or <UNDEF> if
00052 //                     not defined
00053 //
00054 // </pre>
00055 //
00056 // END
00057 //
00058 // Part of the ht://Dig package   <http://www.htdig.org/>
00059 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00060 // For copyright details, see the file COPYING in your distribution
00061 // or the GNU General Public License version 2 or later
00062 // <http://www.gnu.org/copyleft/gpl.html>
00063 //
00064 //
00065 
00066 #ifndef _WordKey_h_
00067 #define _WordKey_h_
00068 
00069 #ifndef SWIG
00070 #include "db.h"
00071 #include "htString.h"
00072 #include "StringList.h"
00073 #include "WordContext.h"
00074 #endif /* SWIG */
00075 
//
// Possible return values of Outbound/Overflow/Underflow methods
//
#define WORD_INBOUND    0
#define WORD_OVERFLOW   1
#define WORD_UNDERFLOW  2

//
// Possible return values of SetToFollowing
//
#define WORD_FOLLOWING_ATEND    0x0001
//
// Default value for position argument of SetToFollowing
// meaning NFields() - 1.
// The negative literal is parenthesized so that the macro always
// expands as a single operand, whatever the surrounding expression.
//
#define WORD_FOLLOWING_MAX      (-1)

//
// No value in a key may be 0
//
#define WORD_KEY_VALUE_INVALID 0

//
// Unknown field position.
// Parenthesized for the same reason as WORD_FOLLOWING_MAX.
//
#define WORD_KEY_UNKNOWN_POSITION       (-1)

//
// Index of the word identifier within the key
// 
#define WORD_KEY_WORD   0
00107 
00108 #ifndef SWIG
00109 //
00110 // C comparison function interface for Berkeley DB (bt_compare)
00111 //
00112 int word_db_cmp(const DBT *a, const DBT *b);
00113 #endif /* SWIG */
00114 
00115 #ifndef SWIG
00116 #include"WordKeyInfo.h"
00117 #endif /* SWIG */
00118 
00119 //
00120 // Describe a word occurrence
00121 //
00122 class WordKey
00123 {
00124  public:
00125   //
00126   // Constructors, destructors, copy and clear 
00127   //
00128   //-
00129   // Constructor. Build an empty key.
00130   // The <b>ncontext</b> argument must be a pointer to a valid
00131   // WordContext object.
00132   //
00133   WordKey(WordContext* ncontext) {
00134     context = ncontext;
00135     Clear();
00136   }
00137 #ifndef SWIG
00138   //-
00139   // Constructor. Initialize from an ASCII description of a key.
00140   // See <i>ASCII FORMAT</i> section.
00141   // The <b>ncontext</b> argument must be a pointer to a valid
00142   // WordContext object.
00143   //
00144   WordKey(WordContext* ncontext, const String& desc) {
00145     context = ncontext;
00146     Set(desc); 
00147   }
00148  public:
00149 #endif /* SWIG */
00150   //-
00151   // Reset to empty key. 
00152   //
00153   void  Clear() { 
00154     setbits = 0;
00155     for(int i = 0; i < NFields(); i++) {
00156       values[i] = 0;
00157     }
00158   }
00159 
00160   //-
00161   // Convenience functions to access the total number of fields
00162   // in a key (see <i>WordKeyInfo(3)</i>).
00163   //
00164   inline int               NFields() const { return context->GetKeyInfo().nfields; }
00165   //-
00166   // Convenience functions to access the 
00167   // maximum possible value for field at <b>position.</b>
00168   // in a key (see <i>WordKeyInfo(3)</i>).
00169   //
00170   inline WordKeyNum         MaxValue(int position) { return context->GetKeyInfo().MaxValue(position); }
00171 
00172   //
00173   // Accessors
00174   //
00175   //-
00176   // Return a pointer to the WordContext object used to create
00177   // this instance.
00178   //
00179   inline WordContext* GetContext() { return context; }
00180 #ifndef SWIG
00181   //-
00182   // Return a pointer to the WordContext object used to create
00183   // this instance as a const.
00184   //
00185   inline const WordContext* GetContext() const { return context; }
00186 #endif /* SWIG */
00187 
00188   //
00189   // Get/Set fields
00190   //
00191   //-
00192   // Return value of numerical field at <b>position</b> as const.
00193   //
00194   inline WordKeyNum Get(int position) const {
00195     return(values[position]);
00196   }
00197 #ifndef SWIG
00198   //-
00199   // Return value of numerical field at <b>position.</b>
00200   //
00201   inline WordKeyNum& Get(int position) {
00202     return(values[position]);
00203   }
00204   //-
00205   // Return value of numerical field at <b>position</b> as const.
00206   //
00207   inline const WordKeyNum &      operator[] (int position) const  { return(values[position]); }
00208   //-
00209   // Return value of numerical field at <b>position.</b>
00210   //
00211   inline       WordKeyNum &      operator[] (int position)        { return(values[position]); }
00212 #endif /* SWIG */
00213   //-
00214   // Set value of numerical field at <b>position</b> to <b>val.</b>
00215   //
00216   inline void Set(int position, WordKeyNum val) {
00217     SetDefined(position);
00218     values[position] = val;
00219   }
00220     
00221   //
00222   // Key field value existenz. Defined means the value of the field contains
00223   // a valid value. Undefined means the value of the field is not valid.
00224   //
00225   //-
00226   // Returns true if field at <b>position</b> is <i>defined</i>, false
00227   // otherwise.
00228   //
00229   int   IsDefined(int position) const { return setbits & (1 << position); }
00230   //-
00231   // Value in field <b>position</b> becomes <i>defined.</i> A bit
00232   // is set in the bit field describing the defined/undefined state
00233   // of the value and the actual value of the field is not modified.
00234   //
00235   void  SetDefined(int position)      { setbits |= (1 << position); }
00236   //-
00237   // Value in field <b>position</b> becomes <i>undefined.</i> A bit
00238   // is set in the bit field describing the defined/undefined state
00239   // of the value and the actual value of the field is not modified.
00240   //
00241   void  Undefined(int position)       { setbits &= ~(1 << position); }
00242 
00243 #ifndef SWIG
00244   //
00245   // Set and Get the whole structure from/to ASCII description
00246   //-
00247   // Set the whole structure from ASCII string in <b>bufferin.</b>
00248   // See <i>ASCII FORMAT</i> section.
00249   // Return OK if successfull, NOTOK otherwise.
00250   //
00251   int Set(const String& bufferin);
00252   int SetList(StringList& fields);
00253   //-
00254   // Convert the whole structure to an ASCII string description 
00255   // in <b>bufferout.</b>
00256   // See <i>ASCII FORMAT</i> section.
00257   // Return OK if successfull, NOTOK otherwise.
00258   //
00259   int Get(String& bufferout) const;
00260   //-
00261   // Convert the whole structure to an ASCII string description 
00262   // and return it.
00263   // See <i>ASCII FORMAT</i> section.
00264   // 
00265   String Get() const;
00266 #endif /* SWIG */
00267 
00268   //
00269   // Storage format conversion
00270   //
00271 #ifndef SWIG
00272   //-
00273   // Set structure from disk storage format as found in 
00274   // <b>string</b> buffer or length <b>length.</b>
00275   // Return OK if successfull, NOTOK otherwise.
00276   //
00277   int           Unpack(const char* string, int length);
00278   //
00279   //-
00280   // Set structure from disk storage format as found in 
00281   // <b>data</b> string.
00282   // Return OK if successfull, NOTOK otherwise.
00283   //
00284   inline int    Unpack(const String& data) { return(Unpack(data,data.length())); }
00285   //
00286   //-
00287   // Convert object into disk storage format as found in 
00288   // and place the result in <b>data</b> string.
00289   // Return OK if successfull, NOTOK otherwise.
00290   //
00291   int           Pack(String& data) const;
00292 #endif /* SWIG */
00293 
00294   //
00295   // Transformations
00296   //
00297   //-
00298   // Copy each <i>defined</i> field from other into the object, if 
00299   // the corresponding field of the object is not defined. 
00300   // Return OK if successfull, NOTOK otherwise.
00301   //
00302   int           Merge(const WordKey& other);
00303   //-
00304   // Undefine all fields found after the first undefined field. The
00305   // resulting key has a set of defined fields followed by undefined fields.
00306   // Returns NOTOK if the word is not defined because the resulting key would 
00307   // be empty and this is considered an error. Returns OK on success.
00308   //
00309   int           PrefixOnly();
00310 #ifndef SWIG
00311   //-
00312   // Implement ++ on a key.
00313   //
00314   // It behaves like arithmetic but follows these rules:
00315   // <pre>
00316   // . Increment starts at field <position>
00317   // . If a field value overflows, increment field <b>position</b> - 1
00318   // . Undefined fields are ignored and their value untouched
00319   // . When a field is incremented all fields to the left are set to 0
00320   // </pre>
00321   // If position is not specified it is equivalent to NFields() - 1.
00322   // It returns OK if successfull, NOTOK if <b>position</b> out of range or
00323   // WORD_FOLLOWING_ATEND if the maximum possible value was reached.
00324   //
00325   int           SetToFollowing(int position = WORD_FOLLOWING_MAX);
00326 #endif /* SWIG */
00327 
00328   //
00329   // Predicates
00330   //
00331   //-
00332   // Return true if all the fields are <i>defined</i>, false otherwise.
00333   //
00334   int           Filled() const { return setbits == (unsigned int) (((1 << NFields()) - 1)); }
00335   //-
00336   // Return true if no fields are <i>defined</i>, false otherwise.
00337   //
00338   int           Empty() const  { return setbits == 0; }
00339   //-
00340   // Return true if the object and <b>other</b> are equal. 
00341   // Only fields defined in both keys are compared.
00342   //
00343   int           Equal(const WordKey& other) const;
00344   //-
00345   // Return true if the object and <b>other</b> are equal. 
00346   // All fields are compared. If a field is defined in <b>object</b>
00347   // and not defined in the object, the key are not considered
00348   // equal.
00349   //
00350   int           ExactEqual(const WordKey& other) const { return(Equal(other) && other.setbits == setbits); }
00351   //-
00352   // Compare <b>object</b> and <b>other</b> as in strcmp. Undefined
00353   // fields are ignored. Returns a positive number if <b>object</b> is
00354   // greater than <b>other</b>, zero if they are equal, a negative
00355   // number if <b>object</b> is lower than <b>other.</b>
00356   //
00357   int           Cmp(const WordKey& other) const;
00358 #ifndef SWIG
00359   //-
00360   // Return true if the object and <b>other</b> are equal. 
00361   // The packed string are compared. An <i>undefined</i> numerical field 
00362   // will be 0 and therefore undistinguishable from a <i>defined</i> field
00363   // whose value is 0.
00364   //
00365   int           PackEqual(const WordKey& other) const;
00366   //-
00367   // Return true if adding <b>increment</b> in field at <b>position</b> makes
00368   // it overflow or underflow, false if it fits.
00369   //
00370   int           Outbound(int position, int increment) {
00371     if(increment < 0) return Underflow(position, increment);
00372     else if(increment > 0) return Overflow(position, increment);
00373     else return WORD_INBOUND;
00374   }
00375   //-
00376   // Return true if adding positive <b>increment</b> to field at 
00377   // <b>position</b> makes it overflow, false if it fits.
00378   //
00379   int           Overflow(int position, int increment) {
00380     return MaxValue(position) - Get(position) < (WordKeyNum)increment ? WORD_OVERFLOW : WORD_INBOUND;
00381   }
00382   //-
00383   // Return true if subtracting positive <b>increment</b> to field 
00384   // at <b>position</b> makes it underflow, false if it fits.
00385   //
00386   int           Underflow(int position, int increment) {
00387     return Get(position) < (WordKeyNum)(-increment) ? WORD_UNDERFLOW : WORD_INBOUND;
00388   }
00389 #endif /* SWIG */
00390   //-
00391   // Return OK if the key may be used as a prefix for search.
00392   // In other words return OK if the fields set in the key
00393   // are all contiguous, starting from the first field.
00394   // Otherwise returns NOTOK
00395   //
00396   int           Prefix() const;
00397 
00398 #ifndef SWIG
00399   //-
00400   // Compare <b>a</b> and <b>b</b> in the Berkeley DB fashion. 
00401   // <b>a</b> and <b>b</b> are packed keys. The semantics of the
00402   // returned int is as of strcmp and is driven by the key description
00403   // found in <i>WordKeyInfo.</i> Returns a positive number if <b>a</b> is
00404   // greater than <b>b</b>, zero if they are equal, a negative number 
00405   // if <b>a</b> is lower than <b>b.</b>
00406   //
00407   static int        Compare(WordContext* context, const String& a, const String& b);
00408   //-
00409   // Compare <b>a</b> and <b>b</b> in the Berkeley DB fashion. 
00410   // <b>a</b> and <b>b</b> are packed keys. The semantics of the
00411   // returned int is as of strcmp and is driven by the key description
00412   // found in <i>WordKeyInfo.</i> Returns a positive number if <b>a</b> is
00413   // greater than <b>b</b>, zero if they are equal, a negative number 
00414   // if <b>a</b> is lower than <b>b.</b>
00415   //
00416   static int        Compare(WordContext* context, const unsigned char *a, int a_length, const unsigned char *b, int b_length);
00417   //-
00418   // Compare object defined fields with <b>other</b> key defined fields only,
00419   // ignore fields that are not defined in object or <b>other.</b> 
00420   // Return 1 if different 0 if equal. 
00421   // If different, <b>position</b> is set to the field number that differ,
00422   // <b>lower</b> is set to 1 if Get(<b>position</b>) is lower than
00423   // other.Get(<b>position</b>) otherwise lower is set to 0.
00424   //
00425   int               Diff(const WordKey& other, int& position, int& lower);
00426 
00427   //-
00428   // Print object in ASCII form on <b>f</b> (uses <i>Get</i> method).
00429   // See <i>ASCII FORMAT</i> section.
00430   //
00431   int Write(FILE* f) const;
00432 #endif /* SWIG */
00433   //-
00434   // Print object in ASCII form on <b>stdout</b> (uses <i>Get</i> method).
00435   // See <i>ASCII FORMAT</i> section.
00436   //
00437   void Print() const;
00438 
00439   //
00440   // Direct access to values array. Only use if you know what you're
00441   // doing.
00442   //
00443   WordKeyNum* Values() { return values; }
00444   const WordKeyNum* Values() const { return values; }
00445 #ifndef SWIG
00446 
00447 private:
00448 
00449   //
00450   // Data members
00451   //
00452   //
00453   // Bit field for defined/undefined status of each key field
00454   //
00455   unsigned int setbits;
00456   //
00457   // Holds the numerical values of the key fields
00458   //
00459   WordKeyNum   values[WORD_KEY_MAX_NFIELDS];
00460 
00461   WordContext  *context;
00462 #endif /* SWIG */
00463 };
00464 
00465 #endif /* _WordKey_h_ */

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5