00001 // 00002 // WordBitCompress.h 00003 // 00004 // NAME 00005 // 00006 // Read and write objects in a bit stream, possibly compressing them. 00007 // 00008 // SYNOPSIS 00009 // 00010 // #include <WordBitStream.h> 00011 // 00012 // unsigned int value = 200; 00013 // WordBitCompress stream; 00014 // stream.PutUint(value, 8); 00015 // stream.Rewind(); 00016 // stream.GetUint(value); 00017 // 00018 // DESCRIPTION 00019 // 00020 // A <b>WordBitCompress</b> object is a variable size string with methods 00021 // to write and read numerical objects using a controlled amount of bits. 00022 // Some methods implement compression such as custom Shannon-Fano or 00023 // static compression similar to Golomb or Gamma. 00024 // 00025 // It is not possible to read a stream while writing into it and vice 00026 // versa. Another limitation is that the largest number is of type unsigned 00027 // int. 00028 // 00029 // For examples of use, check the <b>WordDBCompress</b> implementation. 00030 // 00031 // 00032 // END 00033 // 00034 // Part of the ht://Dig package <http://www.htdig.org/> 00035 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group 00036 // For copyright details, see the file COPYING in your distribution 00037 // or the GNU General Public License version 2 or later 00038 // <http://www.gnu.org/copyleft/gpl.html> 00039 // 00040 // $Id: WordBitCompress_8h-source.html,v 1.1 2008/06/08 10:12:58 sebdiaz Exp $ 00041 // 00042 00043 #ifndef _WordBitCompress_h 00044 #define _WordBitCompress_h 00045 00046 #include <stdio.h> 00047 #include <stdlib.h> 00048 #include <string.h> 00049 00050 class WordBitStream { 00051 public: 00052 inline WordBitStream(unsigned int nbuff_size) { 00053 Init(); 00054 Allocate(nbuff_size); 00055 } 00056 inline WordBitStream() { 00057 Init(); 00058 } 00059 inline ~WordBitStream() { 00060 free(buff); 00061 } 00062 00063 // 00064 // Build and initialize stream 00065 // 00066 inline void Init() { 00067 buff_size = 1024; 00068 buff = (unsigned char*)malloc(buff_size); 00069 Clear(); 00070 } 00071 00072 // 00073 // Reset stream to empty state 00074 // 00075 inline void Clear() { 00076 bitpos = 0; 00077 buff_length = 1; 00078 buff_idx = 0; 00079 buff[buff_idx] = '\0'; 00080 freezeon = 0; 00081 freeze_bitcount = 0; 00082 } 00083 00084 inline void Allocate(unsigned int nbuff_size) { 00085 buff_size = nbuff_size; 00086 buff = (unsigned char*)realloc(buff, buff_size); 00087 } 00088 00089 // 00090 // Make sure the buffer is large enough to safely access the 00091 // byte located at buff[index] 00092 // 00093 inline void Check(int index) { 00094 while(index >= buff_size) 00095 Allocate(buff_size * 2); 00096 } 00097 00098 // 00099 // Advance bitpos by adding incr to it, take care of space allocation. 00100 // 00101 inline void BitposIncr(int incr) { 00102 bitpos += incr; 00103 if(!(bitpos & 0x07)) { 00104 Check(++buff_idx); 00105 buff[buff_idx] = 0; 00106 buff_length++; 00107 } 00108 } 00109 00110 // 00111 // Append a bit : 0 if v null, 1 otherwise 00112 // 00113 inline void Put(unsigned int v) { 00114 if(freezeon) { 00115 freeze_bitcount++; 00116 return; 00117 } 00118 00119 if(v) buff[buff_idx] |= 1 << (bitpos & 0x07); 00120 BitposIncr(1); 00121 } 00122 // 00123 // Return 0 if current bit in stream is not set, and not 0 if it is set. The 00124 // bit position is incremented. 00125 // 00126 inline unsigned char Get() { 00127 if(bitpos >= (buff_length << 3)) { 00128 fprintf(stderr, "BitStream::get reading past end of BitStream.\n"); 00129 } 00130 00131 unsigned char res = buff[bitpos >> 3] & (1 << (bitpos & 0x07)); 00132 bitpos++; 00133 return res; 00134 } 00135 00136 // 00137 // Put in stream the lowest <b>n</b> bits from the value <b>v</b> 00138 // 00139 void PutUint(unsigned int v, int n); 00140 // 00141 // Return <b>n</b> bits from the stream. 00142 // 00143 unsigned int GetUint(int n); 00144 00145 // 00146 // Put in stream <b>n</b> bits from the array <b>vals</b>, each char 00147 // considered as a storage for 8 bits. 00148 // 00149 void PutZone(unsigned char *vals, int n); 00150 // 00151 // Get from stream <b>n</b> bits and move them to the array <b>vals</b>. 00152 // The size of <b>vals</b> must be at least <b>n</b>/8 long. 00153 // 00154 void GetZone(unsigned char *vals, int n); 00155 00156 // 00157 // Return the length of the stream, in bits. 00158 // 00159 inline int Length() { return freezeon ? freeze_bitcount : bitpos; } 00160 // 00161 // Return the length of the stream, in bytes. 00162 // 00163 inline int BuffLength() { return buff_length; } 00164 // 00165 // Return a pointer to the beginning of the stream. 00166 // 00167 inline unsigned char* Buff() { return buff; } 00168 // 00169 // Return a pointer to a copy of the stream allocated with malloc. 00170 // 00171 inline unsigned char* BuffCopy() { 00172 unsigned char* copy = (unsigned char*)malloc(buff_length); 00173 memcpy(copy, buff, buff_length); 00174 return copy; 00175 } 00176 // 00177 // Copy <b>nbuff_length</b> bytes from <b>nbuff</b> into the stream 00178 // 00179 inline void BuffSet(const unsigned char* nbuff, int nbuff_length) { 00180 bitpos = 0; 00181 buff_length = nbuff_length; 00182 buff_idx = buff_length - 1; 00183 Check(buff_length); 00184 memcpy(buff, nbuff, buff_length); 00185 } 00186 // 00187 // Move the stream cursor to the beginning of the stream. 00188 // 00189 void Rewind() { bitpos = 0; } 00190 00191 // 00192 // Stop insertion into the stream. Only the stream cursor is updated 00193 // to accurately reflect the size of the stream. All the Get* and Put* 00194 // methods behaviour is modified. 00195 // 00196 void Freeze() { 00197 if(freezeon) { 00198 fprintf(stderr, "WordBitCompress::Freeze: recursive call not permitted\n"); 00199 } 00200 freeze_bitcount = 0; 00201 freezeon = 1; 00202 } 00203 // 00204 // Enable insertion into the stream, this is the normal behaviour. See 00205 // the <b>Freeze</b> method. 00206 // 00207 void UnFreeze() { 00208 freezeon = 0; 00209 } 00210 00211 private: 00212 00213 // the buffer were the bitstream is stored 00214 unsigned char* buff; 00215 int buff_length; 00216 int buff_size; 00217 int buff_idx; 00218 00219 // current bit position within the buffer 00220 int bitpos; 00221 00222 // freezing the bitstream 00223 int freeze_bitcount; 00224 int freezeon; 00225 }; 00226 00227 // Constants used by WordBitCompress 00228 // number of bits to code the number of values in an array 00229 #define WORD_CMPR_NBITS_NVALS 16 00230 00231 // 00232 // Number of bits encoding the log2 of 32 bits value 00233 // 5 == log2(32) == log2(log2(0xffffffff)) 00234 // 00235 #define WORD_CMPR_LOG32_BITS 5 00236 00237 // 00238 // Number of bits encoding the log2 of 8 bits value 00239 // 3 == log2(8) == log2(log2(0xff)) 00240 // (currently set to 4 for unknown reasons, never tried with 00241 // 3. The fact that the above LOG32_BITS = 5 apparenlty works 00242 // is not a good hint to guess that LOG8_BITS = 3 would work 00243 // since the coding functions refuse values with bit 32 set 00244 // so the number of bits really encoded is really 31. To 00245 // make sure this works properly unary tests should be 00246 // written). 00247 // 00248 #define WORD_CMPR_LOG8_BITS 4 00249 00250 // 00251 // Number of bits encoding the compression model 00252 // 00253 #define WORD_CMPR_MODEL_BITS 2 00254 // 00255 // Compression model value for Decr 00256 // 00257 #define WORD_CMPR_MODEL_DECR 0 00258 // 00259 // Compression model value for Fixed 00260 // 00261 #define WORD_CMPR_MODEL_FIXED 1 00262 00263 class WordBitCompress : public WordBitStream { 00264 public: 00265 //- 00266 // Constructor. Create a empty stream. 00267 // 00268 WordBitCompress() { } 00269 //- 00270 // Constructor. Create a empty stream and pre-allocate <b>size</b> 00271 // bytes. 00272 // 00273 WordBitCompress(int size) : WordBitStream(size) { } 00274 00275 //- 00276 // Put in bitstream integer value <b>v</b> (coded on <b>n</b> bits). 00277 // 00278 // The encoding is : 00279 // <ul> 00280 // <li> WordBitStream::PutUint(log2(v), log2(n)) 00281 // <li> WordBitStream::PutUint(v, log2(v)) 00282 // </ul> 00283 // 00284 void PutUint(unsigned int v, int n); 00285 //- 00286 // Get integer value from bitstream and return it (coded on <b>n</b> bits). 00287 // 00288 // The decoding is : 00289 // <ul> 00290 // <li> nbits = WordBitStream::GetUint(log2(n)) 00291 // <li> return WordBitStream::GetUint(nbits)) 00292 // </ul> 00293 // 00294 unsigned int GetUint(int n); 00295 00296 //- 00297 // Put in bitstream the array of <b>vals</b> integer values 00298 // of size <b>n</b> and return the number of bits used in the bitstream. 00299 // 00300 // The encoding is : 00301 // <ul> 00302 // <li> PutUint(n, WORD_CMPR_NBITS_NVALS) 00303 // <ul> 00304 // If Decr preforms better than Fixed 00305 // <ul> 00306 // <li> PutUint(WORD_CMPR_MODEL_DECR, WORD_COMPRESS_MODEL_BITS) 00307 // <li> PutUintsDecr(vals, n) 00308 // </ul> 00309 // Otherwise, if Fixed preforms better than Decr 00310 // <ul> 00311 // <li> PutUint(WORD_CMPR_MODEL_FIXED, WORD_COMPRESS_MODEL_BITS) 00312 // <li> PutUintsDecr(vals, n) 00313 // </ul> 00314 // 00315 int PutUints(unsigned int *vals, int n); 00316 //- 00317 // Get list of integer values from bitstream and return them in 00318 // the <b>valsp</b> argument. The length of the <b>vals</b> array 00319 // is returned. If 0 is returned, the <b>valsp</b> is set to null. 00320 // The <b>valsp</b> argument is allocated with <i>new</i>, it is 00321 // the responsibility of the caller to free it. 00322 // 00323 // The decoding is : 00324 // <ul> 00325 // <li> count = GetUint(WORD_CMPR_NBITS_NVALS) 00326 // <li> model = GetUint(WORD_COMPRESS_MODEL_BITS) 00327 // <li> GetUintsFixed(vals, count) (if model == WORD_COMPRESS_MODEL_FIXED) 00328 // <li> GetUintsDecr(vals, count) (if model == WORD_COMPRESS_MODEL_DECR) 00329 // <ul> 00330 // 00331 int GetUints(unsigned int **valsp); 00332 //- 00333 // Alias for GetUints(unsigned int **valsp). The <b>valsp</b> argument 00334 // must point to an allocated array of <b>valsp_size</b> bytes. 00335 // 00336 int GetUints(unsigned int **valsp, int *valsp_size); 00337 00338 //- 00339 // Put in bitstream the array of <b>vals</b> unsigned char values 00340 // of size <b>n</b> and return the number of bits used in the bitstream. 00341 // 00342 // The encoding is : 00343 // <ul> 00344 // <li> PutUint(n, WORD_CMPR_NBITS_NVALS) 00345 // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG8_BITS) 00346 // <li> foreach val in vals : PutUint(val, log2(max of all vals)) 00347 // <ul> 00348 // 00349 int PutUchars(unsigned char *vals, int n); 00350 //- 00351 // Get list of unsigned char values from bitstream and return them in 00352 // the <b>valsp</b> argument. The length of the <b>vals</b> array 00353 // is returned. If 0 is returned, the <b>valsp</b> is set to null. 00354 // The <b>valsp</b> argument is allocated with <i>new</i>, it is 00355 // the responsibility of the caller to free it. 00356 // 00357 // The decoding is : 00358 // <ul> 00359 // <li> count = GetUint(WORD_CMPR_NBITS_NVALS) 00360 // <li> log2(max of all vals) = GetUint(WORD_CMPR_LOG8_BITS) 00361 // <li> count times : GetUint(log2(max of all vals)) 00362 // </ul> 00363 // 00364 int GetUchars(unsigned char **valsp); 00365 //- 00366 // Alias for GetUchars(unsigned int **valsp). The <b>valsp</b> argument 00367 // must point to an allocated array of <b>valsp_size</b> bytes. 00368 // 00369 int GetUchars(unsigned char **valsp, int *valsp_size); 00370 00371 //- 00372 // Put in bitstream the array of <b>vals</b> integer values 00373 // of size <b>n</b>. 00374 // 00375 // The encoding is : 00376 // <ul> 00377 // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG32_BITS) 00378 // <li> foreach val in vals : PutUint(val, log2(max of all vals)) 00379 // </ul> 00380 // 00381 void PutUintsFixed(unsigned int *vals, int n); 00382 //- 00383 // Get list of integer values from bitstream and return them in 00384 // the <b>vals</b> argument. The array pointed by the <b>vals</b> 00385 // argument must be large enough to hold <b>n</b> elements. 00386 // 00387 // The decoding is : 00388 // <ul> 00389 // <li> bits = GetUint(WORD_CMPR_LOG32_BITS) 00390 // <li> count times : GetUint(bits) 00391 // </ul> 00392 // 00393 void GetUintsFixed(unsigned int *vals, int n); 00394 00395 //- 00396 // Put in bitstream the array of <b>vals</b> integer values 00397 // of size <b>n</b>. 00398 // 00399 // The encoding uses an algorithm inspired by Shannon-Fano. 00400 // The <b>vals</b> array is divided in chunks of equal size 00401 // and each number in a chunck is coded by the chunk number 00402 // and its offset within the chunk. See VLengthCoder implementation 00403 // in WordBitCompress.cc for more information. 00404 // 00405 void PutUintsDecr(unsigned int *vals, int n); 00406 //- 00407 // Get list of integer values from bitstream and return them in 00408 // the <b>vals</b> argument. The array pointed by the <b>vals</b> 00409 // argument must be large enough to hold <b>n</b> elements. 00410 // 00411 void GetUintsDecr(unsigned int *vals, int n); 00412 }; 00413 00414 #endif /* _WordBitCompress_h */ 00415