GNUmifluz: mifluz/WordBitCompress.h Source File

00001 //
00002 // WordBitCompress.h
00003 //
00004 // NAME
00005 //
00006 // Read and write objects in a bit stream, possibly compressing them.
00007 //
00008 // SYNOPSIS
00009 //
00010 // #include <WordBitStream.h>
00011 //
00012 // unsigned int value = 200;
00013 // WordBitCompress stream;
00014 // stream.PutUint(value, 8);
00015 // stream.Rewind();
00016 // stream.GetUint(value);
00017 //
00018 // DESCRIPTION
00019 //
00020 // A <b>WordBitCompress</b> object is a variable size string with methods
00021 // to write and read numerical objects using a controlled amount of bits.
00022 // Some methods implement compression such as custom Shannon-Fano or 
00023 // static compression similar to Golomb or Gamma. 
00024 //
00025 // It is not possible to read a stream while writing into it and vice 
00026 // versa. Another limitation is that the largest number is of type unsigned
00027 // int. 
00028 //
00029 // For examples of use, check the <b>WordDBCompress</b> implementation. 
00030 //
00031 // 
00032 // END
00033 //
00034 // Part of the ht://Dig package   <http://www.htdig.org/>
00035 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00036 // For copyright details, see the file COPYING in your distribution
00037 // or the GNU General Public License version 2 or later
00038 // <http://www.gnu.org/copyleft/gpl.html>
00039 //
00040 // $Id: WordBitCompress_8h-source.html,v 1.1 2008/06/08 10:12:58 sebdiaz Exp $
00041 //
00042 
00043 #ifndef _WordBitCompress_h
00044 #define _WordBitCompress_h
00045 
00046 #include <stdio.h>
00047 #include <stdlib.h>
00048 #include <string.h>
00049 
00050 class WordBitStream {
00051 public:
00052   inline WordBitStream(unsigned int nbuff_size) {
00053     Init();
00054     Allocate(nbuff_size);
00055   }
00056   inline WordBitStream() {
00057     Init();
00058   }
00059   inline ~WordBitStream() {
00060     free(buff);
00061   }
00062 
00063   //
00064   // Build and initialize stream
00065   //
00066   inline void Init() {
00067     buff_size = 1024;
00068     buff = (unsigned char*)malloc(buff_size);
00069     Clear();
00070   }
00071 
00072   //
00073   // Reset stream to empty state
00074   //
00075   inline void Clear() {
00076     bitpos = 0;
00077     buff_length = 1;
00078     buff_idx = 0;
00079     buff[buff_idx] = '\0';
00080     freezeon = 0;
00081     freeze_bitcount = 0;
00082   }
00083 
00084   inline void Allocate(unsigned int nbuff_size) {
00085     buff_size = nbuff_size;
00086     buff = (unsigned char*)realloc(buff, buff_size);
00087   }
00088 
00089   //
00090   // Make sure the buffer is large enough to safely access the
00091   // byte located at buff[index]
00092   //
00093   inline void Check(int index) {
00094     while(index >= buff_size)
00095       Allocate(buff_size * 2);
00096   }
00097 
00098   //
00099   // Advance bitpos by adding incr to it, take care of space allocation.
00100   //
00101   inline void BitposIncr(int incr) {
00102     bitpos += incr;
00103     if(!(bitpos & 0x07)) {
00104       Check(++buff_idx);
00105       buff[buff_idx] = 0;
00106       buff_length++;
00107     }
00108   }
00109 
00110   //
00111   // Append a bit : 0 if v null, 1 otherwise
00112   //
00113   inline void Put(unsigned int v) {
00114     if(freezeon) {
00115       freeze_bitcount++;
00116       return;
00117     }
00118 
00119     if(v) buff[buff_idx] |= 1 << (bitpos & 0x07);
00120     BitposIncr(1);
00121   }
00122   //
00123   // Return 0 if current bit in stream is not set, and not 0 if it is set. The
00124   // bit position is incremented.
00125   //
00126   inline unsigned char Get() {
00127     if(bitpos >= (buff_length << 3)) {
00128       fprintf(stderr, "BitStream::get reading past end of BitStream.\n");
00129     }
00130     
00131     unsigned char res = buff[bitpos >> 3] & (1 << (bitpos & 0x07));
00132     bitpos++;
00133     return res;
00134   }
00135 
00136   //
00137   // Put in stream the lowest <b>n</b> bits from the value <b>v</b>
00138   //
00139   void PutUint(unsigned int v, int n);
00140   //
00141   // Return <b>n</b> bits from the stream.
00142   //
00143   unsigned int GetUint(int n);
00144 
00145   //
00146   // Put in stream <b>n</b> bits from the array <b>vals</b>, each char
00147   // considered as a storage for 8 bits.
00148   //
00149   void PutZone(unsigned char *vals, int n);
00150   //
00151   // Get from stream <b>n</b> bits and move them to the array <b>vals</b>.
00152   // The size of <b>vals</b> must be at least <b>n</b>/8 long.
00153   //
00154   void GetZone(unsigned char *vals, int n);
00155 
00156   //
00157   // Return the length of the stream, in bits.
00158   //
00159   inline int Length() { return freezeon ? freeze_bitcount : bitpos; }
00160   //
00161   // Return the length of the stream, in bytes.
00162   //
00163   inline int BuffLength() { return buff_length; }
00164   //
00165   // Return a pointer to the beginning of the stream.
00166   //
00167   inline unsigned char* Buff() { return buff; }
00168   //
00169   // Return a pointer to a copy of the stream allocated with malloc.
00170   //
00171   inline unsigned char* BuffCopy() {
00172     unsigned char* copy = (unsigned char*)malloc(buff_length);
00173     memcpy(copy, buff, buff_length);
00174     return copy;
00175   }
00176   //
00177   // Copy <b>nbuff_length</b> bytes from <b>nbuff</b> into the stream
00178   //
00179   inline void BuffSet(const unsigned char* nbuff, int nbuff_length) {
00180     bitpos = 0;
00181     buff_length = nbuff_length;
00182     buff_idx = buff_length - 1;
00183     Check(buff_length);
00184     memcpy(buff, nbuff, buff_length);
00185   }
00186   //
00187   // Move the stream cursor to the beginning of the stream.
00188   //
00189   void Rewind() { bitpos = 0; }
00190 
00191   //
00192   // Stop insertion into the stream. Only the stream cursor is updated
00193   // to accurately reflect the size of the stream. All the Get* and Put* 
00194   // methods behaviour is modified.
00195   //
00196   void Freeze() {
00197     if(freezeon) {
00198       fprintf(stderr, "WordBitCompress::Freeze: recursive call not permitted\n");
00199     }
00200     freeze_bitcount = 0;
00201     freezeon = 1;
00202   }
00203   //
00204   // Enable insertion into the stream, this is the normal behaviour. See
00205   // the <b>Freeze</b> method.
00206   //
00207   void UnFreeze() {
00208     freezeon = 0;
00209   }
00210       
00211 private:
00212 
00213   // the buffer were the bitstream is stored
00214   unsigned char* buff;
00215   int buff_length;
00216   int buff_size;
00217   int buff_idx;
00218 
00219   // current bit position within the buffer
00220   int bitpos;
00221 
00222   // freezing the bitstream
00223   int freeze_bitcount;
00224   int freezeon;
00225 };
00226 
00227 // Constants used by WordBitCompress
00228 // number of bits to code the number of values in an array 
00229 #define WORD_CMPR_NBITS_NVALS 16
00230 
00231 //
00232 // Number of bits encoding the log2 of 32 bits value
00233 // 5 == log2(32) == log2(log2(0xffffffff))
00234 //
00235 #define WORD_CMPR_LOG32_BITS    5
00236 
00237 //
00238 // Number of bits encoding the log2 of 8 bits value
00239 // 3 == log2(8) == log2(log2(0xff))
00240 // (currently set to 4 for unknown reasons, never tried with
00241 //  3. The fact that the above LOG32_BITS = 5 apparenlty works
00242 //  is not a good hint to guess that LOG8_BITS = 3 would work
00243 //  since the coding functions refuse values with bit 32 set
00244 //  so the number of bits really encoded is really 31. To
00245 //  make sure this works properly unary tests should be 
00246 //  written).
00247 //
00248 #define WORD_CMPR_LOG8_BITS     4
00249 
00250 //
00251 // Number of bits encoding the compression model
00252 //
00253 #define WORD_CMPR_MODEL_BITS    2
00254 //
00255 // Compression model value for Decr
00256 //
00257 #define WORD_CMPR_MODEL_DECR    0
00258 //
00259 // Compression model value for Fixed
00260 //
00261 #define WORD_CMPR_MODEL_FIXED   1
00262 
00263 class WordBitCompress : public WordBitStream {
00264  public:
00265   //-
00266   // Constructor. Create a empty stream.
00267   //
00268   WordBitCompress() { }
00269   //-
00270   // Constructor. Create a empty stream and pre-allocate <b>size</b>
00271   // bytes.
00272   //
00273   WordBitCompress(int size) : WordBitStream(size) { }
00274 
00275   //-
00276   // Put in bitstream integer value <b>v</b> (coded on <b>n</b> bits).
00277   //
00278   // The encoding is : 
00279   // <ul>
00280   // <li> WordBitStream::PutUint(log2(v), log2(n))
00281   // <li> WordBitStream::PutUint(v, log2(v))
00282   // </ul>
00283   //
00284   void         PutUint(unsigned int v, int n);
00285   //-
00286   // Get integer value from bitstream and return it (coded on <b>n</b> bits).
00287   //
00288   // The decoding is : 
00289   // <ul>
00290   // <li> nbits = WordBitStream::GetUint(log2(n))
00291   // <li> return WordBitStream::GetUint(nbits))
00292   // </ul>
00293   //
00294   unsigned int GetUint(int n);
00295 
00296   //-
00297   // Put in bitstream the array of <b>vals</b> integer values 
00298   // of size <b>n</b> and return the number of bits used in the bitstream.
00299   //
00300   // The encoding is : 
00301   // <ul>
00302   // <li> PutUint(n, WORD_CMPR_NBITS_NVALS)
00303   // <ul> 
00304   // If Decr preforms better than Fixed
00305   // <ul>
00306   // <li> PutUint(WORD_CMPR_MODEL_DECR, WORD_COMPRESS_MODEL_BITS)
00307   // <li> PutUintsDecr(vals, n)
00308   // </ul>
00309   // Otherwise, if Fixed preforms better than Decr
00310   // <ul>
00311   // <li> PutUint(WORD_CMPR_MODEL_FIXED, WORD_COMPRESS_MODEL_BITS)
00312   // <li> PutUintsDecr(vals, n)
00313   // </ul>
00314   // 
00315   int PutUints(unsigned int *vals, int n);
00316   //-
00317   // Get list of integer values from bitstream and return them in
00318   // the <b>valsp</b> argument. The length of the <b>vals</b> array
00319   // is returned. If 0 is returned, the <b>valsp</b> is set to null.
00320   // The <b>valsp</b> argument is allocated with <i>new</i>, it is
00321   // the responsibility of the caller to free it.
00322   //
00323   // The decoding is : 
00324   // <ul>
00325   // <li> count = GetUint(WORD_CMPR_NBITS_NVALS)
00326   // <li> model = GetUint(WORD_COMPRESS_MODEL_BITS)
00327   // <li> GetUintsFixed(vals, count) (if model == WORD_COMPRESS_MODEL_FIXED)
00328   // <li> GetUintsDecr(vals, count) (if model == WORD_COMPRESS_MODEL_DECR)
00329   // <ul> 
00330   //
00331   int GetUints(unsigned int **valsp);
00332   //-
00333   // Alias for GetUints(unsigned int **valsp). The <b>valsp</b> argument
00334   // must point to an allocated array of <b>valsp_size</b> bytes.
00335   // 
00336   int GetUints(unsigned int **valsp, int *valsp_size);
00337 
00338   //-
00339   // Put in bitstream the array of <b>vals</b> unsigned char values 
00340   // of size <b>n</b> and return the number of bits used in the bitstream.
00341   //
00342   // The encoding is : 
00343   // <ul>
00344   // <li> PutUint(n, WORD_CMPR_NBITS_NVALS)
00345   // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG8_BITS)
00346   // <li> foreach val in vals : PutUint(val, log2(max of all vals))
00347   // <ul> 
00348   //
00349   int PutUchars(unsigned char *vals, int n);    
00350   //-
00351   // Get list of unsigned char values from bitstream and return them in
00352   // the <b>valsp</b> argument. The length of the <b>vals</b> array
00353   // is returned. If 0 is returned, the <b>valsp</b> is set to null.
00354   // The <b>valsp</b> argument is allocated with <i>new</i>, it is
00355   // the responsibility of the caller to free it.
00356   //
00357   // The decoding is : 
00358   // <ul>
00359   // <li> count = GetUint(WORD_CMPR_NBITS_NVALS)
00360   // <li> log2(max of all vals) = GetUint(WORD_CMPR_LOG8_BITS)
00361   // <li> count times : GetUint(log2(max of all vals))
00362   // </ul>
00363   //
00364   int GetUchars(unsigned char **valsp);
00365   //-
00366   // Alias for GetUchars(unsigned int **valsp). The <b>valsp</b> argument
00367   // must point to an allocated array of <b>valsp_size</b> bytes.
00368   // 
00369   int GetUchars(unsigned char **valsp, int *valsp_size);
00370 
00371   //-
00372   // Put in bitstream the array of <b>vals</b> integer values 
00373   // of size <b>n</b>.
00374   //
00375   // The encoding is : 
00376   // <ul>
00377   // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG32_BITS)
00378   // <li> foreach val in vals : PutUint(val, log2(max of all vals))
00379   // </ul>
00380   //
00381   void PutUintsFixed(unsigned int *vals, int n);
00382   //-
00383   // Get list of integer values from bitstream and return them in
00384   // the <b>vals</b> argument. The array pointed by the <b>vals</b>
00385   // argument must be large enough to hold <b>n</b> elements.
00386   //
00387   // The decoding is : 
00388   // <ul>
00389   // <li> bits = GetUint(WORD_CMPR_LOG32_BITS)
00390   // <li> count times : GetUint(bits)
00391   // </ul>
00392   //
00393   void GetUintsFixed(unsigned int *vals, int n);
00394 
00395   //-
00396   // Put in bitstream the array of <b>vals</b> integer values 
00397   // of size <b>n</b>.
00398   //
00399   // The encoding uses an algorithm inspired by Shannon-Fano. 
00400   // The <b>vals</b> array is divided in chunks of equal size
00401   // and each number in a chunck is coded by the chunk number 
00402   // and its offset within the chunk. See VLengthCoder implementation
00403   // in WordBitCompress.cc for more information.
00404   //
00405   void PutUintsDecr(unsigned int *vals, int n);
00406   //-
00407   // Get list of integer values from bitstream and return them in
00408   // the <b>vals</b> argument. The array pointed by the <b>vals</b>
00409   // argument must be large enough to hold <b>n</b> elements.
00410   //
00411   void GetUintsDecr(unsigned int *vals, int n);
00412 };
00413 
00414 #endif /* _WordBitCompress_h */
00415