00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #ifdef HAVE_CONFIG_H
00033 #include "config.h"
00034 #endif
00035
00036 #include <stdlib.h>
00037 #include <errno.h>
00038
00039 extern "C" {
00040 #include "db_int.h"
00041 #include "db_page.h"
00042 }
00043
00044 #include "lib.h"
00045 #include "WordDBCompress.h"
00046 #include "WordBitCompress.h"
00047 #include "WordKeyInfo.h"
00048 #include "WordKey.h"
00049 #include "WordRecord.h"
00050 #include "WordDB.h"
00051 #include "HtMaxMin.h"
00052
00053
00054
00055
00056
00057 extern "C"
00058 {
00059
00060 int WordDBCompress_compress_c(DB_ENV*, const u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp, void *user_data)
00061 {
00062 if(!user_data) {
00063 fprintf(stderr, "WordDBCompress_compress_c:: user_data is NULL");
00064 return NOTOK;
00065 }
00066 return ((WordDBCompress *)user_data)->Compress((unsigned char*)inbuff, inbuff_length, (unsigned char**)outbuffp, outbuff_lengthp);
00067 }
00068
00069 int WordDBCompress_uncompress_c(DB_ENV*, const u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length, void *user_data)
00070 {
00071 if(!user_data) {
00072 fprintf(stderr, "WordDBCompress_uncompress_c:: user_data is NULL");
00073 return NOTOK;
00074 }
00075 return ((WordDBCompress *)user_data)->Uncompress((unsigned char *)inbuff, inbuff_length, (unsigned char*)outbuff, outbuff_length);
00076 }
00077
00078 }
00079
00080
00081
00082
00083
00084
00085
00086
00087 #define WORD_CMPR_VAL_FLAGS 0
00088
00089
00090
00091 #define WORD_CMPR_VAL_FIELDS 1
00092
00093
00094
00095 #define WORD_CMPR_VAL_PGNO (WORD_CMPR_VAL_FIELDS + WORD_KEY_MAX_NFIELDS)
00096
00097
00098
00099 #define WORD_CMPR_VAL_RLENGTH WORD_CMPR_VAL_PGNO
00100
00101
00102
00103 #define WORD_CMPR_VAL_NRECS (WORD_CMPR_VAL_PGNO + 1)
00104
00105
00106
00107 #define WORD_CMPR_VAL_RVALUE WORD_CMPR_VAL_NRECS
00108
00109
00110
00111 #define WORD_CMPR_VAL_PREFIX (WORD_CMPR_VAL_PGNO + 2)
00112
00113
00114
00115
00116 #define WORD_CMPR_VAL_SUFFIX (WORD_CMPR_VAL_PGNO + 3)
00117
00118
00119
00120 #define WORD_CMPR_VAL_LAST WORD_CMPR_VAL_SUFFIX
00121
00122 #define WORD_CMPR_VAL_ARRAY_SIZE (WORD_CMPR_VAL_LAST + 1)
00123
00124
00125
00126 #define WORD_CMPR_VAL_ARRAY_SIZE_BITS 8
00127
00128
00129
00130
00131 #define WORD_CMPR_VAL_FLAGS_BITS (WORD_CMPR_VAL_LAST + 1)
00132
00133
00134
00135 #define WORD_CMPR_VAL_FLAGS_FIELD(n) (1 << (WORD_CMPR_VAL_FIELDS + (n)))
00136 #define WORD_CMPR_VAL_FLAGS_PGNO (1 << (WORD_CMPR_VAL_PGNO))
00137 #define WORD_CMPR_VAL_FLAGS_RLENGTH WORD_CMPR_VAL_FLAGS_PGNO
00138 #define WORD_CMPR_VAL_FLAGS_NRECS (1 << (WORD_CMPR_VAL_NRECS))
00139 #define WORD_CMPR_VAL_FLAGS_RVALUE WORD_CMPR_VAL_FLAGS_NRECS
00140 #define WORD_CMPR_VAL_FLAGS_PREFIX (1 << (WORD_CMPR_VAL_PREFIX))
00141 #define WORD_CMPR_VAL_FLAGS_SUFFIX (1 << (WORD_CMPR_VAL_SUFFIX))
00142 #define WORD_CMPR_VAL_FLAGS_STRING (1 << (WORD_CMPR_VAL_LAST + 1))
00143 #define WORD_CMPR_VAL_FLAGS_EMPTY (1 << (WORD_CMPR_VAL_LAST + 2))
00144 #define WORD_CMPR_VAL_FLAGS_RECORD_EQ (1 << (WORD_CMPR_VAL_LAST + 3))
00145 #define WORD_CMPR_VAL_FLAGS_RECORD_NO (1 << (WORD_CMPR_VAL_LAST + 4))
00146 #define WORD_CMPR_VAL_FLAGS_RECORD_STR (1 << (WORD_CMPR_VAL_LAST + 5))
00147
00148
00149
00150
00151 class WordDBEncoded {
00152 public:
00153 inline WordDBEncoded() {
00154 Init();
00155 }
00156 inline ~WordDBEncoded() {
00157 free(strings);
00158 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++)
00159 free(values[i]);
00160 }
00161
00162 inline void Init() {
00163 strings_size = 32;
00164 strings = (unsigned char*)malloc(strings_size);
00165
00166 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
00167 values_size[i] = 32;
00168 values[i] = (unsigned int*)malloc(values_size[i] * sizeof(unsigned int));
00169 }
00170
00171 Clear();
00172 }
00173
00174 inline void Clear() {
00175 strings_length = 0;
00176 strings_idx = 0;
00177
00178 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
00179 values_length[i] = 0;
00180 values_idx[i] = 0;
00181 }
00182 }
00183
00184 inline void AllocateStrings(unsigned int size) {
00185 strings_size = size;
00186 strings = (unsigned char*)realloc(strings, strings_size);
00187 }
00188
00189 inline void CheckStrings(int index) {
00190 while(index >= strings_size)
00191 AllocateStrings(strings_size * 2);
00192 }
00193
00194 inline void AllocateValues(int what, unsigned int size) {
00195 values_size[what] = size;
00196 values[what] = (unsigned int*)realloc(values[what], values_size[what] * sizeof(unsigned int));
00197 }
00198
00199 inline void CheckValues(int what, int index) {
00200 while(index >= values_size[what])
00201 AllocateValues(what, values_size[what] * 2);
00202 }
00203
00204 inline void PushValue(int what, unsigned int value) {
00205 CheckValues(what, values_length[what]);
00206 values[what][values_length[what]] = value;
00207 values_length[what]++;
00208 }
00209
00210 inline void PushString(const unsigned char* string, int length) {
00211 CheckStrings(strings_length + length);
00212 memcpy(strings + strings_length, string, length);
00213 strings_length += length;
00214 }
00215
00216 inline unsigned int ShiftValue(int what) {
00217 if(values_idx[what] >= values_length[what]) {
00218 fprintf(stderr, "WordDBEncoded::ShiftValue: what = %d, (idx = %d) >= (length = %d)\n", what, values_idx[what], values_length[what]);
00219 abort();
00220 }
00221 return values[what][values_idx[what]++];
00222 }
00223
00224 inline unsigned char* ShiftString(int length) {
00225 if(strings_idx + length > strings_length) {
00226 fprintf(stderr, "WordDBEncoded::ShiftString: (idx + length = %d) >= (length = %d)\n", strings_idx + length, strings_length);
00227 abort();
00228 }
00229 unsigned char* string = strings + strings_idx;
00230 strings_idx += length;
00231 return string;
00232 }
00233
00234 void Put(WordBitCompress& stream);
00235 void Get(WordBitCompress& stream);
00236
00237 unsigned int *values[WORD_CMPR_VAL_ARRAY_SIZE];
00238 int values_length[WORD_CMPR_VAL_ARRAY_SIZE];
00239 int values_idx[WORD_CMPR_VAL_ARRAY_SIZE];
00240 int values_size[WORD_CMPR_VAL_ARRAY_SIZE];
00241
00242 unsigned char *strings;
00243 int strings_length;
00244 int strings_idx;
00245 int strings_size;
00246 };
00247
00248 void WordDBEncoded::Put(WordBitCompress& stream)
00249 {
00250 unsigned int count = 0;
00251 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
00252 if(values_length[i] > 0) count++;
00253 }
00254 stream.WordBitStream::PutUint(count, WORD_CMPR_VAL_ARRAY_SIZE_BITS);
00255
00256 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
00257 if(values_length[i] > 0) {
00258 stream.WordBitStream::PutUint(i, WORD_CMPR_VAL_ARRAY_SIZE_BITS);
00259 stream.PutUints(values[i], values_length[i]);
00260 }
00261 }
00262
00263 stream.PutUchars(strings, strings_length);
00264 }
00265
00266 void WordDBEncoded::Get(WordBitCompress& stream)
00267 {
00268 Clear();
00269
00270 unsigned int count = 0;
00271 count = stream.WordBitStream::GetUint(WORD_CMPR_VAL_ARRAY_SIZE_BITS);
00272
00273 for(unsigned int i = 0; i < count; i++) {
00274 unsigned int index = stream.WordBitStream::GetUint(WORD_CMPR_VAL_ARRAY_SIZE_BITS);
00275 values_length[index] = stream.GetUints(&values[index], &values_size[index]);
00276 }
00277
00278 strings_length = stream.GetUchars(&strings, &strings_size);
00279 }
00280
00281
00282
00283
00284
00285 #define WORD_CMPR_OVERHEAD ((int)sizeof(unsigned char))
00286
00287 WordDBCompress::WordDBCompress(WordContext* ncontext)
00288 {
00289 cmprInfo = 0;
00290 context = ncontext;
00291 encoded = new WordDBEncoded();
00292
00293 const Configuration& config = context->GetConfiguration();
00294 debug = config.Boolean("wordlist_compress_debug", 0);
00295 verbose = config.Value("wordlist_compress_verbose", 0);
00296 }
00297
00298 WordDBCompress::~WordDBCompress()
00299 {
00300 delete encoded;
00301 }
00302
00303 int
00304 WordDBCompress::Compress(const unsigned char *inbuff, int inbuff_length, unsigned char **outbuffp, int *outbuff_lengthp)
00305 {
00306
00307
00308
00309 int outbuff_length = inbuff_length + WORD_CMPR_OVERHEAD;
00310 unsigned char* outbuff = (unsigned char*)malloc(sizeof(unsigned char) * outbuff_length);
00311
00312 *outbuffp = 0;
00313 *outbuff_lengthp = outbuff_length;
00314
00315 if(outbuff == 0)
00316 return ENOMEM;
00317
00318 int ret = 0;
00319 PAGE* pp = (PAGE*)inbuff;
00320
00321 *outbuff = TYPE_TAGS(pp);
00322
00323 switch(TYPE_TAGS(pp)) {
00324 case P_IBTREE|WORD_DB_INDEX:
00325 case P_LBTREE|WORD_DB_INDEX:
00326 ret = CompressBtree(inbuff, inbuff_length, outbuff, &outbuff_length);
00327 break;
00328 default:
00329 memcpy((char*)(outbuff + WORD_CMPR_OVERHEAD), (char*)inbuff, inbuff_length);
00330 break;
00331 }
00332
00333 if(ret == 0) {
00334 *outbuffp = outbuff;
00335 *outbuff_lengthp = outbuff_length;
00336 } else {
00337 free(outbuff);
00338 }
00339
00340 return ret;
00341 }
00342
00343 int
00344 WordDBCompress::Uncompress(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
00345 {
00346 int ret = 0;
00347
00348 switch(*inbuff) {
00349 case P_IBTREE|WORD_DB_INDEX:
00350 case P_LBTREE|WORD_DB_INDEX:
00351 ret = UncompressBtree(inbuff, inbuff_length, outbuff, outbuff_length);
00352 break;
00353 default:
00354 memcpy((char*)outbuff, (char*)(inbuff + WORD_CMPR_OVERHEAD), outbuff_length);
00355 break;
00356 }
00357
00358 return ret;
00359 }
00360
00361 int
00362 WordDBCompress::CompressBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int *outbuff_lengthp)
00363 {
00364 int ret = 0;
00365 PAGE *pp = (PAGE *)inbuff;
00366
00367 if(verbose) fprintf(stderr, "WordDBCompress::CompressBtree: page %d\n", PGNO(pp));
00368
00369 switch(TYPE(pp)) {
00370 case P_IBTREE:
00371 ret = CompressIBtree(inbuff, inbuff_length, outbuff, outbuff_lengthp);
00372 break;
00373 case P_LBTREE:
00374 ret = CompressLBtree(inbuff, inbuff_length, outbuff, outbuff_lengthp);
00375 break;
00376 }
00377
00378 return ret;
00379 }
00380
00381
00382
00383
00384
00385
00386
00387
00388 static inline unsigned int suffixlength(const String& a, const String& b)
00389 {
00390 const unsigned char* ap = (const unsigned char*)a.get();
00391 const unsigned char* bp = (const unsigned char*)b.get();
00392 int length = HtMIN(a.length(), b.length());
00393
00394 int i;
00395 for(i = 0; i < length && ap[i] == bp[i]; i++)
00396 ;
00397
00398 return b.length() - i;
00399 }
00400
00401 int
00402 WordDBCompress::CompressIBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int *outbuff_lengthp)
00403 {
00404 PAGE *pp = (PAGE *)inbuff;
00405
00406 if(verbose > 5) DumpPage(inbuff);
00407
00408 WordBitCompress stream(inbuff_length);
00409
00410 stream.PutUint(pp->lsn.file, sizeof(pp->lsn.file) * 8);
00411 stream.PutUint(pp->lsn.offset, sizeof(pp->lsn.file) * 8);
00412 stream.PutUint(PGNO(pp), sizeof(PGNO(pp)) * 8);
00413 stream.PutUint(NUM_ENT(pp), sizeof(NUM_ENT(pp)) * 8);
00414 stream.PutUint(LEVEL(pp), sizeof(LEVEL(pp)) * 8);
00415
00416 if(NUM_ENT(pp) > 0) {
00417 int i;
00418 WordKey key(context);
00419 WordKey previous_key(context);
00420 BINTERNAL* previous_e = 0;
00421
00422 encoded->Clear();
00423
00424 for (i = 0; i < NUM_ENT(pp); i++) {
00425 BINTERNAL* e = GET_BINTERNAL(pp, i);
00426 if(debug) {
00427 if(e->type != B_KEYDATA)
00428 fprintf(stderr, "WordDBCompress::EncodeIBtree: unexpected type 0x%02x\n", e->type);
00429 }
00430 unsigned int changes = 0;
00431
00432 if(e->len <= 0) {
00433 changes = WORD_CMPR_VAL_FLAGS_EMPTY;
00434
00435
00436
00437 encoded->PushValue(WORD_CMPR_VAL_PGNO, e->pgno);
00438 encoded->PushValue(WORD_CMPR_VAL_NRECS, e->nrecs);
00439 } else {
00440
00441 key.Unpack((const char*)e->data, (int)e->len);
00442
00443 if(previous_e != 0) {
00444 int is_first_change = 1;
00445 #if 0
00446
00447
00448
00449 const String& word = key.GetWord();
00450 const String& previous_word = previous_key.GetWord();
00451 if(word != previous_word) {
00452 unsigned int suffix_length = suffixlength(previous_word, word);
00453 unsigned int prefix_length = word.length() - suffix_length;
00454 encoded->PushValue(WORD_CMPR_VAL_PREFIX, prefix_length);
00455 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, suffix_length);
00456 const char* suffix = word.get() + prefix_length;
00457 encoded->PushString((const unsigned char*)suffix, suffix_length);
00458 changes |= WORD_CMPR_VAL_FLAGS_STRING;
00459 is_first_change = 0;
00460 }
00461 #endif
00462
00463
00464
00465 int nfields = key.NFields();
00466 for(int i = 0; i < nfields; i++) {
00467 unsigned int value = key.Get(i);
00468 unsigned int previous_value = previous_key.Get(i);
00469 if(value != previous_value) {
00470 if(is_first_change) {
00471 value -= previous_value;
00472 is_first_change = 0;
00473 }
00474 encoded->PushValue(i + WORD_CMPR_VAL_FIELDS, value);
00475 changes |= WORD_CMPR_VAL_FLAGS_FIELD(i);
00476 }
00477 }
00478
00479
00480
00481 if(e->pgno != previous_e->pgno) {
00482 encoded->PushValue(WORD_CMPR_VAL_PGNO, e->pgno);
00483 changes |= WORD_CMPR_VAL_FLAGS_PGNO;
00484 }
00485 if(e->nrecs != previous_e->nrecs) {
00486 encoded->PushValue(WORD_CMPR_VAL_NRECS, e->nrecs);
00487 changes |= WORD_CMPR_VAL_FLAGS_NRECS;
00488 }
00489 } else {
00490 #if 0
00491
00492
00493
00494 const String& word = key.GetWord();
00495 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, word.length());
00496 encoded->PushString((const unsigned char*)word.get(), word.length());
00497 #endif
00498
00499
00500
00501 int nfields = key.NFields();
00502 for(int i = 0; i < nfields; i++) {
00503 encoded->PushValue(i + WORD_CMPR_VAL_FIELDS, key.Get(i));
00504 }
00505
00506
00507
00508 encoded->PushValue(WORD_CMPR_VAL_PGNO, e->pgno);
00509 encoded->PushValue(WORD_CMPR_VAL_NRECS, e->nrecs);
00510 }
00511
00512 previous_e = e;
00513 previous_key = key;
00514 }
00515
00516 encoded->PushValue(WORD_CMPR_VAL_FLAGS, changes);
00517 }
00518
00519 encoded->Put(stream);
00520 }
00521
00522 int outbuff_length = *outbuff_lengthp;
00523 if(stream.BuffLength() > (outbuff_length - WORD_CMPR_OVERHEAD)) {
00524 fprintf(stderr, "WordDBCompress::CompressIBtree: compressed length = %d > available length = %d\n", stream.BuffLength(), outbuff_length - WORD_CMPR_OVERHEAD);
00525 abort();
00526 }
00527 memcpy(outbuff + WORD_CMPR_OVERHEAD, stream.Buff(), stream.BuffLength());
00528 outbuff_length = WORD_CMPR_OVERHEAD + stream.BuffLength();
00529
00530 if(debug) {
00531 unsigned char* tmp = (unsigned char*)malloc(inbuff_length);
00532 memset((char*)tmp, '\0', inbuff_length);
00533 UncompressIBtree(outbuff, outbuff_length, tmp, inbuff_length);
00534 if(DiffPage(inbuff, tmp)) {
00535 fprintf(stderr, "WordDBCompress::CompressIBtree: failed\n");
00536 DumpPage(inbuff);
00537 DumpPage(tmp);
00538 exit(1);
00539 }
00540 free(tmp);
00541 }
00542
00543 *outbuff_lengthp = outbuff_length;
00544
00545 return 0;
00546 }
00547
00548 int
00549 WordDBCompress::CompressLBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int *outbuff_lengthp)
00550 {
00551 PAGE *pp = (PAGE *)inbuff;
00552
00553 if(verbose > 5) DumpPage(inbuff);
00554
00555 WordBitCompress stream(inbuff_length);
00556
00557 stream.PutUint(pp->lsn.file, sizeof(pp->lsn.file) * 8);
00558 stream.PutUint(pp->lsn.offset, sizeof(pp->lsn.file) * 8);
00559 stream.PutUint(PGNO(pp), sizeof(PGNO(pp)) * 8);
00560 stream.PutUint(PREV_PGNO(pp), sizeof(PREV_PGNO(pp)) * 8);
00561 stream.PutUint(NEXT_PGNO(pp), sizeof(NEXT_PGNO(pp)) * 8);
00562 stream.PutUint(NUM_ENT(pp), sizeof(NUM_ENT(pp)) * 8);
00563 stream.PutUint(LEVEL(pp), sizeof(LEVEL(pp)) * 8);
00564
00565 if(NUM_ENT(pp) > 0) {
00566 int i;
00567 WordKey key(context);
00568 WordKey previous_key(context);
00569 WordRecord record(context);
00570 WordRecord previous_record(context);
00571 BKEYDATA* previous_key_e = 0;
00572 BKEYDATA* previous_record_e = 0;
00573
00574 encoded->Clear();
00575
00576 for (i = 0; i < NUM_ENT(pp); i += P_INDX) {
00577 unsigned int changes = 0;
00578
00579 {
00580 BKEYDATA* key_e = GET_BKEYDATA(pp, i);
00581 if(debug) {
00582 if(key_e->type != B_KEYDATA)
00583 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected type 0x%02x\n", key_e->type);
00584 if(key_e->len <= 0)
00585 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected length <= 0\n");
00586 }
00587
00588 key.Unpack((const char*)key_e->data, (int)key_e->len);
00589
00590 if(previous_key_e != 0) {
00591 int is_first_change = 1;
00592 #if 0
00593
00594
00595
00596 const String& word = key.GetWord();
00597 const String& previous_word = previous_key.GetWord();
00598 if(word != previous_word) {
00599 unsigned int suffix_length = suffixlength(previous_word, word);
00600 unsigned int prefix_length = word.length() - suffix_length;
00601 encoded->PushValue(WORD_CMPR_VAL_PREFIX, prefix_length);
00602 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, suffix_length);
00603 const char* suffix = word.get() + prefix_length;
00604 encoded->PushString((const unsigned char*)suffix, suffix_length);
00605 changes |= WORD_CMPR_VAL_FLAGS_STRING;
00606 is_first_change = 0;
00607 }
00608 #endif
00609
00610
00611
00612 int nfields = key.NFields();
00613 for(int i = 0; i < nfields; i++) {
00614 unsigned int value = key.Get(i);
00615 unsigned int previous_value = previous_key.Get(i);
00616 if(value != previous_value) {
00617 if(is_first_change) {
00618 value -= previous_value;
00619 is_first_change = 0;
00620 }
00621 encoded->PushValue(WORD_CMPR_VAL_FIELDS + i, value);
00622 changes |= WORD_CMPR_VAL_FLAGS_FIELD(i);
00623 }
00624 }
00625 } else {
00626 #if 0
00627
00628
00629
00630 const String& word = key.GetWord();
00631 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, word.length());
00632 encoded->PushString((const unsigned char*)word.get(), word.length());
00633 #endif
00634
00635
00636
00637 int nfields = key.NFields();
00638 for(int i = 0; i < nfields; i++) {
00639 encoded->PushValue(i + WORD_CMPR_VAL_FIELDS, key.Get(i));
00640 }
00641 }
00642
00643 previous_key_e = key_e;
00644 previous_key = key;
00645 }
00646 {
00647 BKEYDATA* record_e = GET_BKEYDATA(pp, i + 1);
00648 if(debug) {
00649 if(record_e->type != B_KEYDATA)
00650 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected type 0x%02x\n", record_e->type);
00651 }
00652
00653 if(record_e->len <= 0) {
00654 changes |= WORD_CMPR_VAL_FLAGS_RECORD_NO;
00655 } else {
00656 record.Unpack((const char*)record_e->data, (int)record_e->len);
00657
00658 switch(record.type) {
00659 case WORD_RECORD_DATA:
00660 if(previous_record_e &&
00661 record.info.data == previous_record.info.data) {
00662 changes |= WORD_CMPR_VAL_FLAGS_RECORD_EQ;
00663 } else {
00664 encoded->PushValue(WORD_CMPR_VAL_RVALUE, record.info.data);
00665 }
00666 break;
00667 case WORD_RECORD_STR:
00668 changes |= WORD_CMPR_VAL_FLAGS_RECORD_STR;
00669 if(previous_record_e &&
00670 record.info.str == previous_record.info.str) {
00671 changes |= WORD_CMPR_VAL_FLAGS_RECORD_EQ;
00672 } else {
00673 const String& string = record.info.str;
00674 encoded->PushValue(WORD_CMPR_VAL_RLENGTH, string.length());
00675 encoded->PushString((const unsigned char*)string.get(), string.length());
00676 }
00677 break;
00678 default:
00679 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected record.type = %d\n", record.type);
00680 break;
00681 }
00682
00683 previous_record = record;
00684 previous_record_e = record_e;
00685 }
00686
00687 }
00688 encoded->PushValue(WORD_CMPR_VAL_FLAGS, changes);
00689 }
00690
00691 encoded->Put(stream);
00692 }
00693
00694 int outbuff_length = *outbuff_lengthp;
00695 if(stream.BuffLength() > (outbuff_length - WORD_CMPR_OVERHEAD)) {
00696 fprintf(stderr, "WordDBCompress::CompressLBtree: compressed length = %d > available length = %d\n", stream.BuffLength(), outbuff_length - WORD_CMPR_OVERHEAD);
00697 abort();
00698 }
00699 memcpy(outbuff + WORD_CMPR_OVERHEAD, stream.Buff(), stream.BuffLength());
00700 outbuff_length = WORD_CMPR_OVERHEAD + stream.BuffLength();
00701
00702 if(debug) {
00703 unsigned char* tmp = (unsigned char*)malloc(inbuff_length);
00704 memset((char*)tmp, '\0', inbuff_length);
00705 UncompressLBtree(outbuff, outbuff_length, tmp, inbuff_length);
00706 if(DiffPage(inbuff, tmp)) {
00707 fprintf(stderr, "WordDBCompress::CompressLBtree: failed\n");
00708 DumpPage(inbuff);
00709 DumpPage(tmp);
00710 exit(1);
00711 }
00712 free(tmp);
00713 }
00714
00715 *outbuff_lengthp = outbuff_length;
00716
00717 return 0;
00718 }
00719
00720 int
00721 WordDBCompress::UncompressBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
00722 {
00723 int ret = 0;
00724 switch((*inbuff) & TYPE_MASK) {
00725 case P_IBTREE:
00726 ret = UncompressIBtree(inbuff, inbuff_length, outbuff, outbuff_length);
00727 break;
00728 case P_LBTREE:
00729 ret = UncompressLBtree(inbuff, inbuff_length, outbuff, outbuff_length);
00730 break;
00731 }
00732 if(verbose) fprintf(stderr, "WordDBCompress::UncompressBtree: page %d\n", PGNO((PAGE*)outbuff));
00733 return ret;
00734 }
00735
00736
00737
00738
00739
00740 static void
00741 cdb___db_pitem(PAGE *pagep, u_int32_t indx, u_int32_t nbytes, DBT *hdr, DBT *data)
00742 {
00743 u_int8_t *p;
00744
00745 HOFFSET(pagep) -= nbytes;
00746 pagep->inp[indx] = HOFFSET(pagep);
00747 ++NUM_ENT(pagep);
00748
00749 p = P_ENTRY(pagep, indx);
00750 memcpy(p, hdr->data, hdr->size);
00751 if (data != NULL)
00752 memcpy(p + hdr->size, data->data, data->size);
00753 }
00754
00755 int
00756 WordDBCompress::UncompressIBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
00757 {
00758 memset((char*)outbuff, '\0', outbuff_length);
00759
00760 WordBitCompress stream(inbuff_length);
00761 stream.BuffSet(inbuff + WORD_CMPR_OVERHEAD, inbuff_length - WORD_CMPR_OVERHEAD);
00762
00763 PAGE *pp = (PAGE *)outbuff;
00764
00765 TYPE_TAGS(pp) = *inbuff;
00766 HOFFSET(pp) = outbuff_length;
00767 short entry_count = 0;
00768 pp->lsn.file = stream.GetUint(sizeof(pp->lsn.file) * 8);
00769 pp->lsn.offset = stream.GetUint(sizeof(pp->lsn.offset) * 8);
00770 PGNO(pp) = stream.GetUint(sizeof(PGNO(pp)) * 8);
00771 entry_count = stream.GetUint(sizeof(NUM_ENT(pp)) * 8);
00772 LEVEL(pp) = stream.GetUint(sizeof(LEVEL(pp)) * 8);
00773 NEXT_PGNO(pp) = PREV_PGNO(pp) = 0;
00774
00775 if(entry_count > 0) {
00776 int i;
00777 String packed;
00778 BINTERNAL previous_bi;
00779 BINTERNAL bi;
00780 DBT hdr;
00781 hdr.data = &bi;
00782 hdr.size = SSZA(BINTERNAL, data);
00783 DBT data;
00784 WordKey key(context);
00785 WordKey previous_key(context);
00786 String word;
00787
00788 encoded->Get(stream);
00789
00790 for(i = 0; i < entry_count; i++) {
00791 unsigned int changes = encoded->ShiftValue(WORD_CMPR_VAL_FLAGS);
00792
00793 memset((char*)&bi, '\0', sizeof(BINTERNAL));
00794
00795 if(changes & WORD_CMPR_VAL_FLAGS_EMPTY) {
00796 packed.trunc();
00797
00798
00799
00800 bi.pgno = encoded->ShiftValue(WORD_CMPR_VAL_PGNO);
00801 bi.nrecs = encoded->ShiftValue(WORD_CMPR_VAL_NRECS);
00802 } else {
00803 key.Clear();
00804
00805 if(!previous_key.Empty()) {
00806 int is_first_change = 1;
00807 #if 0
00808
00809
00810
00811 if(changes & WORD_CMPR_VAL_FLAGS_STRING) {
00812 unsigned int suffix_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
00813 unsigned int prefix_length = encoded->ShiftValue(WORD_CMPR_VAL_PREFIX);
00814 unsigned char* suffix = encoded->ShiftString(suffix_length);
00815 word.set(previous_key.GetWord().get(), prefix_length);
00816 word.append((const char*)suffix, suffix_length);
00817 key.SetWord(word);
00818 is_first_change = 0;
00819 } else {
00820 key.SetWord(previous_key.GetWord());
00821 }
00822 #endif
00823
00824
00825
00826 int nfields = key.NFields();
00827 for(int j = 0; j < nfields; j++) {
00828 if(changes & WORD_CMPR_VAL_FLAGS_FIELD(j)) {
00829 unsigned int value = encoded->ShiftValue(j + WORD_CMPR_VAL_FIELDS);
00830 if(is_first_change) {
00831 value += previous_key.Get(j);
00832 is_first_change = 0;
00833 }
00834 key.Set(j, value);
00835 } else {
00836 key.Set(j, previous_key.Get(j));
00837 }
00838 }
00839
00840
00841
00842 if(changes & WORD_CMPR_VAL_FLAGS_PGNO) {
00843 bi.pgno = encoded->ShiftValue(WORD_CMPR_VAL_PGNO);
00844 } else {
00845 bi.pgno = previous_bi.pgno;
00846 }
00847 if(changes & WORD_CMPR_VAL_FLAGS_NRECS) {
00848 bi.nrecs = encoded->ShiftValue(WORD_CMPR_VAL_NRECS);
00849 } else {
00850 bi.nrecs = previous_bi.nrecs;
00851 }
00852 } else {
00853 #if 0
00854
00855
00856
00857 unsigned int string_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
00858 unsigned char* string = encoded->ShiftString(string_length);
00859 key.SetWord((const char*)string, string_length);
00860 #endif
00861
00862
00863
00864 int nfields = key.NFields();
00865 for(int i = 0; i < nfields; i++) {
00866 key.Set(i, encoded->ShiftValue(i + WORD_CMPR_VAL_FIELDS));
00867 }
00868
00869
00870
00871 bi.pgno = encoded->ShiftValue(WORD_CMPR_VAL_PGNO);
00872 bi.nrecs = encoded->ShiftValue(WORD_CMPR_VAL_NRECS);
00873 }
00874
00875 key.Pack(packed);
00876 }
00877 data.data = packed.get();
00878 data.size = packed.length();
00879 bi.len = packed.length();
00880 bi.type = B_KEYDATA;
00881
00882
00883
00884
00885 cdb___db_pitem(pp, i, BINTERNAL_SIZE(bi.len), &hdr, &data);
00886
00887
00888
00889
00890 previous_bi = bi;
00891 previous_key = key;
00892 }
00893 }
00894
00895 if(debug) {
00896 if(entry_count != NUM_ENT(pp))
00897 fprintf(stderr, "WordDBCompress::UncompressIBtree: pgno %d: NUM_ENT(pp) = %d is different than entry_count = %d\n", PGNO(pp), NUM_ENT(pp), entry_count);
00898 }
00899 return 0;
00900 }
00901
00902 int
00903 WordDBCompress::UncompressLBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
00904 {
00905 memset((char*)outbuff, '\0', outbuff_length);
00906
00907 WordBitCompress stream(inbuff_length);
00908 stream.BuffSet(inbuff + WORD_CMPR_OVERHEAD, inbuff_length - WORD_CMPR_OVERHEAD);
00909
00910 PAGE *pp = (PAGE *)outbuff;
00911
00912 TYPE_TAGS(pp) = *inbuff;
00913 HOFFSET(pp) = outbuff_length;
00914 short entry_count = 0;
00915 pp->lsn.file = stream.GetUint(sizeof(pp->lsn.file) * 8);
00916 pp->lsn.offset = stream.GetUint(sizeof(pp->lsn.offset) * 8);
00917 PGNO(pp) = stream.GetUint(sizeof(PGNO(pp)) * 8);
00918 PREV_PGNO(pp) = stream.GetUint(sizeof(PREV_PGNO(pp)) * 8);
00919 NEXT_PGNO(pp) = stream.GetUint(sizeof(NEXT_PGNO(pp)) * 8);
00920 entry_count = stream.GetUint(sizeof(NUM_ENT(pp)) * 8);
00921 LEVEL(pp) = stream.GetUint(sizeof(LEVEL(pp)) * 8);
00922
00923 if(entry_count > 0) {
00924 int i;
00925 String packed;
00926 BKEYDATA bk;
00927 DBT hdr;
00928 hdr.data = &bk;
00929 hdr.size = SSZA(BKEYDATA, data);
00930 DBT data;
00931 WordKey key(context);
00932 WordKey previous_key(context);
00933 WordRecord record(context);
00934 WordRecord previous_record(context);
00935 int previous_record_p = 0;
00936 String word;
00937
00938 encoded->Get(stream);
00939
00940 for(i = 0; i < entry_count; i += P_INDX) {
00941 unsigned int changes = encoded->ShiftValue(WORD_CMPR_VAL_FLAGS);
00942
00943
00944
00945
00946 {
00947 memset((char*)&bk, '\0', sizeof(BKEYDATA));
00948
00949 key.Clear();
00950
00951 if(!previous_key.Empty()) {
00952 int is_first_change = 1;
00953 #if 0
00954
00955
00956
00957 if(changes & WORD_CMPR_VAL_FLAGS_STRING) {
00958 unsigned int suffix_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
00959 unsigned int prefix_length = encoded->ShiftValue(WORD_CMPR_VAL_PREFIX);
00960 unsigned char* suffix = encoded->ShiftString(suffix_length);
00961 word.set(previous_key.GetWord().get(), prefix_length);
00962 word.append((const char*)suffix, suffix_length);
00963 key.SetWord(word);
00964 is_first_change = 0;
00965 } else {
00966 key.SetWord(previous_key.GetWord());
00967 }
00968 #endif
00969
00970
00971
00972 int nfields = key.NFields();
00973 for(int j = 0; j < nfields; j++) {
00974 if(changes & WORD_CMPR_VAL_FLAGS_FIELD(j)) {
00975 unsigned int value = encoded->ShiftValue(j + WORD_CMPR_VAL_FIELDS);
00976 if(is_first_change) {
00977 value += previous_key.Get(j);
00978 is_first_change = 0;
00979 }
00980 key.Set(j, value);
00981 } else {
00982 key.Set(j, previous_key.Get(j));
00983 }
00984 }
00985 } else {
00986 #if 0
00987
00988
00989
00990 unsigned int string_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
00991 unsigned char* string = encoded->ShiftString(string_length);
00992 key.SetWord((const char*)string, string_length);
00993 #endif
00994
00995
00996
00997 int nfields = key.NFields();
00998 for(int i = 0; i < nfields; i++) {
00999 key.Set(i, encoded->ShiftValue(i + WORD_CMPR_VAL_FIELDS));
01000 }
01001 }
01002
01003 key.Pack(packed);
01004
01005 data.data = packed.get();
01006 data.size = packed.length();
01007 bk.len = packed.length();
01008 bk.type = B_KEYDATA;
01009
01010
01011
01012
01013 cdb___db_pitem(pp, i, BKEYDATA_SIZE(bk.len), &hdr, &data);
01014
01015 previous_key = key;
01016 }
01017
01018
01019
01020 {
01021 memset((char*)&bk, '\0', sizeof(BKEYDATA));
01022
01023 if(changes & WORD_CMPR_VAL_FLAGS_RECORD_NO) {
01024 packed.trunc();
01025 } else {
01026 if(changes & WORD_CMPR_VAL_FLAGS_RECORD_STR) {
01027 record.type = WORD_RECORD_STR;
01028 if(previous_record_p &&
01029 (changes & WORD_CMPR_VAL_FLAGS_RECORD_EQ)) {
01030 record.info.str = previous_record.info.str;
01031 } else {
01032 unsigned int string_length = encoded->ShiftValue(WORD_CMPR_VAL_RLENGTH);
01033 unsigned char* string = encoded->ShiftString(string_length);
01034 record.info.str.set((const char*)string, string_length);
01035 }
01036 } else {
01037 record.type = WORD_RECORD_DATA;
01038
01039 if(previous_record_p &&
01040 (changes & WORD_CMPR_VAL_FLAGS_RECORD_EQ)) {
01041 record.info.data = previous_record.info.data;
01042 } else {
01043 record.info.data = encoded->ShiftValue(WORD_CMPR_VAL_RVALUE);
01044 }
01045 }
01046
01047 record.Pack(packed);
01048
01049 previous_record = record;
01050 previous_record_p = 1;
01051 }
01052 data.data = packed.get();
01053 data.size = packed.length();
01054 bk.len = packed.length();
01055 bk.type = B_KEYDATA;
01056
01057
01058
01059
01060 cdb___db_pitem(pp, i + 1, BKEYDATA_SIZE(bk.len), &hdr, &data);
01061 }
01062 }
01063 }
01064
01065 if(debug) {
01066 if(entry_count != NUM_ENT(pp))
01067 fprintf(stderr, "WordDBCompress::UncompressLBtree: pgno %d: NUM_ENT(pp) = %d is different than entry_count = %d\n", PGNO(pp), NUM_ENT(pp), entry_count);
01068 }
01069
01070 return 0;
01071 }
01072
01073 DB_CMPR_INFO* WordDBCompress::CmprInfo()
01074 {
01075 DB_CMPR_INFO *cmpr_info = new DB_CMPR_INFO;
01076
01077 cmpr_info->user_data = (void *)this;
01078 cmpr_info->compress = WordDBCompress_compress_c;
01079 cmpr_info->uncompress = WordDBCompress_uncompress_c;
01080 cmpr_info->coefficient = 3;
01081 cmpr_info->max_npages = 9;
01082 cmprInfo = cmpr_info;
01083
01084 return cmpr_info;
01085 }
01086
01087 void
01088 WordDBCompress::DumpPage(const unsigned char* page) const
01089 {
01090 PAGE* pp = (PAGE*)page;
01091
01092 fprintf(stderr, "--------------------------------------------------\n");
01093 fprintf(stderr, "pgno = %d ", PGNO(pp));
01094 fprintf(stderr, "lsn.file = %d ", pp->lsn.file);
01095 fprintf(stderr, "lsn.offset = %d ", pp->lsn.offset);
01096 fprintf(stderr, "prev_pgno = %d ", PREV_PGNO(pp));
01097 fprintf(stderr, "next_pgno = %d\n", NEXT_PGNO(pp));
01098 fprintf(stderr, "entries = %d ", NUM_ENT(pp));
01099 fprintf(stderr, "hf_offset = %d ", HOFFSET(pp));
01100 fprintf(stderr, "level = %d ", LEVEL(pp));
01101 fprintf(stderr, "type = %d\n", TYPE(pp));
01102 fprintf(stderr, "tags = 0x%02x\n", TAGS(pp));
01103 fprintf(stderr, "freespace = %d bytes from byte %d to byte %d\n", P_FREESPACE(pp), LOFFSET(pp), HOFFSET(pp));
01104
01105 int i;
01106 for(i = 0; i < NUM_ENT(pp); i++) {
01107 fprintf(stderr, "index = %d, ", pp->inp[i]);
01108 unsigned char* data = 0;
01109 int data_length = 0;
01110 switch(TYPE(pp)) {
01111 case P_IBTREE:
01112 {
01113 BINTERNAL* e = GET_BINTERNAL(pp, i);
01114 fprintf(stderr, "len = %d, type = %d, pgno = %d, nrecs = %d\n", e->len, e->type, e->pgno, e->nrecs);
01115 data = (unsigned char*)e->data;
01116 data_length = e->len;
01117 }
01118 break;
01119 case P_LBTREE:
01120 {
01121 BKEYDATA* e = GET_BKEYDATA(pp, i);
01122 fprintf(stderr, "len = %d, type = %d\n", e->len, e->type);
01123 data = (unsigned char*)e->data;
01124 data_length = e->len;
01125 }
01126 break;
01127 }
01128 if(data_length > 0) {
01129 int j;
01130 for(j = 0; j < data_length; j++) {
01131 fprintf(stderr, "(%d) ", data[j]);
01132 }
01133 fprintf(stderr, "\n");
01134 }
01135 }
01136 }
01137
01138 int
01139 WordDBCompress::DiffPage(const unsigned char* first, const unsigned char* second) const
01140 {
01141 PAGE* p1 = (PAGE*)first;
01142 PAGE* p2 = (PAGE*)second;
01143
01144 if(TAGS(p1) != TAGS(p2)) return 1;
01145 if(TYPE(p1) != TYPE(p2)) return 1;
01146 if(PGNO(p1) != PGNO(p2)) return 1;
01147 if(p1->lsn.file != p2->lsn.file) return 1;
01148 if(p1->lsn.offset != p2->lsn.offset) return 1;
01149 if(TYPE(p1) == P_LBTREE) {
01150 if(PREV_PGNO(p1) != PREV_PGNO(p2)) return 1;
01151 if(NEXT_PGNO(p1) != NEXT_PGNO(p2)) return 1;
01152 }
01153 if(NUM_ENT(p1) != NUM_ENT(p2)) return 1;
01154 if(HOFFSET(p1) != HOFFSET(p2)) return 1;
01155 if(LEVEL(p1) != LEVEL(p2)) return 1;
01156
01157 int i;
01158 for(i = 0; i < NUM_ENT(p1); i++) {
01159 unsigned char* data1 = 0;
01160 int data1_length = 0;
01161 unsigned char* data2 = 0;
01162 int data2_length = 0;
01163 switch(TYPE(p1)) {
01164 case P_IBTREE:
01165 {
01166 BINTERNAL* e1 = GET_BINTERNAL(p1, i);
01167 BINTERNAL* e2 = GET_BINTERNAL(p2, i);
01168 if(e1->len != e2->len) return 1;
01169 if(e1->type != e2->type) return 1;
01170 if(e1->pgno != e2->pgno) return 1;
01171 if(e1->nrecs != e2->nrecs) return 1;
01172 data1 = (unsigned char*)e1->data;
01173 data1_length = e1->len;
01174 data2 = (unsigned char*)e2->data;
01175 data2_length = e2->len;
01176 }
01177 break;
01178 case P_LBTREE:
01179 {
01180 BKEYDATA* e1 = GET_BKEYDATA(p1, i);
01181 BKEYDATA* e2 = GET_BKEYDATA(p2, i);
01182 if(e1->len != e2->len) return 1;
01183 if(e1->type != e2->type) return 1;
01184 data1 = (unsigned char*)e1->data;
01185 data1_length = e1->len;
01186 data2 = (unsigned char*)e2->data;
01187 data2_length = e2->len;
01188 }
01189 break;
01190 }
01191 if(data1_length > 0) {
01192 int j;
01193 for(j = 0; j < data1_length; j++) {
01194 if(data1[j] != data2[j]) return 1;
01195 }
01196 }
01197 }
01198 return 0;
01199 }