00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080 #ifdef HAVE_CONFIG_H
00081 #include <config.h>
00082 #endif
00083
00084 #ifdef HAVE_UNISTD_H
00085 #include <unistd.h>
00086 #endif
00087
00088 #if TIME_WITH_SYS_TIME
00089 # include <sys/time.h>
00090 # include <time.h>
00091 #else
00092 # if HAVE_SYS_TIME_H
00093 # include <sys/time.h>
00094 # else
00095 # include <time.h>
00096 # endif
00097 #endif
00098
00099
00100 #ifdef HAVE_GETOPT_H
00101 #include <getopt.h>
00102 #endif
00103 #ifdef HAVE_MALLOC_H
00104 #include <malloc.h>
00105 #endif
00106 #include <stdlib.h>
00107 #include <ctype.h>
00108
00109 #include <htString.h>
00110 #include <WordList.h>
00111 #include <WordContext.h>
00112 #include <WordCursorOne.h>
00113 #include <HtMaxMin.h>
00114 #include <WordListOne.h>
00115 #include <WordDict.h>
00116
00117 #include <mifluzsearch.h>
00118 #include <WordTree.h>
00119 #include <WordSearch.h>
00120
00121
00122
00123
// Verbosity level of the program. handle_param() increments it once for
// every 'v' option it receives and dosearch() forwards it to the search
// objects it creates.
static int verbose = 0;
00125
00126
00127
00128
00129
00130
//
// Options of one mifluzsearch run, filled by handle_param() either from
// the command line or from the QUERY_STRING environment variable.
// The char* members are heap allocated strings (strdup/malloc) that
// main() releases with free().
//
struct params_t
{
  char*        dbfile;        // -B : name of the db file to open
  char*        find;          // -f : search expression, 0 when not given
  unsigned int count;         // -c : maximum number of documents returned
  int          uniq_server;   // -S : at most one match per server
  int          proximity;     // -P : proximity tolerance
  unsigned int base;          // -d : index of the first document
  int          nop;           // -n : only parse and print the expression
  int          exclude;       // not referenced by the code in this file
  char*        low;           // -l : low bound key, "" when unset
  char*        high;          // -h : high bound key, "" when unset
  char*        occurrences;   // -o : word whose occurrences are counted
  int          bounded;       // only reset to 0 by main() in this file
  int          xml;           // -x : XML output
  int          http;          // -H : print the HTTP Content-Type header
  int          or_method;     // -O : WORD_SEARCH_OR or WORD_SEARCH_OPTIONAL
  char*        mifluz_config; // -M : "MIFLUZ_CONFIG=..." string given to putenv
  unsigned int restrict;      // -R : value from which -l and -h are computed
};
00152
00153 static int dosearch(params_t* params);
00154
00155
00156
00157
00158 static void usage();
00159
//
// Numeric value (0-15) of the hexadecimal digit c. The caller must have
// validated c with isxdigit() beforehand.
//
static int hex_digit_value(unsigned char c)
{
  if (isdigit(c))
    return c - '0';
  return tolower(c) - 'a' + 10;
}

//
// Decode the URL encoded string s in place: every %XX escape is replaced
// by the byte whose hexadecimal value is XX and every '+' by a space.
// The decoded string is never longer than the encoded one.
//
// Returns 1 on success. Returns 0 as soon as a '%' is not followed by
// two hexadecimal digits; the malformed escape itself is left untouched
// and the string is not re-terminated, so the content of s must then be
// considered unspecified (it is still NUL terminated at its original end).
//
static int urldecode(char *s)
{
  char *out = s;

  while (*s != '\0')
    {
      if (*s == '%')
        {
          // <ctype.h> functions require a value representable as
          // unsigned char: a plain (possibly signed) char holding a
          // byte >= 0x80 would be undefined behaviour.
          const unsigned char high = (unsigned char)s[1];
          if (!isxdigit(high))
            return 0;
          // s[1] is a digit, hence not the terminator: s[2] is readable.
          const unsigned char low = (unsigned char)s[2];
          if (!isxdigit(low))
            return 0;
          *out = (char)((hex_digit_value(high) << 4) | hex_digit_value(low));
          s += 2;
        }
      else if (*s == '+')
        {
          *out = ' ';
        }
      else
        {
          *out = *s;
        }
      s++;
      out++;
    }
  *out = '\0';
  return 1;
}
00187
00188 static void handle_param(params_t* params, char c, const char* optarg)
00189 {
00190 switch (c)
00191 {
00192 case 'v':
00193 verbose++;
00194 break;
00195 case 'B':
00196 free(params->dbfile);
00197 params->dbfile = strdup(optarg);
00198 break;
00199 case 'f':
00200 params->find = strdup(optarg);
00201 break;
00202 case 'c':
00203 params->count = (unsigned int)atoi(optarg);
00204 break;
00205 case 'd':
00206 params->base = (unsigned int)atoi(optarg);
00207 break;
00208 case 'P':
00209 params->proximity = atoi(optarg);
00210 break;
00211 case 'S':
00212 params->uniq_server = 1;
00213 break;
00214 case 'n':
00215 params->nop = 1;
00216 break;
00217 case 'l':
00218 free(params->low);
00219 params->low = strdup(optarg);
00220 break;
00221 case 'h':
00222 free(params->high);
00223 params->high = strdup(optarg);
00224 break;
00225 case 'o':
00226 free(params->occurrences);
00227 params->occurrences = strdup(optarg);
00228 break;
00229 case 'O':
00230 params->or_method = WORD_SEARCH_OR;
00231 break;
00232 case 'x':
00233 params->xml = 1;
00234 break;
00235 case 'H':
00236 params->http = 1;
00237 break;
00238 case 'M':
00239 {
00240 free(params->mifluz_config);
00241 params->mifluz_config = (char*)malloc(strlen(optarg) + 32);
00242 sprintf(params->mifluz_config, "MIFLUZ_CONFIG=%s", optarg);
00243 if(putenv(params->mifluz_config) < 0) {
00244 perror("putenv");
00245 exit(1);
00246 }
00247 }
00248 break;
00249 case 'R':
00250 params->restrict = (unsigned int)strtoul(optarg, 0, 10);
00251 break;
00252 case '?':
00253 usage();
00254 break;
00255 }
00256 }
00257
00258 int main(int ac, char **av)
00259 {
00260 params_t params;
00261
00262 params.dbfile = strdup("");
00263 params.find = 0;
00264 params.count = 10;
00265 params.base = 0;
00266 params.uniq_server = 0;
00267 params.proximity = WORD_SEARCH_DEFAULT_PROXIMITY;
00268 params.nop = 0;
00269 params.low = strdup("");
00270 params.high = strdup("");
00271 params.occurrences = strdup("");
00272 params.bounded = 0;
00273 params.xml = 0;
00274 params.http = 0;
00275 params.or_method = WORD_SEARCH_OPTIONAL;
00276 params.mifluz_config = strdup("");
00277 params.restrict = 0;
00278
00279 if(getenv("QUERY_STRING")) {
00280 StringList fields(getenv("QUERY_STRING"), "&");
00281 for(int i = 0; i < fields.Count(); i++) {
00282 const char* field = fields[i];
00283 StringList pair(field, "=");
00284 if(pair.Count() != 2 && pair.Count() != 1) {
00285 fprintf(stderr, "%s should match .+=.*\n", field);
00286 exit(1);
00287 }
00288
00289
00290
00291 if(pair.Count() == 2) {
00292 const char* name = pair[0];
00293 char* value = pair[1];
00294 if(!urldecode(value)) {
00295 fprintf(stderr, "mifluzsearch: unable to decode %s from QUERY_STRING %s\n", value, getenv("QUERY_STRING"));
00296 exit(1);
00297 }
00298 handle_param(¶ms, name[0], value);
00299 }
00300 }
00301 params.http = params.xml = 1;
00302 } else {
00303 int c;
00304 extern char *optarg;
00305
00306 while ((c = getopt(ac, av, "vB:f:c:SP:nR:l:h:o:xM:HOd:")) != -1)
00307 {
00308 handle_param(¶ms, c, optarg);
00309 }
00310 }
00311
00312 if(dosearch(¶ms) < 0)
00313 exit(1);
00314
00315 if(params.find) free(params.find);
00316 free(params.dbfile);
00317 free(params.low);
00318 free(params.high);
00319 free(params.occurrences);
00320 free(params.mifluz_config);
00321 }
00322
00323 static int dosearch(params_t* params)
00324 {
00325 clock_t start_time;
00326 clock_t end_time;
00327
00328 if((start_time = clock()) == (time_t)-1) {
00329 perror("start clock");
00330 return -1;
00331 }
00332
00333 WordContext* context = new WordContext();
00334 Configuration& config = context->GetConfiguration();
00335
00336 if(!context) {
00337 fprintf(stderr, "search: cannot create context\n");
00338 return -1;
00339 }
00340
00341
00342
00343
00344 if(verbose > 1) {
00345 String tmp;
00346 tmp << (verbose - 1);
00347 config.Add("wordlist_verbose", tmp);
00348 context->ReInitialize();
00349 }
00350
00351
00352
00353
00354 if(params->dbfile[0] == '\0') {
00355 printf("missing -B option\n");
00356 usage();
00357 }
00358
00359 WordList *words = context->List();
00360 words->Open(params->dbfile, O_RDONLY);
00361
00362
00363
00364
00365
00366 if(params->occurrences[0]) {
00367 unsigned int occurrences = 0;
00368 words->Noccurrence(params->occurrences, occurrences);
00369 printf("%s occurs %d times\n", params->occurrences, occurrences);
00370 exit(0);
00371 }
00372
00373 if(!params->find) {
00374 printf("missing -f option\n");
00375 usage();
00376 }
00377
00378 WordTree* expr;
00379 MifluzSearchInput input;
00380 {
00381 input.BufferSet(params->find, strlen(params->find));
00382 input.Verbose(verbose);
00383 input.or_method = params->or_method;
00384 input.maximum_word_length = config.Value("wordlist_maximum_word_length", 25);
00385 input.words = words;
00386 search_parse(&input);
00387 expr = input.query;
00388 expr->Verbose(verbose);
00389 }
00390
00391
00392
00393
00394
00395 if(params->nop) {
00396 printf("%s\n", (char*)expr->Get().get());
00397 exit(0);
00398 }
00399
00400 int* document = 0;
00401 int document_length = 0;
00402
00403
00404
00405
00406 {
00407 int location = -1;
00408 int nfields = words->GetContext()->GetKeyInfo().nfields;
00409
00410 StringList fields(config.Find("wordlist_wordkey_document"), "\t ");
00411 document_length = fields.Count();
00412 if(document_length > 0) {
00413 if(document_length > nfields - 1) {
00414 fprintf(stderr, "wordlist_wordkey_document has more fields than the authorized maximum (%d)\n", nfields - 1);
00415 return -1;
00416 }
00417
00418 document = new int[document_length];
00419 for(int i = 0; i < document_length; i++) {
00420 if(!fields[i]) {
00421 fprintf(stderr, "wordlist_wordkey_document unexpected null field returned at position %d \n", i);
00422 return -1;
00423 }
00424 document[i] = atoi(fields[i]);
00425 }
00426
00427 if((location = config.Value("wordlist_wordkey_location", -1)) == -1) {
00428 fprintf(stderr, "wordlist_wordkey_location must be set in configuration to define the structure of the key\n");
00429 exit(0);
00430 }
00431
00432 int uniq = config.Value("wordlist_wordkey_uniq", 0);
00433 if(params->uniq_server && !uniq) {
00434 fprintf(stderr, "wordlist_wordkey_uniq not set or 0, -S is useless\n");
00435 }
00436
00437 WordTreeArg arg(words, uniq, params->uniq_server, params->proximity, document, document_length, location);
00438 if(expr->Prepare(arg) != OK)
00439 return -1;
00440 } else {
00441 fprintf(stderr, "wordlist_wordkey_document and wordlist_wordkey_location must be set in configuration to define the structure of the key\n");
00442 return -1;
00443 }
00444 }
00445
00446
00447
00448
00449 if(params->restrict) {
00450 if(params->low[0] || params->high[0]) {
00451 fprintf(stderr, "-R number will override -l and -h\n");
00452 }
00453
00454 WordKey restrict(context);
00455 int uniq = config.Value("wordlist_wordkey_uniq", 0);
00456
00457 restrict.Set(uniq, params->restrict);
00458 free(params->low);
00459 params->low = strdup((char*)restrict.Get());
00460
00461 if(restrict.Overflow(uniq, 1)) {
00462 if(verbose) fprintf(stderr, "-R %d overflow\n", params->restrict);
00463 } else {
00464 restrict.Get(uniq)++;
00465 free(params->high);
00466 params->high = strdup((char*)restrict.Get());
00467 }
00468
00469 if(verbose > 1) {
00470 fprintf(stderr, "-R %d => -l '%s' -h '%s'\n", params->restrict, params->low, params->high);
00471 }
00472 }
00473
00474
00475
00476
00477 String low_string;
00478 String high_string;
00479 {
00480 WordKey low(context);
00481 WordKey high(context);
00482 if(params->low[0]) {
00483 low.Set(params->low);
00484 low_string << "L" << params->low;
00485 }
00486 if(params->high[0]) {
00487 high.Set(params->high);
00488 high_string << "H" << params->high;
00489 }
00490 if(params->low[0] || params->high[0])
00491 if(expr->Bounds(low, high) != OK)
00492 return -1;
00493 }
00494
00495 WordSearch* search = new WordSearch(words);
00496 search->Verbose(verbose);
00497
00498
00499
00500
00501 search->limit_count = params->count;
00502 search->limit_base = params->base;
00503 if(params->base % params->count) {
00504 fprintf(stderr, "mifluzsearch: -d number must be a multiple of -c number\n");
00505 return -1;
00506 }
00507
00508
00509
00510
00511 {
00512 WordResults* results = new WordResults(context);
00513
00514 results->Verbose(verbose);
00515
00516 results->KeySemantic(expr->key_semantic);
00517
00518 if(search->SetResults(results) != OK)
00519 return -1;
00520
00521 String nameFile;
00522 nameFile=nameFile+String("Cmifluz_")+ expr->search +
00523 String(params->uniq_server ? "U" : "") +
00524 low_string +
00525 high_string;
00526 nameFile.replace(' ', '1');
00527 nameFile.replace('(', '1');
00528 nameFile.replace(')', '1');
00529 nameFile.replace('"', '1');
00530 nameFile.replace('<', '1');
00531 nameFile.replace('>', '1');
00532 nameFile.replace('\t', '1');
00533 if(results->Open(nameFile) != OK)
00534 return -1;
00535 }
00536
00537
00538
00539 search->expr = expr;
00540 WordMatches* matches = search->Search();
00541
00542
00543
00544
00545 search->GetResults()->Close();
00546
00547 if((end_time = clock()) == (time_t)-1) {
00548 perror("end clock");
00549 return -1;
00550 }
00551
00552
00553
00554 clock_t elapsed_time = (end_time - start_time) / 1000;
00555
00556
00557
00558
00559 if(params->xml) {
00560 if(params->http) {
00561 printf("Content-Type: text/html\n\n");
00562 }
00563 printf("<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ?>\n");
00564 printf("<searchresults>\n");
00565 if(matches) {
00566 unsigned int i;
00567 for(i = 0; i < matches->length; i++) {
00568 const WordMatch& match = *matches->matches[i];
00569 printf("<match>");
00570 printf("<document>");
00571 for(int j = 0; j < document_length; j++) {
00572 printf("%u ", match.match[document[j]]);
00573 }
00574 printf("</document>");
00575 if(!match.info.empty()) {
00576 printf("<info>");
00577 printf("%s", (const char*)match.info);
00578 printf("</info>");
00579 }
00580 printf("</match>\n");
00581 }
00582 printf("<count>%d</count>\n", search->matches_total);
00583 printf("<time>%lu</time>\n", elapsed_time);
00584 {
00585 ListCursor cursor;
00586 String* word;
00587 printf("<words>\n");
00588 printf("\t<verbatim>");
00589 for(input.words_verbatim.Start_Get(cursor); (word = (String*)input.words_verbatim.Get_Next(cursor)); ) {
00590 printf("%s ", (char*)word->get());
00591 }
00592 printf("</verbatim>\n");
00593 printf("\t<unaccent>");
00594 for(input.words_unaccent.Start_Get(cursor); (word = (String*)input.words_unaccent.Get_Next(cursor)); ) {
00595 printf("%s ", (char*)word->get());
00596 }
00597 printf("</unaccent>\n");
00598 printf("</words>\n");
00599 }
00600 printf("<base>%d</base>\n", search->limit_base);
00601 delete matches;
00602 }
00603 printf("</searchresults>\n");
00604 } else {
00605 if(matches) {
00606 unsigned int i;
00607 for(i = 0; i < matches->length; i++) {
00608 const WordMatch& match = *matches->matches[i];
00609 printf("match: %s\n", (char*)match.Get());
00610 }
00611 printf("count: %d\n", search->matches_total);
00612 printf("time: %lu\n", elapsed_time);
00613 printf("base: %d\n", search->limit_base);
00614 {
00615 ListCursor cursor;
00616 String* word;
00617 printf("words:");
00618 for(input.words_verbatim.Start_Get(cursor); (word = (String*)input.words_verbatim.Get_Next(cursor)); ) {
00619 printf(" %s", (char*)word->get());
00620 }
00621 printf("\n");
00622 }
00623 delete matches;
00624 } else {
00625 printf("match: none\n");
00626 }
00627 }
00628
00629
00630
00631
00632 delete search;
00633 delete words;
00634 delete context;
00635 delete [] document;
00636
00637 return 0;
00638 }
00639
00640
00641
00642
00643
//
// Print the command line synopsis and the list of options on stdout,
// then terminate the process with exit status 1. Never returns.
//
static void usage()
{
  // None of these lines contains a conversion specification, so they
  // can be written verbatim.
  static const char* const help[] = {
    "usage:\tmifluzsearch -f words [options]\n",
    "\tmifluzsearch -o word [options]\n",
    "\tQUERY_STRING='...' mifluzsearch\n",
    "Options:\n",
    "\t-v\t\tIncreases the verbosity.\n",
    "\t-M config_file\tUse <config_file> instead of MIFLUZ_CONFIG env.\n",
    "\t-B dbfile\tUse <dbfile> as a db file name (default test).\n",
    "\t-f expr\t\tAltavista search expression.\n",
    "\t-x\t\tXML output.\n",
    "\t-H\t\tHTML headers.\n",
    "\t-O\t\tUse WordTreeOr instead of WordTreeOptional.\n",
    "\t-o <word>\treturn the number of occurrences of <word>\n",
    "\t\t\tSee WordParser comments in source for more information.\n",
    "\t-c number\tRetrieve at most this number documents.\n",
    "\t-d number\tIndex of the first document.\n",
    "\t-n\t\tOnly parse the search expression and print it.\n",
    "\t-P proximity\tUse with near/optional, proximity tolerance is <proximity>\n",
    "\t\t\tif negative order of terms is not meaningful\n",
    "\t\t\t(default 1).\n",
    "\t-S\t\tReturn at most one match per server.\n",
    "\t-R <value>\tcompute -l and -h according to wordlist_wordkey_uniq.\n",
    "\t-l <key>\tlow bound.\n",
    "\t-h <key>\thigh bound.\n"
  };
  const unsigned int help_lines = sizeof(help) / sizeof(help[0]);

  for(unsigned int line = 0; line < help_lines; line++) {
    fputs(help[line], stdout);
  }

  exit(1);
}