mifluzsearch.cc

Go to the documentation of this file.
00001 //
00002 // NAME
00003 // search the content of an inverted index.
00004 //
00005 // SYNOPSIS
00006 //
00007 // mifluzsearch -f words [options]
00008 //
00009 // DESCRIPTION
00010 //
00011 // mifluzsearch searches a mifluz index for documents matching an
00012 // Alt*Vista expression (simple syntax).
00013 //
00014 // Debugging information interpretation. A cursor is open in the index
00015 // for every word and they are stored in a list. The list of cursors
00016 // is always processed in the same order, as a singly linked list. With
00017 // -v, each block is an individual action on behalf of the word shown
00018 // on the first line. The last line of the block is the conclusion of
00019 // the action described in the block. REDO means the same cursor must
00020 // be examined again because the conditions have changed. RESTART means
00021 // we go back to the first cursor in the list because it may not 
00022 // match the new conditions anymore. NEXT means the cursor and all
00023 // the cursors before it match the conditions and we may proceed to
00024 // the next cursor. ATEND means the cursor cannot match the conditions
00025 // because it is at the end of the index.
00026 //
00027 // ENVIRONMENT
00028 //
00029 // <b>MIFLUZ_CONFIG</b>
00030 // file name of configuration file read by WordContext(3). Defaults to
00031 // <b>~/.mifluz.</b> 
00032 // 
00033 // 
00034 // END
00035 //
00036 // mifluzsearch: Sample implementation of search algorithms using
00037 //         a mifluz inverted index. 
00038 //
00039 //         Each class is documented in the class definition. Before
00040 //         each method declaration a comment explains the semantics of
00041 //         the method. In the method definition, comments in the code
00042 //         may contain additional information.
00043 //
00044 //         Each virtual function is documented in the base class, not
00045 //         in the derived classes except for semantic differences.
00046 //         
00047 //         The class tree is:
00048 //         
00049 //         WordKeySemantic
00050 //
00051 //         WordExclude
00052 //           WordExcludeMask
00053 //             WordPermute
00054 //
00055 //         WordSearch
00056 //
00057 //         WordMatch
00058 //
00059 //         WordTree
00060 //           WordTreeOperand
00061 //             WordTreeOptional
00062 //              WordTreeOr
00063 //              WordTreeAnd
00064 //              WordTreeNear
00065 //             WordTreeMandatory
00066 //             WordTreeNot
00067 //           WordTreeLiteral
00068 //
00069 //         WordParser
00070 //
00071 // Part of the ht://Dig package   <http://www.htdig.org/>
00072 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
00073 // For copyright details, see the file COPYING in your distribution
00074 // or the GNU General Public License version 2 or later
00075 // <http://www.gnu.org/copyleft/gpl.html>
00076 //
00077 // $Id: mifluzsearch_8cc-source.html,v 1.1 2008/06/08 10:20:34 sebdiaz Exp $
00078 //
00079 
00080 #ifdef HAVE_CONFIG_H
00081 #include <config.h>
00082 #endif /* HAVE_CONFIG_H */
00083 
00084 #ifdef HAVE_UNISTD_H
00085 #include <unistd.h>
00086 #endif /* HAVE_UNISTD_H */
00087 
00088 #if TIME_WITH_SYS_TIME
00089 # include <sys/time.h>
00090 # include <time.h>
00091 #else
00092 # if HAVE_SYS_TIME_H
00093 #  include <sys/time.h>
00094 # else
00095 #  include <time.h>
00096 # endif
00097 #endif
00098 
00099 // If we have this, we probably want it.
00100 #ifdef HAVE_GETOPT_H
00101 #include <getopt.h>
00102 #endif /* HAVE_GETOPT_H */
00103 #ifdef HAVE_MALLOC_H
00104 #include <malloc.h>
00105 #endif /* HAVE_MALLOC_H */
00106 #include <stdlib.h>
00107 #include <ctype.h>
00108 
00109 #include <htString.h>
00110 #include <WordList.h>
00111 #include <WordContext.h>
00112 #include <WordCursorOne.h>
00113 #include <HtMaxMin.h>
00114 #include <WordListOne.h>
00115 #include <WordDict.h>
00116 
00117 #include <mifluzsearch.h>
00118 #include <WordTree.h>
00119 #include <WordSearch.h>
00120 
00121 //
00122 // Verbosity level: incremented once for each -v option (see handle_param).
00123 // Values above 1 are also handed to the library as wordlist_verbose (see dosearch).
00124 static int verbose = 0;
00125 
00126 // ************************* main loop implementation ********************
00127 
//
// Every option accepted by mifluzsearch. The fields are filled by
// handle_param(), either from the command line or from QUERY_STRING,
// and read by dosearch(). All char* members that are set hold heap
// strings owned by this structure.
//
struct params_t
{
  char *dbfile;            // -B: name of the index file, "" while unset
  char *find;              // -f: search expression, 0 while unset
  unsigned int count;      // -c: maximum number of documents to return
  int uniq_server;         // -S: non zero to return one match per server
  int proximity;           // -P: proximity tolerance
  unsigned int base;       // -d: index of the first document to return
  int nop;                 // -n: non zero to only parse and print the expression
  int exclude;             // not set or read by the code in this file
  char *low;               // -l: low bound key, "" while unset
  char *high;              // -h: high bound key, "" while unset
  char *occurrences;       // -o: word whose occurrences are counted, "" while unset
  int bounded;             // initialised to 0 by main(), otherwise unused in this file
  int xml;                 // -x: non zero for XML output
  int http;                // -H: non zero to print the HTTP header before the XML
  int or_method;           // -O: WORD_SEARCH_OR, otherwise WORD_SEARCH_OPTIONAL
  char *mifluz_config;     // -M: the "MIFLUZ_CONFIG=<file>" string given to putenv()
  unsigned int restrict;   // -R: value used to compute the -l and -h bounds
};
00152 
00153 static int dosearch(params_t* params); // runs the query in params: 0 on success, -1 on error (the -o and -n modes exit directly)
00154 
00155 //
00156 // Print the list of options on stdout and terminate the process with status 1.
00157 //
00158 static void usage();
00159 
//
// Value of the hexadecimal digit c (0..15), or -1 if c is not one.
// The character is converted to unsigned char before it reaches the
// <ctype.h> functions: handing them a negative char (any byte >= 0x80
// where char is signed) is undefined behaviour.
//
static int urldecode_hex(char c)
{
  unsigned char uc = (unsigned char)c;

  if (!isxdigit(uc))
    return -1;
  if (isdigit(uc))
    return uc - '0';
  /* Clearing bit 0x20 folds 'a'..'f' onto 'A'..'F'. */
  return (uc & 0xdf) - 'A' + 10;
}

//
// Decode the application/x-www-form-urlencoded string s in place:
// %XX becomes the byte with hexadecimal value XX and '+' becomes a
// space, every other character is copied unchanged. The result is
// never longer than the input.
//
// Returns 1 on success. Returns 0 if a '%' is not followed by two
// hexadecimal digits; in that case s holds the part decoded so far
// followed by the not yet decoded remainder.
//
// Note that %00 decodes to a NUL byte, which ends the string for the
// caller.
//
static int urldecode(char *s)
{
  char *out = s;

  while (*s != '\0')
  {
    if (*s == '%')
    {
      /*
       * s[2] is read only when s[1] is a hex digit, hence not the
       * terminating NUL, so both reads stay inside the string.
       */
      int hi = urldecode_hex(s[1]);
      if (hi < 0)
        return 0;
      int lo = urldecode_hex(s[2]);
      if (lo < 0)
        return 0;
      *out = (char)((hi << 4) | lo);
      s += 2;
    }
    else if (*s == '+')
      *out = ' ';
    else
      *out = *s;
    s++;
    out++;
  }
  *out = '\0';
  return 1;
}
00187 
00188 static void handle_param(params_t* params, char c, const char* optarg) 
00189 {
00190   switch (c)
00191     {
00192     case 'v':
00193       verbose++;
00194       break;
00195     case 'B':
00196       free(params->dbfile);
00197       params->dbfile = strdup(optarg);
00198       break;
00199     case 'f':
00200       params->find = strdup(optarg);
00201       break;
00202     case 'c':
00203       params->count = (unsigned int)atoi(optarg);
00204       break;
00205     case 'd':
00206       params->base = (unsigned int)atoi(optarg);
00207       break;
00208     case 'P':
00209       params->proximity = atoi(optarg);
00210       break;
00211     case 'S':
00212       params->uniq_server = 1;
00213       break;
00214     case 'n':
00215       params->nop = 1;
00216       break;
00217     case 'l':
00218       free(params->low);
00219       params->low = strdup(optarg);
00220       break;
00221     case 'h':
00222       free(params->high);
00223       params->high = strdup(optarg);
00224       break;
00225     case 'o':
00226       free(params->occurrences);
00227       params->occurrences = strdup(optarg);
00228       break;
00229     case 'O':
00230       params->or_method = WORD_SEARCH_OR;
00231       break;
00232     case 'x':
00233       params->xml = 1;
00234       break;
00235     case 'H':
00236       params->http = 1;
00237       break;
00238     case 'M':
00239       {
00240         free(params->mifluz_config);
00241         params->mifluz_config = (char*)malloc(strlen(optarg) + 32);
00242         sprintf(params->mifluz_config, "MIFLUZ_CONFIG=%s", optarg);
00243         if(putenv(params->mifluz_config) < 0) {
00244           perror("putenv");
00245           exit(1);
00246         }
00247       }
00248       break;
00249     case 'R':
00250       params->restrict = (unsigned int)strtoul(optarg, 0, 10);
00251       break;
00252     case '?':
00253       usage();
00254       break;
00255     }
00256 }
00257 
00258 int main(int ac, char **av)
00259 {
00260   params_t              params;
00261 
00262   params.dbfile = strdup("");
00263   params.find = 0;
00264   params.count = 10;
00265   params.base = 0;
00266   params.uniq_server = 0;
00267   params.proximity = WORD_SEARCH_DEFAULT_PROXIMITY;
00268   params.nop = 0;
00269   params.low = strdup("");
00270   params.high = strdup("");
00271   params.occurrences = strdup("");
00272   params.bounded = 0;
00273   params.xml = 0;
00274   params.http = 0;
00275   params.or_method = WORD_SEARCH_OPTIONAL;
00276   params.mifluz_config = strdup("");
00277   params.restrict = 0;
00278 
00279   if(getenv("QUERY_STRING")) {
00280     StringList fields(getenv("QUERY_STRING"), "&");
00281     for(int i = 0; i < fields.Count(); i++) {
00282       const char* field = fields[i];
00283       StringList pair(field, "=");
00284       if(pair.Count() != 2 && pair.Count() != 1) {
00285         fprintf(stderr, "%s should match .+=.*\n", field);
00286         exit(1);
00287       }
00288       //
00289       // name= with no value is silently ignored
00290       //
00291       if(pair.Count() == 2) {
00292         const char* name = pair[0];
00293         char* value = pair[1];
00294         if(!urldecode(value)) {
00295           fprintf(stderr, "mifluzsearch: unable to decode %s from QUERY_STRING %s\n", value, getenv("QUERY_STRING"));
00296           exit(1);
00297         }
00298         handle_param(&params, name[0], value);
00299       }
00300     }
00301     params.http = params.xml = 1;
00302   } else {
00303     int                 c;
00304     extern char         *optarg;
00305 
00306     while ((c = getopt(ac, av, "vB:f:c:SP:nR:l:h:o:xM:HOd:")) != -1)
00307       {
00308         handle_param(&params, c, optarg);
00309       }
00310   }
00311   
00312   if(dosearch(&params) < 0)
00313     exit(1);
00314 
00315   if(params.find) free(params.find);
00316   free(params.dbfile);
00317   free(params.low);
00318   free(params.high);
00319   free(params.occurrences);
00320   free(params.mifluz_config);
00321 }
00322 
00323 static int dosearch(params_t* params)
00324 {
00325   clock_t start_time;
00326   clock_t end_time;
00327 
00328   if((start_time = clock()) == (time_t)-1) {
00329     perror("start clock");
00330     return -1;
00331   }
00332 
00333   WordContext* context = new WordContext();
00334   Configuration& config = context->GetConfiguration();
00335 
00336   if(!context) {
00337     fprintf(stderr, "search: cannot create context\n");
00338     return -1;
00339   }
00340 
00341   //
00342   // Forward command line verbosity to htword library.
00343   //
00344   if(verbose > 1) {
00345     String tmp;
00346     tmp << (verbose - 1);
00347     config.Add("wordlist_verbose", tmp);
00348     context->ReInitialize();
00349   }
00350 
00351   //
00352   // Prepare the index (-B).
00353   //
00354   if(params->dbfile[0] == '\0') {
00355     printf("missing -B option\n");
00356     usage();
00357   }
00358 
00359   WordList *words = context->List();
00360   words->Open(params->dbfile, O_RDONLY);
00361 
00362 
00363   //
00364   // Return the number of occurrences of a given word
00365   //
00366   if(params->occurrences[0]) {
00367     unsigned int occurrences = 0;
00368     words->Noccurrence(params->occurrences, occurrences);
00369     printf("%s occurs %d times\n", params->occurrences, occurrences);
00370     exit(0);
00371   }
00372 
00373   if(!params->find) {
00374     printf("missing -f option\n");
00375     usage();
00376   }
00377 
00378   WordTree* expr;
00379   MifluzSearchInput input;
00380   {
00381     input.BufferSet(params->find, strlen(params->find));
00382     input.Verbose(verbose);
00383     input.or_method = params->or_method;
00384     input.maximum_word_length = config.Value("wordlist_maximum_word_length", 25);
00385     input.words = words;
00386     search_parse(&input);
00387     expr = input.query;
00388     expr->Verbose(verbose);
00389   }
00390 
00391 
00392   //
00393   // Try the query parser alone
00394   //
00395   if(params->nop) {
00396     printf("%s\n", (char*)expr->Get().get());
00397     exit(0);
00398   }
00399 
00400   int* document = 0;
00401   int document_length = 0;
00402 
00403   //
00404   // Define the semantic of the key
00405   //
00406   {
00407     int location = -1;
00408     int nfields = words->GetContext()->GetKeyInfo().nfields;
00409 
00410     StringList fields(config.Find("wordlist_wordkey_document"), "\t ");
00411     document_length = fields.Count();
00412     if(document_length > 0) {
00413       if(document_length > nfields - 1) {
00414         fprintf(stderr, "wordlist_wordkey_document has more fields than the authorized maximum (%d)\n", nfields - 1);
00415         return -1;
00416       }
00417 
00418       document = new int[document_length];
00419       for(int i = 0; i < document_length; i++) {
00420         if(!fields[i]) {
00421           fprintf(stderr, "wordlist_wordkey_document unexpected null field returned at position %d \n", i);
00422           return -1;
00423         }
00424         document[i] = atoi(fields[i]);
00425       }
00426 
00427       if((location = config.Value("wordlist_wordkey_location", -1)) == -1) {
00428         fprintf(stderr, "wordlist_wordkey_location must be set in configuration to define the structure of the key\n");
00429         exit(0);
00430       }
00431       
00432       int uniq = config.Value("wordlist_wordkey_uniq", 0);
00433       if(params->uniq_server && !uniq) {
00434         fprintf(stderr, "wordlist_wordkey_uniq not set or 0, -S is useless\n");
00435       }
00436 
00437       WordTreeArg arg(words, uniq, params->uniq_server, params->proximity, document, document_length, location);
00438       if(expr->Prepare(arg) != OK)
00439         return -1;
00440     } else {
00441       fprintf(stderr, "wordlist_wordkey_document and wordlist_wordkey_location must be set in configuration to define the structure of the key\n");
00442       return -1;
00443     }
00444   }
00445 
00446   //
00447   // Calculate low and high bounds based on -R and wordlist_wordkey_uniq
00448   //
00449   if(params->restrict) {
00450     if(params->low[0] || params->high[0]) {
00451       fprintf(stderr, "-R number will override -l and -h\n");
00452     }
00453 
00454     WordKey restrict(context);
00455     int uniq = config.Value("wordlist_wordkey_uniq", 0);
00456 
00457     restrict.Set(uniq, params->restrict);
00458     free(params->low);
00459     params->low = strdup((char*)restrict.Get());
00460 
00461     if(restrict.Overflow(uniq, 1)) {
00462       if(verbose) fprintf(stderr, "-R %d overflow\n", params->restrict); 
00463     } else {
00464       restrict.Get(uniq)++;
00465       free(params->high);
00466       params->high = strdup((char*)restrict.Get());
00467     }
00468 
00469     if(verbose > 1) {
00470       fprintf(stderr, "-R %d => -l '%s' -h '%s'\n", params->restrict, params->low, params->high);
00471     }
00472   }
00473   
00474   //
00475   // Set lower and higher bounds if appropriate
00476   //
00477   String low_string;
00478   String high_string;
00479   {
00480     WordKey low(context);
00481     WordKey high(context);
00482     if(params->low[0]) {
00483       low.Set(params->low);
00484       low_string << "L" << params->low;
00485     }
00486     if(params->high[0]) {
00487       high.Set(params->high);
00488       high_string << "H" << params->high;
00489     }
00490     if(params->low[0] || params->high[0])
00491       if(expr->Bounds(low, high) != OK)
00492         return -1;
00493   }
00494 
00495   WordSearch* search = new WordSearch(words);
00496   search->Verbose(verbose);
00497 
00498   //
00499   // Forward query options to WordSearch object
00500   //
00501   search->limit_count = params->count;                  // -c
00502   search->limit_base = params->base;                    // -d
00503   if(params->base % params->count) {
00504     fprintf(stderr, "mifluzsearch: -d number must be a multiple of -c number\n");
00505     return -1;
00506   }
00507 
00508   //
00509   // Create or re-use the list of results
00510   //
00511   {
00512     WordResults* results = new WordResults(context);
00513 
00514     results->Verbose(verbose);
00515 
00516     results->KeySemantic(expr->key_semantic);
00517 
00518     if(search->SetResults(results) != OK)
00519       return -1;
00520 
00521         String nameFile;
00522     nameFile=nameFile+String("Cmifluz_")+       expr->search +
00523                      String(params->uniq_server ? "U" : "") +
00524                      low_string +
00525                      high_string;
00526         nameFile.replace(' ', '1');
00527         nameFile.replace('(', '1');
00528         nameFile.replace(')', '1');
00529         nameFile.replace('"', '1');
00530         nameFile.replace('<', '1');
00531         nameFile.replace('>', '1');
00532         nameFile.replace('\t', '1');
00533     if(results->Open(nameFile) != OK)
00534       return -1;
00535   }
00536   //
00537   // Perform the search (-f)
00538   //
00539   search->expr = expr;
00540   WordMatches* matches = search->Search();
00541 
00542   //
00543   // Close the list of results
00544   //
00545   search->GetResults()->Close();
00546 
00547   if((end_time = clock()) == (time_t)-1) {
00548     perror("end clock");
00549     return -1;
00550   }
00551   //
00552   // Elapsed time in milli seconds
00553   //
00554   clock_t elapsed_time = (end_time - start_time) / 1000;
00555   
00556   //
00557   // Display results, if any.
00558   //
00559   if(params->xml) {
00560     if(params->http) {
00561       printf("Content-Type: text/html\n\n");
00562     }
00563     printf("<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ?>\n");
00564     printf("<searchresults>\n");
00565     if(matches) {
00566       unsigned int i;
00567       for(i = 0; i < matches->length; i++) {
00568         const WordMatch& match = *matches->matches[i];
00569         printf("<match>");
00570         printf("<document>");
00571         for(int j = 0; j < document_length; j++) {
00572           printf("%u ", match.match[document[j]]);
00573         }
00574         printf("</document>");
00575         if(!match.info.empty()) {
00576           printf("<info>");
00577           printf("%s", (const char*)match.info);
00578           printf("</info>");
00579         }
00580         printf("</match>\n");
00581       }
00582       printf("<count>%d</count>\n", search->matches_total);
00583       printf("<time>%lu</time>\n", elapsed_time);
00584       {
00585         ListCursor cursor;
00586         String* word;
00587         printf("<words>\n");
00588         printf("\t<verbatim>");
00589         for(input.words_verbatim.Start_Get(cursor); (word = (String*)input.words_verbatim.Get_Next(cursor)); ) {
00590           printf("%s ", (char*)word->get());
00591         }
00592         printf("</verbatim>\n");
00593         printf("\t<unaccent>");
00594         for(input.words_unaccent.Start_Get(cursor); (word = (String*)input.words_unaccent.Get_Next(cursor)); ) {
00595           printf("%s ", (char*)word->get());
00596         }
00597         printf("</unaccent>\n");
00598         printf("</words>\n");
00599       }
00600       printf("<base>%d</base>\n", search->limit_base);
00601       delete matches;
00602     }
00603     printf("</searchresults>\n");
00604   } else {
00605     if(matches) {
00606       unsigned int i;
00607       for(i = 0; i < matches->length; i++) {
00608         const WordMatch& match = *matches->matches[i];
00609         printf("match: %s\n", (char*)match.Get());
00610       }
00611       printf("count: %d\n", search->matches_total);
00612       printf("time: %lu\n", elapsed_time);
00613       printf("base: %d\n", search->limit_base);
00614       {
00615         ListCursor cursor;
00616         String* word;
00617         printf("words:");
00618         for(input.words_verbatim.Start_Get(cursor); (word = (String*)input.words_verbatim.Get_Next(cursor)); ) {
00619           printf(" %s", (char*)word->get());
00620         }
00621         printf("\n");
00622       }
00623       delete matches;
00624     } else {
00625       printf("match: none\n");
00626     }
00627   }
00628 
00629   //
00630   // Cleanup
00631   //
00632   delete search;
00633   delete words;
00634   delete context;
00635   delete [] document;
00636 
00637   return 0;
00638 }
00639 
// *****************************************************************************
// void usage()
//   Print the synopsis and the list of options on stdout, then
//   terminate the process with status 1. Never returns.
//
//   The text follows what the program does: -B has no default value
//   (dosearch refuses to run without it) and the pointer to the
//   WordParser comments belongs to the search expression (-f).
//
static void usage()
{
    static const char* const lines[] = {
        "usage:\tmifluzsearch -f words [options]\n",
        "\tmifluzsearch -o word [options]\n",
        "\tQUERY_STRING='...' mifluzsearch\n",
        "Options:\n",
        "\t-v\t\tIncreases the verbosity.\n",
        "\t-M config_file\tUse <config_file> instead of MIFLUZ_CONFIG env.\n",
        "\t-B dbfile\tUse <dbfile> as a db file name (required).\n",
        "\t-f expr\t\tAltavista search expression.\n",
        "\t\t\tSee WordParser comments in source for more information.\n",
        "\t-x\t\tXML output.\n",
        "\t-H\t\tHTML headers.\n",
        "\t-O\t\tUse WordTreeOr instead of WordTreeOptional.\n",
        "\t-o <word>\treturn the number of occurrences of <word>\n",
        "\t-c number\tRetrieve at most this number documents.\n",
        "\t-d number\tIndex of the first document.\n",
        "\t-n\t\tOnly parse the search expression and print it.\n",
        "\t-P proximity\tUse with near/optional, proximity tolerance is <proximity>\n",
        "\t\t\tif negative order of terms is not meaningful\n",
        "\t\t\t(default 1).\n",
        "\t-S\t\tReturn at most one match per server.\n",
        "\t-R <value>\tcompute -l and -h according to wordlist_wordkey_uniq.\n",
        "\t-l <key>\tlow bound.\n",
        "\t-h <key>\thigh bound.\n"
    };
    const unsigned int count = sizeof(lines) / sizeof(lines[0]);

    for(unsigned int i = 0; i < count; i++)
        printf("%s", lines[i]);
    exit(1);
}

Generated on Sun Jun 8 10:56:40 2008 for GNUmifluz by  doxygen 1.5.5