//
// WordType.h
//
// NAME
// defines a word in terms of allowed characters, length etc.
//
// SYNOPSIS
//
// Only called thru WordContext::Initialize()
//
// DESCRIPTION
//
// WordType defines an indexed word and operations to validate
// a word to be indexed. All words inserted into the <i>mifluz</i> index
// are <b>Normalize</b>d before insertion. The configuration options
// give some control over the definition of a word.
//
// CONFIGURATION
//
// wordlist_locale <locale> (default C)
//   Set the locale of the program to <b>locale</b>. See setlocale(3)
//   for more information.
//
// wordlist_allow_numbers {true|false} (default false)
//   If <b>true</b> a word may contain digits. If <b>false</b> digits
//   are not considered to be part of a word and an attempt to insert
//   a word containing digits will result in an error.
//   See the <b>Normalize</b> method for more information.
//
// wordlist_minimum_word_length <number> (default 3)
//   The minimum length of a word.
//   See the <b>Normalize</b> method for more information.
//
// wordlist_maximum_word_length <number> (default 25)
//   The maximum length of a word.
//   See the <b>Normalize</b> method for more information.
//
// wordlist_allow_numbers {true|false} (default false)
//   A digit is considered a valid character within a word if
//   this configuration parameter is set to <i>true</i> otherwise
//   it is an error to insert a word containing digits.
//   See the <b>Normalize</b> method for more information.
00043 // 00044 // wordlist_truncate {true|false} <number> (default true) 00045 // If a word is too long according to 00046 // the <i>wordlist_maximum_word_length</i> it is truncated 00047 // if this configuration parameter is <i>true</i> otherwise it 00048 // is considered an invalid word. 00049 // 00050 // wordlist_lowercase {true|false} <number> (default true) 00051 // If a word contains upper case letters it is converted to lowercase 00052 // if this configuration parameter is true, otherwise it is left 00053 // untouched. 00054 // 00055 // wordlist_valid_punctuation [characters] (default none) 00056 // A list of punctuation characters that may appear in a word. 00057 // These characters will be removed from the word before insertion 00058 // in the index. 00059 // 00060 // wordlist_extra_word_characters [characters] (default none) 00061 // A list of characters that may appear in a word. These characters 00062 // are left untouched. 00063 // 00064 // END 00065 // 00066 // Part of the ht://Dig package <http://www.htdig.org/> 00067 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group 00068 // For copyright details, see the file COPYING in your distribution 00069 // or the GNU General Public License version 2 or later 00070 // <http://www.gnu.org/copyleft/gpl.html> 00071 // 00072 // $Id: WordType_8h-source.html,v 1.1 2008/06/08 10:13:25 sebdiaz Exp $ 00073 // 00074 00075 #ifndef _WordType_h 00076 #define _WordType_h 00077 00078 #include "htString.h" 00079 #include "Configuration.h" 00080 00081 // 00082 // Return values of Normalize, to get them in string form use NormalizeStatus 00083 // 00084 #define WORD_NORMALIZE_TOOLONG 0x0001 00085 #define WORD_NORMALIZE_TOOSHORT 0x0002 00086 #define WORD_NORMALIZE_CAPITAL 0x0004 00087 #define WORD_NORMALIZE_NUMBER 0x0008 00088 #define WORD_NORMALIZE_CONTROL 0x0010 00089 #define WORD_NORMALIZE_BAD 0x0020 00090 #define WORD_NORMALIZE_NULL 0x0040 00091 #define WORD_NORMALIZE_PUNCTUATION 0x0080 00092 #define 
WORD_NORMALIZE_NOALPHA 0x0100 00093 #define WORD_NORMALIZE_OK 0x4000 00094 #define WORD_NORMALIZE_NOTOK 0x8000 00095 00096 class WordType 00097 { 00098 public: 00099 // 00100 // Constructors 00101 // 00102 WordType(const Configuration& config); 00103 00104 // 00105 // Predicates 00106 // 00107 int IsChar(int c) const; 00108 int IsStrictChar(int c) const; 00109 int IsDigit(int c) const; 00110 int IsControl(int c) const; 00111 00112 // 00113 // Transformations 00114 // 00115 int StripPunctuation(String &s) const; 00116 00117 //- 00118 // Normalize a word according to configuration specifications and 00119 // builtin transformations. <b>Every</b> word inserted in the inverted 00120 // index goes thru this function. If 00121 // a word is rejected (return value has WORD_NORMALIZE_NOTOK bit set) it will not 00122 // be inserted in the index. If a word is accepted (return value has 00123 // WORD_NORMALIZE_OK bit set) it will be inserted in the index. In 00124 // addition to these two bits, informational values are stored that 00125 // give information on the processing done on the word. 00126 // The bit field values and their meanings are 00127 // as follows: 00128 // 00129 // <dl> 00130 // <dt>WORD_NORMALIZE_TOOLONG 00131 // <dd>the word length exceeds the value of 00132 // the <i>wordlist_maximum_word_length</i> configuration parameter. 00133 // <dt>WORD_NORMALIZE_TOOSHORT 00134 // <dd>the word length is smaller than the value of 00135 // the <i>wordlist_minimum_word_length</i> configuration parameter. 00136 // <dt>WORD_NORMALIZE_CAPITAL 00137 // <dd>the word contained capital letters and has been converted 00138 // to lowercase. This bit is only set 00139 // if the <i>wordlist_lowercase</i> configuration parameter 00140 // is true. 00141 // <dt>WORD_NORMALIZE_NUMBER 00142 // <dd>the word contains digits and the configuration 00143 // parameter <i>wordlist_allow_numbers</i> is set to false. 
00144 // <dt>WORD_NORMALIZE_CONTROL 00145 // <dd>the word contains control characters. 00146 // <dt>WORD_NORMALIZE_BAD 00147 // <dd>the word is listed in the file pointed by 00148 // the <i>wordlist_bad_word_list</i> configuration parameter. 00149 // <dt>WORD_NORMALIZE_NULL 00150 // <dd>the word is a zero length string. 00151 // <dt>WORD_NORMALIZE_PUNCTUATION 00152 // <dd>at least one character listed in 00153 // the <i>wordlist_valid_punctuation</i> attribute was removed 00154 // from the word. 00155 // <dt>WORD_NORMALIZE_NOALPHA 00156 // <dd>the word does not contain any alphanumerical character. 00157 // </dl> 00158 // 00159 int Normalize(String &s) const; 00160 00161 // 00162 // Error handling 00163 // 00164 //- 00165 // Returns a string explaining the return flags of the Normalize 00166 // method. 00167 // 00168 static String NormalizeStatus(int flags); 00169 00170 private: 00171 00172 String valid_punctuation; // The same as the attribute. 00173 String extra_word_characters; // The same as the attribute. 00174 char chrtypes[256]; // quick lookup table for types 00175 int minimum_length; // Minimum word length 00176 int maximum_length; // Maximum word length 00177 int allow_numbers; // True if a word may contain numbers 00178 int lowercase; // True words converted to lowercase 00179 int truncate; // True if word too long are truncated 00180 Dictionary badwords; // List of excluded words 00181 }; 00182 00183 // Bits to set in chrtypes[]: 00184 #define WORD_TYPE_ALPHA 0x01 00185 #define WORD_TYPE_DIGIT 0x02 00186 #define WORD_TYPE_EXTRA 0x04 00187 #define WORD_TYPE_VALIDPUNCT 0x08 00188 #define WORD_TYPE_CONTROL 0x10 00189 00190 // One for characters that when put together are a word 00191 // (including punctuation). 00192 inline int 00193 WordType::IsChar(int c) const 00194 { 00195 return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA|WORD_TYPE_VALIDPUNCT)) != 0; 00196 } 00197 00198 // Similar, but no punctuation characters. 
00199 inline int 00200 WordType::IsStrictChar(int c) const 00201 { 00202 return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_EXTRA)) != 0; 00203 } 00204 00205 // Reimplementation of isdigit() using the lookup table chrtypes[] 00206 inline int 00207 WordType::IsDigit(int c) const 00208 { 00209 return (chrtypes[(unsigned char)c] & WORD_TYPE_DIGIT) != 0; 00210 } 00211 00212 // Similar to IsDigit, but for iscntrl() 00213 inline int 00214 WordType::IsControl(int c) const 00215 { 00216 return (chrtypes[(unsigned char)c] & WORD_TYPE_CONTROL) != 0; 00217 } 00218 00219 // Let caller get rid of getting and holding a configuration parameter. 00220 inline int 00221 WordType::StripPunctuation(String &s) const 00222 { 00223 return s.remove(valid_punctuation); 00224 } 00225 00226 00227 #endif /* __WordType_h */