00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifdef HAVE_CONFIG_H
00021 #include "config.h"
00022 #endif
00023
00024 #include <ctype.h>
00025 #include <stdio.h>
00026 #include <locale.h>
00027
00028 #include "WordType.h"
00029
00030 WordType::WordType(const Configuration &config)
00031 {
00032 const String valid_punct = config["wordlist_valid_punctuation"];
00033 const String extra_word_chars = config["wordlist_extra_word_characters"];
00034
00035 String locale = config.Find("wordlist_locale");
00036 if(locale.empty())
00037 locale = "C";
00038 if(setlocale(LC_ALL, (char*)locale) == 0) {
00039 fprintf(stderr, "WordType::WordType: cannot set locale: ");
00040 perror("");
00041 }
00042
00043 minimum_length = config.Value("wordlist_minimum_word_length", 3);
00044 maximum_length = config.Value("wordlist_maximum_word_length", 25);
00045 allow_numbers = config.Boolean("wordlist_allow_numbers", 0);
00046 lowercase = config.Boolean("wordlist_lowercase", 1);
00047 truncate = config.Boolean("wordlist_truncate", 1);
00048
00049 extra_word_characters = extra_word_chars;
00050 valid_punctuation = valid_punct;
00051
00052 chrtypes[0] = 0;
00053 for (int i = 1; i < 256; i++)
00054 {
00055 chrtypes[i] = 0;
00056 if (isalpha(i))
00057 chrtypes[i] |= WORD_TYPE_ALPHA;
00058 if (isdigit(i))
00059 chrtypes[i] |= WORD_TYPE_DIGIT;
00060 if (iscntrl(i))
00061 chrtypes[i] |= WORD_TYPE_CONTROL;
00062 if (strchr(extra_word_chars, i))
00063 chrtypes[i] |= WORD_TYPE_EXTRA;
00064 if (strchr(valid_punct, i))
00065 chrtypes[i] |= WORD_TYPE_VALIDPUNCT;
00066 }
00067
00068 {
00069 const String filename = config["wordlist_bad_word_list"];
00070 FILE *fl = fopen(filename, "r");
00071 char buffer[1000];
00072 char *word;
00073 String new_word;
00074
00075
00076 while (fl && fgets(buffer, sizeof(buffer), fl))
00077 {
00078 word = strtok(buffer, "\r\n \t");
00079 if (word && *word)
00080 {
00081 int flags;
00082 new_word = word;
00083 if((flags = Normalize(new_word)) & WORD_NORMALIZE_NOTOK) {
00084 fprintf(stderr, "WordType::WordType: reading bad words from %s found %s, ignored because %s\n", (const char*)filename, word, (char*)NormalizeStatus(flags & WORD_NORMALIZE_NOTOK));
00085 } else {
00086 badwords.Add(new_word, 0);
00087 }
00088 }
00089 }
00090
00091 if (fl)
00092 fclose(fl);
00093 }
00094 }
00095
00096 int
00097 WordType::Normalize(String& word) const
00098 {
00099 int status = 0;
00100
00101
00102
00103
00104 if(word.empty())
00105 return status | WORD_NORMALIZE_NULL | WORD_NORMALIZE_NOTOK;
00106
00107
00108
00109
00110 if(lowercase && word.lowercase())
00111 status |= WORD_NORMALIZE_CAPITAL;
00112
00113
00114
00115
00116 if(StripPunctuation(word))
00117 status |= WORD_NORMALIZE_PUNCTUATION;
00118
00119
00120
00121
00122 if(word.length() > maximum_length) {
00123 status |= WORD_NORMALIZE_TOOLONG;
00124 if(truncate)
00125 word.chop(word.length() - maximum_length);
00126 else
00127 return status | WORD_NORMALIZE_NOTOK;
00128 }
00129
00130
00131
00132
00133 if(word.length() < minimum_length)
00134 return status | WORD_NORMALIZE_TOOSHORT | WORD_NORMALIZE_NOTOK;
00135
00136
00137
00138
00139 int alpha = 0;
00140 for(const unsigned char *p = (const unsigned char*)(const char*)(char *)word; *p; p++) {
00141 if(IsStrictChar(*p) || (allow_numbers && IsDigit(*p))) {
00142 alpha = 1;
00143 } else if(IsControl(*p)) {
00144 return status | WORD_NORMALIZE_CONTROL | WORD_NORMALIZE_NOTOK;
00145 } else if(IsDigit(*p)) {
00146 return status | WORD_NORMALIZE_NUMBER | WORD_NORMALIZE_NOTOK;
00147 }
00148 }
00149
00150
00151
00152
00153 if(!alpha) return status | WORD_NORMALIZE_NOALPHA | WORD_NORMALIZE_NOTOK;
00154
00155
00156
00157
00158 if(badwords.Exists(word))
00159 return status | WORD_NORMALIZE_BAD | WORD_NORMALIZE_NOTOK;
00160
00161
00162
00163
00164 return status | WORD_NORMALIZE_OK;
00165 }
00166
00167
00168
00169
00170 String
00171 WordType::NormalizeStatus(int flags)
00172 {
00173 String tmp;
00174
00175 if(flags & WORD_NORMALIZE_TOOLONG) tmp << "TOOLONG ";
00176 if(flags & WORD_NORMALIZE_TOOSHORT) tmp << "TOOSHORT ";
00177 if(flags & WORD_NORMALIZE_CAPITAL) tmp << "CAPITAL ";
00178 if(flags & WORD_NORMALIZE_NUMBER) tmp << "NUMBER ";
00179 if(flags & WORD_NORMALIZE_CONTROL) tmp << "CONTROL ";
00180 if(flags & WORD_NORMALIZE_BAD) tmp << "BAD ";
00181 if(flags & WORD_NORMALIZE_NULL) tmp << "NULL ";
00182 if(flags & WORD_NORMALIZE_PUNCTUATION) tmp << "PUNCTUATION ";
00183 if(flags & WORD_NORMALIZE_NOALPHA) tmp << "NOALPHA ";
00184
00185 if(tmp.empty()) tmp << "GOOD";
00186
00187 return tmp;
00188 }