00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef MF_DICTIONARY_H
00024 #define MF_DICTIONARY_H
00025
#include "mfstream.h"
#include "htable.h"
#include <cstdio>
#include <cstring>
#include <iostream>
00030
00031
00032 #define MAX_WORD 1000
00033 #define DICTIONARY_LOAD_FACTOR 2.0
00034
00035
00036 #ifndef GROWTH_STEP
00037 #define GROWTH_STEP 1.5
00038 #endif
00039
00040 #ifndef DICT_INITSIZE
00041 #define DICT_INITSIZE 100000
00042 #endif
00043
00044
00045 #ifndef BOS_
00046 #define BOS_ "<s>"
00047 #endif
00048
00049
00050
00051 #ifndef EOS_
00052 #define EOS_ "</s>"
00053 #endif
00054
00055
00056 #ifndef BOD_
00057 #define BOD_ "<d>"
00058 #endif
00059
00060
00061 #ifndef EOD_
00062 #define EOD_ "</d>"
00063 #endif
00064
00065
00066
00067 #ifndef OOV_
00068 #define OOV_ "<unk>"
00069 #endif
00070
00071 typedef struct {
00072 const char *word;
00073 int code;
00074 long long freq;
00075 } dict_entry;
00076
00077 typedef htable<char*> HASHTABLE_t;
00078
00079 class strstack;
00080
00081 class dictionary
00082 {
00083 strstack *st;
00084 dict_entry *tb;
00085 HASHTABLE_t *htb;
00086 int n;
00087 long long N;
00088 int lim;
00089 int oov_code;
00090 char ifl;
00091 int dubv;
00092 float load_factor;
00093 char* oov_str;
00094
00095 public:
00096
00097 friend class dictionary_iter;
00098
00099 dictionary* oovlex;
00100
00101 inline int dub() {
00102 return dubv;
00103 }
00104
00105 inline int dub(int value) {
00106 return dubv=value;
00107 }
00108
00109 inline const char *OOV() {
00110 return (char*) OOV_;
00111 }
00112
00113 inline const char *BoS() {
00114 return (char*) BOS_;
00115 }
00116
00117 inline const char *EoS() {
00118 return (char*) EOS_;
00119 }
00120
00121 inline const char *BoD() {
00122 return (char*) BOD_;
00123 }
00124
00125 inline const char *EoD() {
00126 return (char*) EOD_;
00127 }
00128
00129 inline int oovcode(int v=-1) {
00130 return oov_code=(v>=0?v:oov_code);
00131 }
00132
00133 inline int incflag() {
00134 return ifl;
00135 }
00136 inline int incflag(int v) {
00137 return ifl=v;
00138 }
00139
00140 int getword(fstream& inp , char* buffer);
00141 int isprintable(char* w) {
00142 char buffer[MAX_WORD];
00143 sprintf(buffer,"%s",w);
00144 return strcmp(w,buffer)==0;
00145 }
00146
00147 inline void genoovcode() {
00148 int c=encode(OOV());
00149 std::cerr << "OOV code is "<< c << std::endl;
00150 oovcode(c);
00151 }
00152
00153 inline void genBoScode() {
00154 int c=encode(BoS());
00155 std::cerr << "BoS code is "<< c << std::endl;
00156 }
00157
00158 inline void genEoScode() {
00159 int c=encode(EoS());
00160 std::cerr << "EoS code is "<< c << std::endl;
00161 }
00162
00163 inline int setoovrate(double oovrate) {
00164 encode(OOV());
00165 int oovfreq=(int)(oovrate * totfreq());
00166 std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl;
00167 return freq(oovcode(),oovfreq);
00168 }
00169
00170
00171 inline long long incfreq(int code,long long value) {
00172 N+=value;
00173 return tb[code].freq+=value;
00174 }
00175
00176 inline long long multfreq(int code,double value) {
00177 N+=(long long)(value * tb[code].freq)-tb[code].freq;
00178 return tb[code].freq=(long long)(value * tb[code].freq);
00179 }
00180
00181 inline long freq(int code,long long value=-1) {
00182 if (value>=0) {
00183 N+=value-tb[code].freq;
00184 tb[code].freq=value;
00185 }
00186 return tb[code].freq;
00187 }
00188
00189 inline long long totfreq() {
00190 return N;
00191 }
00192 inline float set_load_factor(float value) {
00193 return load_factor=value;
00194 }
00195
00196 void grow();
00197 void sort();
00198
00199 dictionary(char *filename,int size=DICT_INITSIZE,float lf=DICTIONARY_LOAD_FACTOR);
00200 dictionary(dictionary* d, bool prune=false,int prunethresh=0);
00201
00202 ~dictionary();
00203 void generate(char *filename);
00204 void load(char *filename);
00205 void save(char *filename, int freqflag=0);
00206 void load(std::istream& fd);
00207 void save(std::ostream& fd);
00208
00209 void augment(dictionary *d);
00210
00211 int size() {
00212 return n;
00213 }
00214 int getcode(const char *w);
00215 int encode(const char *w);
00216 const char *decode(int c);
00217 void stat();
00218
00219 void print_curve(int curvesize, float* testOOV=NULL);
00220 float* test(int curvesize, const char *filename, int listflag=0);
00221
00222 void cleanfreq() {
00223 for (int i=0; i<n; tb[i++].freq=0) {};
00224 N=0;
00225 }
00226
00227 inline dict_entry* scan(HT_ACTION action) {
00228 return (dict_entry*) htb->scan(action);
00229 }
00230 };
00231
00232 class dictionary_iter
00233 {
00234 public:
00235 dictionary_iter(dictionary *dict);
00236 dict_entry* next();
00237 private:
00238 dictionary* m_dict;
00239 };
00240
00241 #endif
00242