00001
00002
00003 using namespace std;
00004
00005 #include <iostream>
00006 #include "cmd.h"
00007 #include "mfstream.h"
00008 #include "mempool.h"
00009 #include "dictionary.h"
00010
00011 void print_help(int TypeFlag=0){
00012 std::cerr << std::endl << "dict - extracts a dictionary" << std::endl;
00013 std::cerr << std::endl << "USAGE:" << std::endl;
00014 std::cerr << " dict -i=<inputfile> [options]" << std::endl;
00015 std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00016 std::cerr << " dict extracts a dictionary from a corpus or a dictionary." << std::endl;
00017 std::cerr << std::endl << "OPTIONS:" << std::endl;
00018 FullPrintParams(TypeFlag, 0, 1, stderr);
00019 }
00020
00021 void usage(const char *msg = 0)
00022 {
00023 if (msg){
00024 std::cerr << msg << std::endl;
00025 }
00026 else{
00027 print_help();
00028 }
00029 exit(1);
00030 }
00031
00032 int main(int argc, char **argv)
00033 {
00034 char *inp=NULL;
00035 char *out=NULL;
00036 char *testfile=NULL;
00037 char *intsymb=NULL;
00038 int freqflag=0;
00039 int sortflag=0;
00040 int curveflag=0;
00041 int curvesize=10;
00042 int listflag=0;
00043 int size=1000000;
00044 float load_factor=0;
00045
00046 int prunefreq=0;
00047 int prunerank=0;
00048
00049 bool help=false;
00050
00051 DeclareParams((char*)
00052 "InputFile", CMDSTRINGTYPE|CMDMSG, &inp, "input file (Mandatory)",
00053 "i", CMDSTRINGTYPE|CMDMSG, &inp, "input file (Mandatory)",
00054 "OutputFile", CMDSTRINGTYPE|CMDMSG, &out, "output file",
00055 "o", CMDSTRINGTYPE|CMDMSG, &out, "output file",
00056 "f", CMDBOOLTYPE|CMDMSG, &freqflag,"output word frequencies; default is false",
00057 "Freq", CMDBOOLTYPE|CMDMSG, &freqflag,"output word frequencies; default is false",
00058 "sort", CMDBOOLTYPE|CMDMSG, &sortflag,"sort dictionary by frequency; default is false",
00059 "Size", CMDINTTYPE|CMDMSG, &size, "Initial dictionary size; default is 1000000",
00060 "s", CMDINTTYPE|CMDMSG, &size, "Initial dictionary size; default is 1000000",
00061 "LoadFactor", CMDFLOATTYPE|CMDMSG, &load_factor, "set the load factor for cache; it should be a positive real value; default is 0",
00062 "lf", CMDFLOATTYPE|CMDMSG, &load_factor, "set the load factor for cache; it should be a positive real value; default is 0",
00063 "IntSymb", CMDSTRINGTYPE|CMDMSG, &intsymb, "interruption symbol",
00064 "is", CMDSTRINGTYPE|CMDMSG, &intsymb, "interruption symbol",
00065
00066 "PruneFreq", CMDINTTYPE|CMDMSG, &prunefreq, "prune words with frequency below the specified value",
00067 "pf", CMDINTTYPE|CMDMSG, &prunefreq, "prune words with frequency below the specified value",
00068 "PruneRank", CMDINTTYPE|CMDMSG, &prunerank, "prune words with frequency rank above the specified value",
00069 "pr", CMDINTTYPE|CMDMSG, &prunerank, "prune words with frequency rank above the specified value",
00070
00071 "Curve", CMDBOOLTYPE|CMDMSG, &curveflag,"show dictionary growth curve; default is false",
00072 "c", CMDBOOLTYPE|CMDMSG, &curveflag,"show dictionary growth curve; default is false",
00073 "CurveSize", CMDINTTYPE|CMDMSG, &curvesize, "default 10",
00074 "cs", CMDINTTYPE|CMDMSG, &curvesize, "default 10",
00075
00076 "TestFile", CMDSTRINGTYPE|CMDMSG, &testfile, "compute OOV rates on the specified test corpus",
00077 "t", CMDSTRINGTYPE|CMDMSG, &testfile, "compute OOV rates on the specified test corpus",
00078 "ListOOV", CMDBOOLTYPE|CMDMSG, &listflag, "print OOV words to stderr; default is false",
00079 "oov", CMDBOOLTYPE|CMDMSG, &listflag, "print OOV words to stderr; default is false",
00080
00081 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00082 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00083
00084 (char*)NULL
00085 );
00086
00087 if (argc == 1){
00088 usage();
00089 }
00090
00091 GetParams(&argc, &argv, (char*) NULL);
00092
00093 if (help){
00094 usage();
00095 }
00096
00097 if (inp==NULL) {
00098 usage("Warning: no input file specified");
00099 };
00100
00101
00102 if (curveflag && !freqflag)
00103 freqflag=1;
00104 if (testfile!=NULL && !freqflag) {
00105 freqflag=1;
00106 mfstream test(testfile,ios::in);
00107 if (!test) {
00108 usage(strcat((char*) "Warning: cannot open testfile: ", testfile));
00109 exit(1);
00110 }
00111 test.close();
00112
00113 }
00114
00115
00116 dictionary *d = new dictionary(inp,size,load_factor);
00117
00118
00119 if (prunefreq>0 || prunerank>0 || sortflag) {
00120 dictionary *sortd=new dictionary(d,false);
00121 sortd->sort();
00122 delete d;
00123 d=sortd;
00124 }
00125
00126
00127
00128 if (testfile != NULL)
00129 d->print_curve(curvesize, d->test(curvesize, testfile, listflag));
00130 else if (curveflag)
00131 d->print_curve(curvesize);
00132
00133
00134
00135 if (prunefreq>0 || prunerank>0) {
00136 cerr << "pruning dictionary prunefreq:" << prunefreq << " prunerank: " << prunerank <<" \n";
00137 int count=0;
00138 int bos=d->encode(d->BoS());
00139 int eos=d->encode(d->EoS());
00140
00141 for (int i=0; i< d->size() ; i++) {
00142 if (prunefreq && d->freq(i) <= prunefreq && i!=bos && i!=eos) {
00143 d->freq(i,0);
00144 continue;
00145 }
00146 if (prunerank>0 && count>=prunerank && i!=bos && i!=eos) {
00147 d->freq(i,0);
00148 continue;
00149 }
00150 count++;
00151 }
00152 }
00153
00154 if(out!=NULL) d->save(out,freqflag);
00155
00156 }
00157