00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 using namespace std;
00022
00023 #include <iostream>
00024 #include "cmd.h"
00025 #include "mfstream.h"
00026 #include "mempool.h"
00027 #include "htable.h"
00028 #include "dictionary.h"
00029 #include "n_gram.h"
00030 #include "ngramtable.h"
00031 #include "doc.h"
00032 #include "cplsa.h"
00033
00034 void print_help(int TypeFlag=0){
00035 std::cerr << std::endl << "plsa - performs probabilistic latent semantic analysis LM inference" << std::endl;
00036 std::cerr << std::endl << "USAGE:" << std::endl;
00037 std::cerr << " plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter> [options]" << std::endl;
00038 std::cerr << " plsa -c=<text_collection> -d=<dictionary> -b=<binary_collection> [options]" << std::endl;
00039 std::cerr << " plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations> [options]" << std::endl;
00040 std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00041 std::cerr << " plsa is a tool for probabilistic latent semantic analysis" << std::endl;
00042 std::cerr << " LM inference. It can be used to train a PLSA model, to binarize" << std::endl;
00043 std::cerr << " a textual document collection to speed-up training or to" << std::endl;
00044 std::cerr << " infer a full n-gram distribution from a model and a small text." << std::endl;
00045 std::cerr << std::endl << "OPTIONS:" << std::endl;
00046
00047
00048 FullPrintParams(TypeFlag, 0, 1, stderr);
00049
00050 std::cerr << std::endl << "EXAMPLES:" << std::endl;
00051 std::cerr <<" (1) plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter>" << std::endl;
00052 std::cerr <<" Train a PLSA model, <model>, from the text collection" << std::endl;
00053 std::cerr <<" <text_collection> using the dictionary <dictionary>. The" << std::endl;
00054 std::cerr <<" number of EM iterations is specified by <iter> and the" << std::endl;
00055 std::cerr <<" number of topics is specified by <topics>." << std::endl;
00056 std::cerr <<" The <text_collection> content must begin with the number of" << std::endl;
00057 std::cerr <<" documents and documents should be separated with the </d> tag." << std::endl;
00058 std::cerr <<" The begin document tag <d> is not considered." << std::endl;
00059 std::cerr <<" Example of <text_collection> content:" << std::endl;
00060 std::cerr <<" 3" << std::endl;
00061 std::cerr <<" <d> hello world ! </d>" << std::endl;
00062 std::cerr <<" <d> good morning good afternoon </d>" << std::endl;
00063 std::cerr <<" <d> welcome aboard </d>" << std::endl;
00064 std::cerr <<" (2) plsa -c=<text_collection> -d=<dictionary> -b=<binary collection>" << std::endl;
00065 std::cerr <<" Binarize a textual document collection to speed-up training (1)" << std::endl;
00066 std::cerr <<" (3) plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations>" << std::endl;
00067 std::cerr <<" Infer a full 1-gram distribution from a model and a small" << std::endl;
00068 std::cerr <<" text. The 1-gram is saved in the feature file. The 1-gram" << std::endl;
00069 std::cerr << std::endl;
00070 }
00071
00072 void usage(const char *msg = 0)
00073 {
00074 if (msg){
00075 std::cerr << msg << std::endl;
00076 }
00077 else{
00078 print_help();
00079 }
00080 exit(1);
00081 }
00082
00083 int main(int argc, char **argv)
00084 {
00085 char *dictfile=NULL;
00086 char *trainfile=NULL;
00087 char *adafile=NULL;
00088 char *featurefile=NULL;
00089 char *basefile=NULL;
00090 char *hfile=NULL;
00091 char *tmphfile=NULL;
00092 char *tfile=NULL;
00093 char *wfile=NULL;
00094 char *ctfile=NULL;
00095 char *txtfile=NULL;
00096 char *binfile=NULL;
00097
00098 int binsize=0;
00099 int topics=0;
00100 int st=0;
00101 int it=0;
00102 bool help=false;
00103
00104 DeclareParams((char*)
00105
00106 "Dictionary", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary file",
00107 "d", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary file",
00108
00109 "Binary", CMDSTRINGTYPE|CMDMSG, &binfile, "binary file",
00110 "b", CMDSTRINGTYPE|CMDMSG, &binfile, "binary file",
00111
00112 "SplitData", CMDINTTYPE|CMDMSG, &binsize, "size of binary file; default is unlimited",
00113 "sd", CMDINTTYPE|CMDMSG, &binsize, "size of binary file; default is unlimited",
00114
00115 "Collection", CMDSTRINGTYPE|CMDMSG, &trainfile, "text collection file",
00116 "c", CMDSTRINGTYPE|CMDMSG, &trainfile, "text collection file",
00117
00118 "Model", CMDSTRINGTYPE|CMDMSG, &basefile, "model file",
00119 "m", CMDSTRINGTYPE|CMDMSG, &basefile, "model file",
00120
00121 "HFile", CMDSTRINGTYPE, &tmphfile,
00122 "hf", CMDSTRINGTYPE, &tmphfile,
00123
00124 "WFile", CMDSTRINGTYPE, &wfile,
00125 "wf", CMDSTRINGTYPE, &wfile,
00126
00127 "TFile", CMDSTRINGTYPE, &tfile,
00128 "tf", CMDSTRINGTYPE, &tfile,
00129
00130 "CombineTFile", CMDSTRINGTYPE, &ctfile,
00131 "ct", CMDSTRINGTYPE, &ctfile,
00132
00133 "TxtFile", CMDSTRINGTYPE, &txtfile,
00134 "txt", CMDSTRINGTYPE, &txtfile,
00135
00136 "Inference", CMDSTRINGTYPE, &adafile,
00137 "inf", CMDSTRINGTYPE, &adafile,
00138
00139 "Features", CMDSTRINGTYPE, &featurefile,
00140 "f", CMDSTRINGTYPE, &featurefile,
00141
00142 "Topics", CMDINTTYPE|CMDMSG, &topics, "number of topics; default is 0",
00143 "t", CMDINTTYPE|CMDMSG, &topics,"number of topics; default is 0",
00144
00145 "SpecialTopic", CMDINTTYPE|CMDMSG, &st, "special topic: first dictionary words; default is 0",
00146 "st", CMDINTTYPE|CMDMSG, &st, "special topic: first dictionary words; default is 0",
00147
00148 "Iterations", CMDINTTYPE|CMDMSG, &it, "number of EM iterations; default is 0",
00149 "it", CMDINTTYPE|CMDMSG, &it, "number of EM iterations; default is 0",
00150
00151 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00152 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00153
00154 (char *)NULL
00155 );
00156
00157 if (argc == 1){
00158 usage();
00159 }
00160
00161 GetParams(&argc, &argv, (char*) NULL);
00162
00163 if (help){
00164 usage();
00165 }
00166
00167 if (!dictfile) {
00168 usage("Missing parameters dictionary");
00169 };
00170
00171 if (!adafile & (!trainfile || !binfile) && (!trainfile || !it || !topics || !basefile)) {
00172 usage("Missing parameters for training");
00173 }
00174
00175 if ((!trainfile && basefile) && (!featurefile || !adafile || !it || !topics)) {
00176 usage("Missing parameters for adapting");
00177 }
00178
00179 if ((adafile) && (!featurefile)) {
00180 usage("Missing parameters for adapting 2");
00181 }
00182
00183 if (!tmphfile) {
00184
00185 hfile=new char[4+1];
00186 strcpy(hfile,"hfff");
00187 } else {
00188
00189 hfile=new char[strlen(tmphfile)+1];
00190 strcpy(hfile,tmphfile);
00191 }
00192
00193 dictionary dict(dictfile);
00194
00195 cout << dict.size() << "\n";
00196 dict.incflag(1);
00197 dict.encode(dict.BoD());
00198 dict.encode(dict.EoD());
00199 dict.incflag(0);
00200 if (dict.oovcode()==-1) {
00201 dict.oovcode(dict.encode(dict.OOV()));
00202 }
00203
00204 cout << dict.size() << "\n";
00205
00206 if (binfile) {
00207 cout << "opening collection\n";
00208 doc col(&dict,trainfile);
00209 col.open();
00210 if (binsize)
00211 col.save(binfile,binsize);
00212 else
00213 col.save(binfile);
00214 exit(1);
00215 }
00216
00217 system("rm -f hfff");
00218
00219 plsa tc(&dict,topics,basefile,featurefile,hfile,wfile,tfile);
00220
00221 if (ctfile) {
00222 tc.combineT(ctfile);
00223 tc.saveW(basefile);
00224 exit(1);
00225 }
00226
00227 if (trainfile) {
00228 tc.train(trainfile,it,.5,1,0.5,st);
00229 if (txtfile) tc.saveWtxt(txtfile);
00230 }
00231
00232 if (adafile) {
00233 tc.loadW(basefile);
00234 tc.train(adafile,it,.0);
00235 }
00236 if (strcmp(hfile,"hfff")==0) system("rm -f hfff");
00237 delete hfile;
00238
00239 exit(1);
00240 }
00241
00242
00243