Moses: /disk4/html/www/moses/doxygen/irstlm/trunk/src/plsa.cpp Source File

00001 /******************************************************************************
00002  IrstLM: IRST Language Model Toolkit, compile LM
00003  Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
00004  
00005  This library is free software; you can redistribute it and/or
00006  modify it under the terms of the GNU Lesser General Public
00007  License as published by the Free Software Foundation; either
00008  version 2.1 of the License, or (at your option) any later version.
00009  
00010  This library is distributed in the hope that it will be useful,
00011  but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  Lesser General Public License for more details.
00014  
00015  You should have received a copy of the GNU Lesser General Public
00016  License along with this library; if not, write to the Free Software
00017  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
00018  
00019  ******************************************************************************/
00020 
00021 using namespace std;
00022 
00023 #include <iostream>
00024 #include "cmd.h"
00025 #include "mfstream.h"
00026 #include "mempool.h"
00027 #include "htable.h"
00028 #include "dictionary.h"
00029 #include "n_gram.h"
00030 #include "ngramtable.h"
00031 #include "doc.h"
00032 #include "cplsa.h"
00033 
00034 void print_help(int TypeFlag=0){
00035   std::cerr << std::endl << "plsa - performs probabilistic latent semantic analysis LM inference" << std::endl;
00036   std::cerr << std::endl << "USAGE:"  << std::endl;
00037         std::cerr << "       plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter> [options]" << std::endl;
00038         std::cerr << "       plsa -c=<text_collection> -d=<dictionary> -b=<binary_collection> [options]" << std::endl;
00039         std::cerr << "       plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations> [options]" << std::endl;
00040   std::cerr << std::endl << "DESCRIPTION:" << std::endl;
00041   std::cerr << "       plsa is a tool for probabilistic latent semantic analysis" << std::endl;
00042   std::cerr << "       LM inference. It can be used to train a PLSA model, to binarize" << std::endl;
00043   std::cerr << "       a textual document collection to speed-up training or to" << std::endl;
00044   std::cerr << "       infer a full n-gram distribution from a model and a small text." << std::endl;
00045   std::cerr << std::endl << "OPTIONS:" << std::endl;
00046         
00047         
00048         FullPrintParams(TypeFlag, 0, 1, stderr);
00049         
00050   std::cerr << std::endl << "EXAMPLES:" << std::endl;
00051   std::cerr <<"       (1) plsa -c=<text_collection> -d=<dictionary> -m=<model> -t=<topics> -it=<iter>" << std::endl;
00052   std::cerr <<"           Train a PLSA model, <model>, from the text collection" << std::endl;
00053   std::cerr <<"           <text_collection> using the dictionary <dictionary>. The" << std::endl;
00054   std::cerr <<"           number of EM iterations is specified by <iter> and the" << std::endl;
00055   std::cerr <<"           number of topics is specified by <topics>." << std::endl;
00056   std::cerr <<"           The <text_collection> content must begin with the number of" << std::endl;
00057   std::cerr <<"           documents and documents should be separated with the </d> tag." << std::endl;
00058   std::cerr <<"           The begin document tag <d> is not considered." << std::endl;
00059   std::cerr <<"           Example of <text_collection> content:" << std::endl;
00060   std::cerr <<"           3" << std::endl;
00061   std::cerr <<"           <d> hello world ! </d>" << std::endl;
00062   std::cerr <<"           <d> good morning good afternoon </d>" << std::endl;
00063   std::cerr <<"           <d> welcome aboard </d>" << std::endl;
00064   std::cerr <<"       (2) plsa -c=<text_collection> -d=<dictionary> -b=<binary collection>" << std::endl;
00065   std::cerr <<"           Binarize a textual document collection to speed-up training (1)" << std::endl;
00066   std::cerr <<"       (3) plsa -d=<dictionary> -m=<model> -t=<topics> -inf=<text> -f=<features> -it=<iterations>" << std::endl;
00067   std::cerr <<"           Infer a full 1-gram distribution from a model and a small" << std::endl;
00068   std::cerr <<"           text. The 1-gram is saved in the feature file. The 1-gram" << std::endl;
00069   std::cerr << std::endl;
00070 }
00071 
00072 void usage(const char *msg = 0)
00073 {
00074   if (msg){
00075     std::cerr << msg << std::endl;
00076         }
00077   else{
00078                 print_help();
00079         }
00080         exit(1);
00081 }
00082 
00083 int main(int argc, char **argv)
00084 {
00085   char *dictfile=NULL;
00086   char *trainfile=NULL;
00087   char *adafile=NULL;
00088   char *featurefile=NULL;
00089   char *basefile=NULL;
00090   char *hfile=NULL;
00091   char *tmphfile=NULL;
00092   char *tfile=NULL;
00093   char *wfile=NULL;
00094   char *ctfile=NULL;
00095   char *txtfile=NULL;
00096   char *binfile=NULL;
00097         
00098   int binsize=0;
00099   int topics=0;  //number of topics
00100   int st=0;      //special topic: first st dict words
00101   int it=0;
00102   bool help=false;
00103         
00104   DeclareParams((char*)
00105                                                                 
00106                                                                 "Dictionary", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary file",
00107                                                                 "d", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary file",
00108                                                                 
00109                                                                 "Binary", CMDSTRINGTYPE|CMDMSG, &binfile, "binary file",
00110                                                                 "b", CMDSTRINGTYPE|CMDMSG, &binfile, "binary file",
00111                                                                 
00112                                                                 "SplitData", CMDINTTYPE|CMDMSG, &binsize, "size of binary file; default is unlimited",
00113                                                                 "sd", CMDINTTYPE|CMDMSG, &binsize, "size of binary file; default is unlimited",
00114                                                                 
00115                                                                 "Collection", CMDSTRINGTYPE|CMDMSG, &trainfile, "text collection file",
00116                                                                 "c", CMDSTRINGTYPE|CMDMSG, &trainfile, "text collection file",
00117                                                                 
00118                                                                 "Model", CMDSTRINGTYPE|CMDMSG, &basefile, "model file",
00119                                                                 "m", CMDSTRINGTYPE|CMDMSG, &basefile, "model file",
00120                                                                 
00121                                                                 "HFile", CMDSTRINGTYPE, &tmphfile,
00122                                                                 "hf", CMDSTRINGTYPE, &tmphfile,
00123                                                                 
00124                                                                 "WFile", CMDSTRINGTYPE, &wfile,
00125                                                                 "wf", CMDSTRINGTYPE, &wfile,
00126                                                                 
00127                                                                 "TFile", CMDSTRINGTYPE, &tfile,
00128                                                                 "tf", CMDSTRINGTYPE, &tfile,
00129                                                                 
00130                                                                 "CombineTFile", CMDSTRINGTYPE, &ctfile,
00131                                                                 "ct", CMDSTRINGTYPE, &ctfile,
00132                                                                 
00133                                                                 "TxtFile", CMDSTRINGTYPE, &txtfile,
00134                                                                 "txt", CMDSTRINGTYPE, &txtfile,
00135                                                                 
00136                                                                 "Inference", CMDSTRINGTYPE, &adafile,
00137                                                                 "inf", CMDSTRINGTYPE, &adafile,
00138                                                                 
00139                                                                 "Features", CMDSTRINGTYPE, &featurefile,
00140                                                                 "f", CMDSTRINGTYPE, &featurefile,
00141                                                                 
00142                                                                 "Topics", CMDINTTYPE|CMDMSG, &topics, "number of topics; default is 0",
00143                                                                 "t", CMDINTTYPE|CMDMSG, &topics,"number of topics; default is 0",
00144                                                                 
00145                                                                 "SpecialTopic", CMDINTTYPE|CMDMSG, &st, "special topic: first dictionary words; default is 0",
00146                                                                 "st", CMDINTTYPE|CMDMSG, &st, "special topic: first dictionary words; default is 0",
00147                                                                 
00148                                                                 "Iterations", CMDINTTYPE|CMDMSG, &it, "number of EM iterations; default is 0",
00149                                                                 "it", CMDINTTYPE|CMDMSG, &it, "number of EM iterations; default is 0",
00150                                                                 
00151                                                                 "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00152                                                                 "h", CMDBOOLTYPE|CMDMSG, &help, "print this help",
00153                                                                 
00154                                                                 (char *)NULL
00155                                                                 );
00156         
00157         if (argc == 1){
00158                 usage();
00159         }
00160         
00161   GetParams(&argc, &argv, (char*) NULL);
00162         
00163         if (help){
00164                 usage();
00165         }
00166         
00167   if (!dictfile) {
00168     usage("Missing parameters dictionary");
00169   };
00170         
00171   if (!adafile & (!trainfile || !binfile) && (!trainfile || !it || !topics || !basefile)) {
00172     usage("Missing parameters for training");
00173   }
00174         
00175   if ((!trainfile && basefile) && (!featurefile || !adafile || !it || !topics)) {
00176     usage("Missing parameters for adapting");
00177   }
00178         
00179   if ((adafile) && (!featurefile)) {
00180     usage("Missing parameters for adapting 2");
00181   }
00182         
00183   if (!tmphfile) {
00184     //set default value
00185     hfile=new char[4+1];
00186     strcpy(hfile,"hfff");
00187   } else {
00188     //set the value of the parameter
00189     hfile=new char[strlen(tmphfile)+1];
00190     strcpy(hfile,tmphfile);
00191   }
00192         
00193   dictionary dict(dictfile);
00194         
00195   cout << dict.size() << "\n";
00196   dict.incflag(1);
00197   dict.encode(dict.BoD());
00198   dict.encode(dict.EoD());
00199   dict.incflag(0);
00200   if (dict.oovcode()==-1) {
00201     dict.oovcode(dict.encode(dict.OOV()));
00202   }
00203         
00204   cout << dict.size() << "\n";
00205         
00206   if (binfile) {
00207     cout << "opening collection\n";
00208     doc col(&dict,trainfile);
00209     col.open();
00210     if (binsize)
00211       col.save(binfile,binsize);
00212     else
00213       col.save(binfile);
00214     exit(1);
00215   }
00216         
00217   system("rm -f hfff");
00218         
00219   plsa tc(&dict,topics,basefile,featurefile,hfile,wfile,tfile);
00220         
00221   if (ctfile) { //combine t
00222     tc.combineT(ctfile);
00223     tc.saveW(basefile);
00224     exit(1);
00225   }
00226         
00227   if (trainfile) {
00228     tc.train(trainfile,it,.5,1,0.5,st);
00229     if (txtfile) tc.saveWtxt(txtfile);
00230   }
00231         
00232   if (adafile) {
00233     tc.loadW(basefile);
00234     tc.train(adafile,it,.0);
00235   }
00236   if (strcmp(hfile,"hfff")==0)  system("rm -f hfff");
00237   delete hfile;
00238         
00239   exit(1);
00240 }
00241 
00242 
00243