00001
00002
00003
00004
00005
00006
00007 #include <boost/algorithm/string/predicate.hpp>
00008 #include <boost/program_options.hpp>
00009 #include <boost/program_options/options_description.hpp>
00010 #include <boost/program_options/parsers.hpp>
00011 #include <boost/program_options/variables_map.hpp>
00012 #include <boost/iostreams/device/mapped_file.hpp>
00013
00014 #include <iostream>
00015 #include <fstream>
00016 #include <sstream>
00017 #include <iomanip>
00018 #include <vector>
00019 #include <string>
00020
00021 #include <sys/types.h>
00022 #include <sys/wait.h>
00023
00024 #include "ug_conll_record.h"
00025 #include "tpt_tokenindex.h"
00026 #include "ug_mm_ttrack.h"
00027 #include "tpt_pickler.h"
00028 #include "ug_deptree.h"
00029 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
00030 #include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
00031
00032 using namespace std;
00033 using namespace sapt;
00034 using namespace Moses;
00035 using namespace boost;
00036 using namespace boost::algorithm;
00037 namespace po=boost::program_options;
00038
// Configuration globals, set by interpret_args() from the command line:
int with_pfas;   // -p: also build prefix arrays
int with_dcas;   // -d: also build dependency chain arrays
int with_sfas;   // -s: also build suffix arrays (default on)

bool incremental = false; // -i: add unseen words to the vocabularies
bool is_conll = false;    // -c: input is CoNLL format, not plain text
bool quiet = false;       // -q: suppress progress output on stderr

string vocabBase;         // -v: base name of existing vocabulary files
string baseName;          // -o: base name of the resulting output file(s)
string tmpFile, mttFile;  // work file and final name (tmpFile = mttFile + "_")


string UNK;               // -u: label used for unknown tokens (id 1)

// Vocabularies (token <-> id maps), one per indexed factor:
TokenIndex SF;  // surface forms
TokenIndex LM;  // lemmas
TokenIndex PS;  // part-of-speech tags
TokenIndex DT;  // dependency relation labels

void interpret_args(int ac, char* av[]);
00060
00061 inline uchar rangeCheck(int p, int limit) { return p < limit ? p : 1; }
00062
00063 id_type
00064 get_id(TokenIndex const& T, string const& w)
00065 {
00066 id_type ret = T[w];
00067 if (ret == 1 && w != UNK)
00068 {
00069 cerr << "Warning! Unkown vocabulary item '" << w << "', but "
00070 << "incremental mode (-i) is not set." << endl;
00071 assert(0);
00072 }
00073 return ret;
00074 }
00075
// Open the vocabulary file /fname/ into /T/ if it exists on disk;
// otherwise start an empty vocabulary that only knows NULL (id 0) and
// UNK (id 1).  In incremental mode the index is made dynamic so that
// unseen words get fresh ids instead of mapping to UNK.
void
open_vocab(TokenIndex& T, string fname)
{
  if (!access(fname.c_str(), F_OK)) // access() == 0 <=> file exists
    {
      T.open(fname,UNK);
      assert(T[UNK] == 1);
    }
  else T.setUnkLabel(UNK);
  if (incremental) T.setDynamic(true);
  // invariants relied on throughout: NULL maps to 0, UNK maps to 1
  assert(T["NULL"] == 0);
  assert(T[UNK] == 1);
}
00089
// Initialize /v/ with one (token string, count) pair per id in /T/,
// all counts starting at zero.
void
ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
{
  v.resize(T.totalVocabSize());
  for (size_t i = 0; i < T.totalVocabSize(); ++i)
    {
      v[i].first = T[i]; // token string for id i
      v[i].second = 0;
    }
}
00100
00101 void
00102 write_tokenindex(string fname, TokenIndex& T, vector<id_type> const& n2o)
00103 {
00104 if (!quiet) cerr << "Writing " << fname << endl;
00105 vector<id_type> o2n(n2o.size());
00106 for (id_type i = 0; i < n2o.size(); ++i) o2n[n2o[i]] = i;
00107 vector<pair<string,uint32_t> > v(n2o.size());
00108 for (id_type i = 0; i < n2o.size(); ++i)
00109 {
00110 v[i].first = T[n2o[i]];
00111 v[i].second = i;
00112 }
00113 T.close();
00114 sort(v.begin(),v.end());
00115 write_tokenindex_to_disk(v, fname, UNK);
00116 }
00117
// Parse command-line arguments, then open the vocabulary file(s):
// four factor-specific vocabularies for CoNLL input, a single surface
// form vocabulary for plain text.
void init(int argc, char* argv[])
{
  interpret_args(argc,argv);
  if (is_conll)
    {
      open_vocab(SF, vocabBase+".tdx.sfo"); // surface forms
      open_vocab(LM, vocabBase+".tdx.lem"); // lemmas
      open_vocab(PS, vocabBase+".tdx.pos"); // POS tags
      open_vocab(DT, vocabBase+".tdx.drl"); // dependency relations
    }
  else open_vocab(SF, vocabBase+".tdx");
}
00130
// Fill the record /rec/ from the whitespace-split input line /w/.
// Two layouts are handled:
// - exactly 3 columns: surface form, POS tag, lemma; a lemma of
//   "<UNKNOWN>" falls back to the surface form, and the single POS
//   column feeds both majpos and minpos.
// - 8 or more columns: CoNLL-style — presumably id, form, lemma,
//   cpos, pos, feats, head, deprel (TODO(review): confirm against the
//   corpus preprocessing); the parent is stored as a relative offset
//   (head - id), or 0 for the root.
// Lines with fewer than 3 columns leave /rec/ untouched.
void fill_rec(Conll_Record& rec, vector<string> const& w)
{
  if (w.size() == 3)
    {
      rec.sform = get_id(SF, w[0]);
      rec.lemma = get_id(LM, w[2] == "<UNKNOWN>" ? w[0] : w[2]);
      rec.majpos = rangeCheck(get_id(PS, w[1]), 256); // byte-sized field
      rec.minpos = rangeCheck(get_id(PS, w[1]), 256);
      rec.dtype = 0;
      rec.parent = -1;
    }
  else if (w.size() >= 8)
    {
      int id = atoi(w[0].c_str());
      int gov = atoi(w[6].c_str());
      rec.sform = get_id(SF, w[1]);
      rec.lemma = get_id(LM, w[2]);
      rec.majpos = rangeCheck(get_id(PS, w[3]), 256); // byte-sized fields
      rec.minpos = rangeCheck(get_id(PS, w[4]), 256);
      rec.dtype = get_id(DT, w[7]);
      rec.parent = gov ? gov - id : 0; // relative offset to head; 0 = root
    }
}
00154
00155 void log_progress(size_t ctr)
00156 {
00157 if (ctr % 100000 == 0)
00158 {
00159 if (ctr) cerr << endl;
00160 cerr << setw(12) << ctr / 1000 << "K sentences processed ";
00161 }
00162 else if (ctr % 10000 == 0)
00163 {
00164 cerr << ".";
00165 }
00166 }
00167
00168
// Read plain text from stdin (one sentence per line, whitespace
// tokenized) and append the numberized token ids to /out/.
// /s_index/ receives the word offset of each sentence start, plus a
// final sentinel entry (the total word count).
// Returns the total number of words written.
size_t
process_plain_input(ostream& out, vector<id_type> & s_index)
{
  id_type totalWords = 0;
  string line,w;
  while (getline(cin,line))
    {
      istringstream buf(line);
      if (!quiet) log_progress(s_index.size());
      s_index.push_back(totalWords);
      while (buf>>w)
        {
          tpt::numwrite(out,get_id(SF,w));
          ++totalWords;
        }
    }
  s_index.push_back(totalWords); // sentinel: one-past-the-end offset
  return totalWords;
}
00188
// Read tagged/CoNLL-style input from stdin and write one binary
// Conll_Record per token to /out/.  A blank line or a line starting
// with "SID=" marks a sentence boundary; a lone "<P>" token marks a
// paragraph (and sentence) boundary.  /s_index/ and /p_index/ receive
// the word offsets of sentence resp. paragraph starts; /s_index/ also
// gets a final sentinel entry (the total word count).
// Returns the total number of words written.
size_t
process_tagged_input(ostream& out,
                     vector<id_type> & s_index,
                     vector<id_type> & p_index)
{
  string line;
  Conll_Record rec;
  bool new_sent = true;
  bool new_par = true;
  id_type totalWords = 0;

  while (getline(cin,line))
    {
      vector<string> w; string f; istringstream buf(line);
      while (buf>>f) w.push_back(f);

      if (w.size() == 0 || starts_with(w[0], "SID="))
        new_sent = true;

      else if (w.size() == 1 && w[0] == "<P>")
        new_par = new_sent = true;

      // marker / short lines carry no token of their own; the boundary
      // flags stay raised until the next real token line
      if (w.size() < 3) continue;
      if (!quiet && new_sent) log_progress(s_index.size());
      if (new_sent) { s_index.push_back(totalWords); new_sent = false; }
      if (new_par) { p_index.push_back(totalWords); new_par = false; }
      fill_rec(rec,w);
      out.write(reinterpret_cast<char const*>(&rec),sizeof(rec));
      ++totalWords;
    }
  s_index.push_back(totalWords); // sentinel: one-past-the-end offset
  return totalWords;
}
00222
// Numberize the corpus from stdin into tmpFile.  File layout:
//   [header: index offset, chunk count, total word count]
//   [token data] [chunk index].
// The header is written first with placeholder values and patched in
// place (seekp(0)) once the real values are known.
// Returns the total word count.
size_t
numberize()
{
  ofstream out(tmpFile.c_str());
  filepos_type startIdx=0;
  id_type idxSize=0,totalWords=0;
  // reserve header space; real values are written at the end
  tpt::numwrite(out,startIdx);
  tpt::numwrite(out,idxSize);
  tpt::numwrite(out,totalWords);

  vector<id_type> s_index, p_index;

  if(is_conll)
    totalWords = process_tagged_input(out,s_index,p_index);
  else
    totalWords = process_plain_input(out,s_index);

  // Chunk by paragraphs if the input actually contained paragraph
  // breaks past offset 0 (a lone [0] entry means "one big paragraph");
  // otherwise chunk by sentences.
  vector<id_type> const* index = &s_index;
  if (p_index.size() && p_index.back())
    {
      p_index.push_back(totalWords); // sentinel for the paragraph index
      index = &p_index;
    }

  if (!quiet)
    cerr << endl << "Writing index ... (" << index->size() << " chunks) ";

  startIdx = out.tellp();
  for (size_t i = 0; i < index->size(); i++)
    tpt::numwrite(out,(*index)[i]);
  out.seekp(0);
  idxSize = index->size();
  tpt::numwrite(out, startIdx);
  tpt::numwrite(out, idxSize - 1); // chunk count excludes the sentinel entry
  tpt::numwrite(out, totalWords);
  out.close();
  if (!quiet) cerr << "done" << endl;
  return totalWords;
}
00262
// new-id -> old-id permutations computed by remap() for surface forms,
// lemmas, POS tags, and dependency relations, respectively; consumed
// by save_vocabs().
vector<id_type> smap,lmap,pmap,dmap;
00264
00265 void
00266 invert(vector<id_type> const& from, vector<id_type> & to)
00267 {
00268 to.resize(from.size());
00269 for (size_t i = 0 ; i < to.size(); ++i)
00270 to[from[i]] = i;
00271 }
00272
00273
00274
// Compute a new-id -> old-id map /xmap/ for vocabulary /V/: ids already
// known on disk (below knownVocabSize) keep their positions — as do ids
// 0 and 1 (NULL / UNK) — and only the newly added tail is reordered by
// descending count /cnt/, so frequent new tokens get the smallest new
// ids.  Hence "conservative": existing vocab files stay valid.
void
conservative_sort(TokenIndex const & V,
                  vector<size_t> const & cnt,
                  vector<id_type> & xmap)
{
  xmap.resize(V.totalVocabSize());
  for (size_t i = 0; i < xmap.size(); ++i) xmap[i] = i; // identity map
  VectorIndexSorter<size_t,greater<size_t>, id_type> sorter(cnt);
  // sort only the tail of previously unknown ids, by descending count
  sort(xmap.begin()+max(id_type(2),V.knownVocabSize()), xmap.end(), sorter);
}
00285
00286
00287
// Renumber the token ids stored in tmpFile in place, through a memory
// mapping (boost mapped_file; presumably opened read/write by default —
// TODO(review): confirm), so that vocabulary items unknown to the
// on-disk vocabularies get frequency-sorted ids appended after the
// known ones.  Fills the global new-id -> old-id maps
// (smap/lmap/pmap/dmap) used later by save_vocabs().
void remap()
{
  if (!quiet) cerr << "Remapping ids ... ";
  filepos_type idxOffset;
  id_type totalWords, idxSize;
  boost::iostreams::mapped_file mtt(tmpFile);
  char const* p = mtt.data();
  // read back the header written by numberize()
  p = tpt::numread(p,idxOffset);
  p = tpt::numread(p,idxSize);
  p = tpt::numread(p,totalWords);
  if (is_conll)
    {
      // count occurrences of each id, per factor
      vector<size_t> sf(SF.totalVocabSize(), 0);
      vector<size_t> lm(LM.totalVocabSize(), 0);
      vector<size_t> ps(PS.totalVocabSize(), 0);
      vector<size_t> dt(DT.totalVocabSize(), 0);
      Conll_Record* w = reinterpret_cast<Conll_Record*>(const_cast<char*>(p));
      for (size_t i = 0; i < totalWords; ++i)
        {
          ++sf.at(w[i].sform);
          ++lm.at(w[i].lemma);
          ++ps.at(w[i].majpos);
          ++ps.at(w[i].minpos);
          ++dt.at(w[i].dtype);
        }
      // new-id -> old-id maps, keeping ids of known vocab fixed
      conservative_sort(SF,sf,smap);
      conservative_sort(LM,lm,lmap);
      conservative_sort(PS,ps,pmap);
      conservative_sort(DT,dt,dmap);
      // invert to old-id -> new-id, then rewrite each record in place
      vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
      vector<id_type> lmap_i(lmap.size()); invert(lmap,lmap_i);
      vector<id_type> pmap_i(pmap.size()); invert(pmap,pmap_i);
      vector<id_type> dmap_i(dmap.size()); invert(dmap,dmap_i);
      for (size_t i = 0; i < totalWords; ++i)
        {
          w[i].sform = smap_i[w[i].sform];
          w[i].lemma = lmap_i[w[i].lemma];
          w[i].majpos = pmap_i[w[i].majpos];
          w[i].minpos = pmap_i[w[i].minpos];
          w[i].dtype = dmap_i[w[i].dtype];
        }
    }
  else
    {
      // plain text: a single id stream, remapped via SF only
      vector<size_t> sf(SF.totalVocabSize(), 0);
      id_type* w = reinterpret_cast<id_type*>(const_cast<char*>(p));
      for (size_t i = 0; i < totalWords; ++i) ++sf.at(w[i]);
      conservative_sort(SF,sf,smap);
      vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
      for (size_t i = 0; i < totalWords; ++i) w[i] = smap_i[w[i]];
    }
  mtt.close();
  if (!quiet) cerr << "done." << endl;
}
00342
00343 void save_vocabs()
00344 {
00345 string vbase = baseName;
00346 if (is_conll)
00347 {
00348 if (SF.totalVocabSize() > SF.knownVocabSize())
00349 write_tokenindex(vbase+".tdx.sfo",SF,smap);
00350 if (LM.totalVocabSize() > LM.knownVocabSize())
00351 write_tokenindex(vbase+".tdx.lem",LM,lmap);
00352 if (PS.totalVocabSize() > PS.knownVocabSize())
00353 write_tokenindex(vbase+".tdx.pos",PS,pmap);
00354 if (DT.totalVocabSize() > DT.knownVocabSize())
00355 write_tokenindex(vbase+".tdx.drl",DT,dmap);
00356 }
00357 else if (SF.totalVocabSize() > SF.knownVocabSize())
00358 write_tokenindex(vbase+".tdx",SF,smap);
00359 }
00360
// Build an in-memory token sequence array over the token track stored
// in /infile/ and save it in memory-mapped form to /outfile/.  The
// Token type determines the traversal order (e.g. L2R_Token for suffix
// arrays, R2L_Token for prefix arrays).  All sentences are included
// (the filter bitset is set to all-true).
template<typename Token>
void
build_mmTSA(string infile, string outfile)
{


  boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
  bdBitset filter;
  filter.resize(T->size(),true); // include every sentence
  imTSA<Token> S(T,&filter,(quiet?NULL:&cerr)); // progress to cerr unless -q
  S.save_as_mm_tsa(outfile);

}
00374
00375 bool
00376 build_plaintext_tsas()
00377 {
00378 typedef L2R_Token<SimpleWordId> L2R;
00379 typedef R2L_Token<SimpleWordId> R2L;
00380
00381 if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa");
00382 if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa");
00383
00384 return true;
00385 }
00386
00387 void build_conll_tsas()
00388 {
00389 string bn = baseName;
00390 string mtt = tmpFile;
00391 size_t c = 3 * (with_sfas + with_pfas + with_dcas);
00392 if (with_sfas)
00393 {
00394 build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform");
00395 build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma");
00396 build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos");
00397 }
00398
00399 if (with_pfas)
00400 {
00401 build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform");
00402 build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma");
00403 build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos");
00404 }
00405
00406 if (with_dcas)
00407 {
00408 build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
00409 build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
00410 build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos");
00411 }
00412
00413 }
00414
00415
00416 int main(int argc, char* argv[])
00417 {
00418 init(argc,argv);
00419 numberize();
00420 if (SF.totalVocabSize() > SF.knownVocabSize() ||
00421 LM.totalVocabSize() > LM.knownVocabSize() ||
00422 PS.totalVocabSize() > PS.knownVocabSize() ||
00423 DT.totalVocabSize() > DT.knownVocabSize())
00424 {
00425 remap();
00426 save_vocabs();
00427 }
00428 if (is_conll) build_conll_tsas();
00429 else build_plaintext_tsas();
00430 if (!quiet) cerr << endl;
00431 rename(tmpFile.c_str(),mttFile.c_str());
00432 }
00433
// Parse command-line options into the global configuration variables.
// The single positional argument is the output base name (-o).  Prints
// the usage message and exits(0) when --help is given or no output
// name is provided.
void
interpret_args(int ac, char* av[])
{
  po::variables_map vm;
  po::options_description o("Options");
  o.add_options()

    ("help,h", "print this message")

    ("quiet,q", po::bool_switch(&quiet),
     "don't print progress information")

    ("incremental,i", po::bool_switch(&incremental),
     "incremental mode; rewrites vocab files!")

    ("vocab-base,v", po::value<string>(&vocabBase),
     "base name of various vocabularies")

    ("output,o", po::value<string>(&baseName),
     "base file name of the resulting file(s)")

    ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
     "also build suffix arrays")

    ("pfa,p", po::value<int>(&with_pfas)
     ->default_value(0)->implicit_value(1),
     "also build prefix arrays")

    ("dca,d", po::value<int>(&with_dcas)
     ->default_value(0)->implicit_value(1),
     "also build dependency chain arrays")

    ("conll,c", po::bool_switch(&is_conll),
     "corpus is in CoNLL format (default: plain text)")

    ("unk,u", po::value<string>(&UNK)->default_value("UNK"),
     "label for unknown tokens")



    ;

  // hidden options (currently none) are parsed like visible ones but
  // excluded from the help text printed below
  po::options_description h("Hidden Options");
  h.add_options()
    ;
  h.add(o);
  // first positional argument = output base name
  po::positional_options_description a;
  a.add("output",1);

  po::store(po::command_line_parser(ac,av)
            .options(h)
            .positional(a)
            .run(),vm);
  po::notify(vm);
  if (vm.count("help") || !vm.count("output"))
    {
      cout << "\nusage:\n\t cat <corpus> | " << av[0]
           << " [options] <output .mtt file>" << endl;
      cout << o << endl;
      exit(0);
    }
  // .mtt = CoNLL track, .mct = plain-text track; all work happens in
  // tmpFile, which main() renames to mttFile on success
  mttFile = baseName + (is_conll ? ".mtt" : ".mct");
  tmpFile = mttFile + "_";
}