00001
00002
00003
00004
00005
00006
00007 #include <boost/algorithm/string/predicate.hpp>
00008 #include <boost/program_options.hpp>
00009 #include <boost/program_options/options_description.hpp>
00010 #include <boost/program_options/parsers.hpp>
00011 #include <boost/program_options/variables_map.hpp>
00012 #include <boost/iostreams/device/mapped_file.hpp>
00013
00014 #include <iostream>
00015 #include <fstream>
00016 #include <sstream>
00017 #include <iomanip>
00018 #include <vector>
00019 #include <string>
00020
00021 #include <sys/types.h>
00022 #include <sys/wait.h>
00023
00024 #include "ug_conll_record.h"
00025 #include "tpt_tokenindex.h"
00026 #include "ug_mm_ttrack.h"
00027 #include "tpt_pickler.h"
00028 #include "ug_deptree.h"
00029 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
00030 #include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
00031
00032 using namespace std;
00033 using namespace sapt;
00034 using namespace Moses;
00035 using namespace boost;
00036 using namespace boost::algorithm;
00037 namespace po=boost::program_options;
00038
// Configuration globals, set by interpret_args() from the command line:
int with_pfas;   // -p: also build prefix arrays
int with_dcas;   // -d: also build dependency chain arrays
int with_sfas;   // -s: also build suffix arrays (default on)

bool incremental = false; // -i: add unseen words to the vocabularies
bool is_conll = false;    // -c: input is CoNLL format, not plain text
bool quiet = false;       // -q: suppress progress output on stderr

string vocabBase;         // -v: base name of existing vocabulary files
string baseName;          // -o: base name of the resulting output file(s)
string tmpFile, mttFile;  // work file and final name (tmpFile = mttFile + "_")


string UNK;               // -u: label used for unknown tokens (id 1)

// Vocabularies (token <-> id maps), one per indexed factor:
TokenIndex SF;  // surface forms
TokenIndex LM;  // lemmas
TokenIndex PS;  // part-of-speech tags
TokenIndex DT;  // dependency relation labels

void interpret_args(int ac, char* av[]);
00060
00061 inline uchar rangeCheck(int p, int limit) { return p < limit ? p : 1; }
00062
00063 id_type
00064 get_id(TokenIndex const& T, string const& w)
00065 {
00066 id_type ret = T[w];
00067 if (ret == 1 && w != UNK)
00068 {
00069 cerr << "Warning! Unkown vocabulary item '" << w << "', but "
00070 << "incremental mode (-i) is not set." << endl;
00071 assert(0);
00072 }
00073 return ret;
00074 }
00075
// Open the vocabulary file /fname/ into /T/ if it exists on disk;
// otherwise start an empty vocabulary that only knows NULL (id 0) and
// UNK (id 1).  In incremental mode the index is made dynamic so that
// unseen words get fresh ids instead of mapping to UNK.
void
open_vocab(TokenIndex& T, string fname)
{
  if (!access(fname.c_str(), F_OK)) // access() == 0 <=> file exists
    {
      T.open(fname,UNK);
      assert(T[UNK] == 1);
    }
  else T.setUnkLabel(UNK);
  if (incremental) T.setDynamic(true);
  // invariants relied on throughout: NULL maps to 0, UNK maps to 1
  assert(T["NULL"] == 0);
  assert(T[UNK] == 1);
}
00089
// Initialize /v/ with one (token string, count) pair per id in /T/,
// all counts starting at zero.
void
ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
{
  v.resize(T.totalVocabSize());
  for (size_t i = 0; i < T.totalVocabSize(); ++i)
    {
      v[i].first = T[i]; // token string for id i
      v[i].second = 0;
    }
}
00100
00101 void
00102 write_tokenindex(string fname, TokenIndex& T, vector<id_type> const& n2o)
00103 {
00104 if (!quiet) cerr << "Writing " << fname << endl;
00105 vector<id_type> o2n(n2o.size());
00106 for (id_type i = 0; i < n2o.size(); ++i) o2n[n2o[i]] = i;
00107 vector<pair<string,uint32_t> > v(n2o.size());
00108 for (id_type i = 0; i < n2o.size(); ++i)
00109 {
00110 v[i].first = T[n2o[i]];
00111 v[i].second = i;
00112 }
00113 T.close();
00114 sort(v.begin(),v.end());
00115 write_tokenindex_to_disk(v, fname, UNK);
00116 }
00117
// Parse command-line arguments, then open the vocabulary file(s):
// four factor-specific vocabularies for CoNLL input, a single surface
// form vocabulary for plain text.
void init(int argc, char* argv[])
{
  interpret_args(argc,argv);
  if (is_conll)
    {
      open_vocab(SF, vocabBase+".tdx.sfo"); // surface forms
      open_vocab(LM, vocabBase+".tdx.lem"); // lemmas
      open_vocab(PS, vocabBase+".tdx.pos"); // POS tags
      open_vocab(DT, vocabBase+".tdx.drl"); // dependency relations
    }
  else open_vocab(SF, vocabBase+".tdx");
}
00130
// Fill the record /rec/ from the whitespace-split input line /w/.
// Two layouts are handled:
// - exactly 3 columns: surface form, POS tag, lemma; a lemma of
//   "<UNKNOWN>" falls back to the surface form, and the single POS
//   column feeds both majpos and minpos.
// - 8 or more columns: CoNLL-style — presumably id, form, lemma,
//   cpos, pos, feats, head, deprel (TODO(review): confirm against the
//   corpus preprocessing); the parent is stored as a relative offset
//   (head - id), or 0 for the root.
// Lines with fewer than 3 columns leave /rec/ untouched.
void fill_rec(Conll_Record& rec, vector<string> const& w)
{
  if (w.size() == 3)
    {
      rec.sform = get_id(SF, w[0]);
      rec.lemma = get_id(LM, w[2] == "<UNKNOWN>" ? w[0] : w[2]);
      rec.majpos = rangeCheck(get_id(PS, w[1]), 256); // byte-sized field
      rec.minpos = rangeCheck(get_id(PS, w[1]), 256);
      rec.dtype = 0;
      rec.parent = -1;
    }
  else if (w.size() >= 8)
    {
      int id = atoi(w[0].c_str());
      int gov = atoi(w[6].c_str());
      rec.sform = get_id(SF, w[1]);
      rec.lemma = get_id(LM, w[2]);
      rec.majpos = rangeCheck(get_id(PS, w[3]), 256); // byte-sized fields
      rec.minpos = rangeCheck(get_id(PS, w[4]), 256);
      rec.dtype = get_id(DT, w[7]);
      rec.parent = gov ? gov - id : 0; // relative offset to head; 0 = root
    }
}
00154
00155 void log_progress(size_t ctr)
00156 {
00157 if (ctr % 100000 == 0)
00158 {
00159 if (ctr) cerr << endl;
00160 cerr << setw(12) << ctr / 1000 << "K sentences processed ";
00161 }
00162 else if (ctr % 10000 == 0)
00163 {
00164 cerr << ".";
00165 }
00166 }
00167
00168
// Read plain text from stdin (one sentence per line, whitespace
// tokenized) and append the numberized token ids to /out/.
// /s_index/ receives the word offset of each sentence start, plus a
// final sentinel entry (the total word count).
// Returns the total number of words written.
size_t
process_plain_input(ostream& out, vector<id_type> & s_index)
{
  id_type totalWords = 0;
  string line,w;
  while (getline(cin,line))
    {
      istringstream buf(line);
      if (!quiet) log_progress(s_index.size());
      s_index.push_back(totalWords);
      while (buf>>w)
        {
          tpt::numwrite(out,get_id(SF,w));
          ++totalWords;
        }
    }
  s_index.push_back(totalWords); // sentinel: one-past-the-end offset
  return totalWords;
}
00188
// Read tagged/CoNLL-style input from stdin and write one binary
// Conll_Record per token to /out/.  A blank line or a line starting
// with "SID=" marks a sentence boundary; a lone "<P>" token marks a
// paragraph (and sentence) boundary.  /s_index/ and /p_index/ receive
// the word offsets of sentence resp. paragraph starts; /s_index/ also
// gets a final sentinel entry (the total word count).
// Returns the total number of words written.
size_t
process_tagged_input(ostream& out,
                     vector<id_type> & s_index,
                     vector<id_type> & p_index)
{
  string line;
  Conll_Record rec;
  bool new_sent = true;
  bool new_par = true;
  id_type totalWords = 0;

  while (getline(cin,line))
    {
      vector<string> w; string f; istringstream buf(line);
      while (buf>>f) w.push_back(f);

      if (w.size() == 0 || starts_with(w[0], "SID="))
        new_sent = true;

      else if (w.size() == 1 && w[0] == "<P>")
        new_par = new_sent = true;

      // marker / short lines carry no token of their own; the boundary
      // flags stay raised until the next real token line
      if (w.size() < 3) continue;
      if (!quiet && new_sent) log_progress(s_index.size());
      if (new_sent) { s_index.push_back(totalWords); new_sent = false; }
      if (new_par) { p_index.push_back(totalWords); new_par = false; }
      fill_rec(rec,w);
      out.write(reinterpret_cast<char const*>(&rec),sizeof(rec));
      ++totalWords;
    }
  s_index.push_back(totalWords); // sentinel: one-past-the-end offset
  return totalWords;
}
00222
// Numberize the corpus from stdin into tmpFile.  File layout:
//   [header: index offset, chunk count, total word count]
//   [token data] [chunk index].
// The header is written first with placeholder values and patched in
// place (seekp(0)) once the real values are known.
// Returns the total word count.
size_t
numberize()
{
  ofstream out(tmpFile.c_str());
  filepos_type startIdx=0;
  id_type idxSize=0,totalWords=0;
  // reserve header space; real values are written at the end
  tpt::numwrite(out,startIdx);
  tpt::numwrite(out,idxSize);
  tpt::numwrite(out,totalWords);

  vector<id_type> s_index, p_index;

  if(is_conll)
    totalWords = process_tagged_input(out,s_index,p_index);
  else
    totalWords = process_plain_input(out,s_index);

  // Chunk by paragraphs if the input actually contained paragraph
  // breaks past offset 0 (a lone [0] entry means "one big paragraph");
  // otherwise chunk by sentences.
  vector<id_type> const* index = &s_index;
  if (p_index.size() && p_index.back())
    {
      p_index.push_back(totalWords); // sentinel for the paragraph index
      index = &p_index;
    }

  if (!quiet)
    cerr << endl << "Writing index ... (" << index->size() << " chunks) ";

  startIdx = out.tellp();
  for (size_t i = 0; i < index->size(); i++)
    tpt::numwrite(out,(*index)[i]);
  out.seekp(0);
  idxSize = index->size();
  tpt::numwrite(out, startIdx);
  tpt::numwrite(out, idxSize - 1); // chunk count excludes the sentinel entry
  tpt::numwrite(out, totalWords);
  out.close();
  if (!quiet) cerr << "done" << endl;
  return totalWords;
}
00262
// new-id -> old-id permutations computed by remap() for surface forms,
// lemmas, POS tags, and dependency relations, respectively; consumed
// by save_vocabs().
vector<id_type> smap,lmap,pmap,dmap;
00264
00265 void
00266 invert(vector<id_type> const& from, vector<id_type> & to)
00267 {
00268 to.resize(from.size());
00269 for (size_t i = 0 ; i < to.size(); ++i)
00270 to[from[i]] = i;
00271 }
00272
00273
00274
// Compute a new-id -> old-id map /xmap/ for vocabulary /V/: ids already
// known on disk (below knownVocabSize) keep their positions — as do ids
// 0 and 1 (NULL / UNK) — and only the newly added tail is reordered by
// descending count /cnt/, so frequent new tokens get the smallest new
// ids.  Hence "conservative": existing vocab files stay valid.
void
conservative_sort(TokenIndex const & V,
                  vector<size_t> const & cnt,
                  vector<id_type> & xmap)
{
  xmap.resize(V.totalVocabSize());
  for (size_t i = 0; i < xmap.size(); ++i) xmap[i] = i; // identity map
  VectorIndexSorter<size_t,greater<size_t>, id_type> sorter(cnt);
  // sort only the tail of previously unknown ids, by descending count
  sort(xmap.begin()+max(id_type(2),V.knownVocabSize()), xmap.end(), sorter);
}
00285
00286
00287
// Renumber the token ids stored in tmpFile in place, through a memory
// mapping (boost mapped_file; presumably opened read/write by default —
// TODO(review): confirm), so that vocabulary items unknown to the
// on-disk vocabularies get frequency-sorted ids appended after the
// known ones.  Fills the global new-id -> old-id maps
// (smap/lmap/pmap/dmap) used later by save_vocabs().
void remap()
{
  if (!quiet) cerr << "Remapping ids ... ";
  filepos_type idxOffset;
  id_type totalWords, idxSize;
  boost::iostreams::mapped_file mtt(tmpFile);
  char const* p = mtt.data();
  // read back the header written by numberize()
  p = tpt::numread(p,idxOffset);
  p = tpt::numread(p,idxSize);
  p = tpt::numread(p,totalWords);
  if (is_conll)
    {
      // count occurrences of each id, per factor
      vector<size_t> sf(SF.totalVocabSize(), 0);
      vector<size_t> lm(LM.totalVocabSize(), 0);
      vector<size_t> ps(PS.totalVocabSize(), 0);
      vector<size_t> dt(DT.totalVocabSize(), 0);
      Conll_Record* w = reinterpret_cast<Conll_Record*>(const_cast<char*>(p));
      for (size_t i = 0; i < totalWords; ++i)
        {
          ++sf.at(w[i].sform);
          ++lm.at(w[i].lemma);
          ++ps.at(w[i].majpos);
          ++ps.at(w[i].minpos);
          ++dt.at(w[i].dtype);
        }
      // new-id -> old-id maps, keeping ids of known vocab fixed
      conservative_sort(SF,sf,smap);
      conservative_sort(LM,lm,lmap);
      conservative_sort(PS,ps,pmap);
      conservative_sort(DT,dt,dmap);
      // invert to old-id -> new-id, then rewrite each record in place
      vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
      vector<id_type> lmap_i(lmap.size()); invert(lmap,lmap_i);
      vector<id_type> pmap_i(pmap.size()); invert(pmap,pmap_i);
      vector<id_type> dmap_i(dmap.size()); invert(dmap,dmap_i);
      for (size_t i = 0; i < totalWords; ++i)
        {
          w[i].sform = smap_i[w[i].sform];
          w[i].lemma = lmap_i[w[i].lemma];
          w[i].majpos = pmap_i[w[i].majpos];
          w[i].minpos = pmap_i[w[i].minpos];
          w[i].dtype = dmap_i[w[i].dtype];
        }
    }
  else
    {
      // plain text: a single id stream, remapped via SF only
      vector<size_t> sf(SF.totalVocabSize(), 0);
      id_type* w = reinterpret_cast<id_type*>(const_cast<char*>(p));
      for (size_t i = 0; i < totalWords; ++i) ++sf.at(w[i]);
      conservative_sort(SF,sf,smap);
      vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
      for (size_t i = 0; i < totalWords; ++i) w[i] = smap_i[w[i]];
    }
  mtt.close();
  if (!quiet) cerr << "done." << endl;
}
00342
00343 void save_vocabs()
00344 {
00345 string vbase = baseName;
00346 if (is_conll)
00347 {
00348 if (SF.totalVocabSize() > SF.knownVocabSize())
00349 write_tokenindex(vbase+".tdx.sfo",SF,smap);
00350 if (LM.totalVocabSize() > LM.knownVocabSize())
00351 write_tokenindex(vbase+".tdx.lem",LM,lmap);
00352 if (PS.totalVocabSize() > PS.knownVocabSize())
00353 write_tokenindex(vbase+".tdx.pos",PS,pmap);
00354 if (DT.totalVocabSize() > DT.knownVocabSize())
00355 write_tokenindex(vbase+".tdx.drl",DT,dmap);
00356 }
00357 else if (SF.totalVocabSize() > SF.knownVocabSize())
00358 write_tokenindex(vbase+".tdx",SF,smap);
00359 }
00360
// Build an in-memory token sequence array over the token track stored
// in /infile/ and save it in memory-mapped form to /outfile/.  The
// Token type determines the traversal order (e.g. L2R_Token for suffix
// arrays, R2L_Token for prefix arrays).  All sentences are included
// (the filter bitset is set to all-true).
template<typename Token>
void
build_mmTSA(string infile, string outfile)
{


  boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
  bdBitset filter;
  filter.resize(T->size(),true); // include every sentence
  imTSA<Token> S(T,&filter,(quiet?NULL:&cerr)); // progress to cerr unless -q
  S.save_as_mm_tsa(outfile);

}
00374
00375 bool
00376 build_plaintext_tsas()
00377 {
00378 typedef L2R_Token<SimpleWordId> L2R;
00379 typedef R2L_Token<SimpleWordId> R2L;
00380
00381 if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa");
00382 if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa");
00383
00384 return true;
00385 }
00386
00387 void build_conll_tsas()
00388 {
00389 string bn = baseName;
00390 string mtt = tmpFile;
00391 size_t c = 3 * (with_sfas + with_pfas + with_dcas);
00392 if (with_sfas)
00393 {
00394 build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform");
00395 build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma");
00396 build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos");
00397 }
00398
00399 if (with_pfas)
00400 {
00401 build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform");
00402 build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma");
00403 build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos");
00404 }
00405
00406 if (with_dcas)
00407 {
00408 build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
00409 build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
00410 build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos");
00411 }
00412
00413 }
00414
00415
00416 int main(int argc, char* argv[])
00417 {
00418 init(argc,argv);
00419 numberize();
00420 if (SF.totalVocabSize() > SF.knownVocabSize() ||
00421 LM.totalVocabSize() > LM.knownVocabSize() ||
00422 PS.totalVocabSize() > PS.knownVocabSize() ||
00423 DT.totalVocabSize() > DT.knownVocabSize())
00424 {
00425 remap();
00426 save_vocabs();
00427 }
00428 if (is_conll) build_conll_tsas();
00429 else build_plaintext_tsas();
00430 if (!quiet) cerr << endl;
00431 rename(tmpFile.c_str(),mttFile.c_str());
00432 }
00433
// Parse command-line options into the global configuration variables.
// The single positional argument is the output base name (-o).  Prints
// the usage message and exits(0) when --help is given or no output
// name is provided.
void
interpret_args(int ac, char* av[])
{
  po::variables_map vm;
  po::options_description o("Options");
  o.add_options()

    ("help,h", "print this message")

    ("quiet,q", po::bool_switch(&quiet),
     "don't print progress information")

    ("incremental,i", po::bool_switch(&incremental),
     "incremental mode; rewrites vocab files!")

    ("vocab-base,v", po::value<string>(&vocabBase),
     "base name of various vocabularies")

    ("output,o", po::value<string>(&baseName),
     "base file name of the resulting file(s)")

    ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
     "also build suffix arrays")

    ("pfa,p", po::value<int>(&with_pfas)
     ->default_value(0)->implicit_value(1),
     "also build prefix arrays")

    ("dca,d", po::value<int>(&with_dcas)
     ->default_value(0)->implicit_value(1),
     "also build dependency chain arrays")

    ("conll,c", po::bool_switch(&is_conll),
     "corpus is in CoNLL format (default: plain text)")

    ("unk,u", po::value<string>(&UNK)->default_value("UNK"),
     "label for unknown tokens")



    ;

  // hidden options (currently none) are parsed like visible ones but
  // excluded from the help text printed below
  po::options_description h("Hidden Options");
  h.add_options()
    ;
  h.add(o);
  // first positional argument = output base name
  po::positional_options_description a;
  a.add("output",1);

  po::store(po::command_line_parser(ac,av)
            .options(h)
            .positional(a)
            .run(),vm);
  po::notify(vm);
  if (vm.count("help") || !vm.count("output"))
    {
      cout << "\nusage:\n\t cat <corpus> | " << av[0]
           << " [options] <output .mtt file>" << endl;
      cout << o << endl;
      exit(0);
    }
  // .mtt = CoNLL track, .mct = plain-text track; all work happens in
  // tmpFile, which main() renames to mttFile on success
  mttFile = baseName + (is_conll ? ".mtt" : ".mct");
  tmpFile = mttFile + "_";
}