00001
00002 #include <boost/program_options.hpp>
00003 #include <boost/algorithm/string/predicate.hpp>
00004 #include <boost/format.hpp>
00005 #include "mm/ug_bitext.h"
00006 #include "mm/tpt_typedefs.h"
00007 #include "mm/ug_prime_sampling1.h"
00008 #include "mm/ug_bitext_sampler.h"
00009 #include "mm/ug_phrasepair.h"
00010 #include "mm/ug_lru_cache.h"
00011
00012 #include "generic/sorting/VectorIndexSorter.h"
00013 #include "generic/sorting/NBestList.h"
00014 #include <string>
00015 #include <boost/unordered_map.hpp>
00016 #include "moses/thread_safe_container.h"
00017 #include "mm/ug_prep_phrases.h"
00018
00019 using namespace std;
00020 using namespace Moses;
00021 using namespace Moses::bitext;
00022 namespace po=boost::program_options;
00023 using namespace boost::algorithm;
00024 typedef L2R_Token<SimpleWordId> Token;
00025 typedef mmBitext<Token> mmbitext;
00026 typedef Bitext<Token>::tsa tsa;
00027
00028 typedef Bitext<Token>::iter iter;
00029 typedef imTtrack<Token> imttrack;
00030 typedef imTSA<Token> imtsa;
00031
00032 string bname, bname1, bname2, ifile, L1, L2, Q1, Q2;
00033 size_t maxhits;
00034 size_t cache_size;
00035 void interpret_args(int ac, char* av[]);
00036
00037 sptr<imttrack>
00038 read_input(TokenIndex& V)
00039 {
00040 sptr<vector<vector<Token> > > crp(new vector<vector<Token> >);
00041 crp->reserve(1000);
00042 string line;
00043 istream* in = &cin;
00044 ifstream inputfile;
00045 if (ifile.size())
00046 {
00047 inputfile.open(ifile.c_str());
00048 in = & inputfile;
00049 }
00050 while (getline(*in,line))
00051 {
00052 crp->push_back(vector<Token>());
00053 fill_token_seq(V, line, crp->back());
00054 }
00055 sptr<imttrack> ret(new imttrack (crp));
00056 return ret;
00057 }
00058
// Alias for a ThreadSafeContainer keyed by uint64_t and holding sptr<pstats>
// values. Nothing in the visible part of this file refers to it.
00059 typedef ThreadSafeContainer<uint64_t, sptr<pstats> > permacache_t;
00060
00061 void dump(iter& m, TokenIndex& V)
00062 {
00063 if (m.down())
00064 {
00065 do
00066 {
00067
00068 dump(m,V);
00069 }
00070 while (m.over());
00071 m.up();
00072 }
00073 }
00074
00075 int main(int argc, char* argv[])
00076 {
00077 typedef vector<PhrasePair<Token> > pplist_t;
00078 interpret_args(argc, argv);
00079 boost_iptr<mmbitext> Bptr(new mmbitext);
00080 mmbitext& B = *Bptr;
00081 B.open(bname, L1, L2);
00082 B.V1->setDynamic(true);
00083 sptr<imttrack> icrp = read_input(*B.V1);
00084 imtsa newIdx(icrp,NULL);
00085 sptr<SentenceBias> bias = prime_sampling1(*B.I1, newIdx, 5000, B.sid2did());
00086 cerr << "primed " << endl;
00087 ug::ThreadPool T(1);
00088 TSA<Token>::tree_iterator m(&newIdx);
00089
00090
00091 TSA<Token>::tree_iterator r(B.I1.get());
00092 StatsCollector<Token> collect(Bptr, bias);
00093
00094 collect.process(m, r);
00095
00096 typedef PhrasePair<Token>::SortDescendingByJointCount sorter_t;
00097 sorter_t sorter;
00098 for (size_t s = 0; s < icrp->size(); ++s)
00099 {
00100 size_t stop = icrp->sntLen(s);
00101 Token const* t = icrp->sntStart(s);
00102 cout << string(80,'-') << "\n" << toString(*B.V1, t, stop) << endl;
00103 for (size_t i = 0; i < stop; ++i)
00104 {
00105 iter r(B.I1.get());
00106 for (size_t k = i; k < stop && r.extend(t[k].id()); ++k)
00107 {
00108 sptr<pstats> stats = (*collect.lcache)[r.getPid()];
00109 stats->wait();
00110 pplist_t pplist;
00111 expand(r, B, *stats, pplist, NULL);
00112 if (pplist.empty()) continue;
00113 cout << "\n" << r.str(B.V1.get()) << " [" << r.ca() << "]" << endl;
00114 VectorIndexSorter<PhrasePair<Token>, sorter_t> viso(pplist, sorter);
00115 sptr<vector<size_t> > ranked = viso.GetOrder();
00116 size_t ctr=0;
00117 BOOST_FOREACH(size_t const i, *ranked)
00118 {
00119 PhrasePair<Token> const& pp = pplist[i];
00120
00121 cout << boost::format(" %6d %.5f | ") % pp.joint % pp.cum_bias
00122 << toString(*B.V2, pp.start2, pp.len2)
00123 << " [";
00124 for (size_t d = 0; d < pp.indoc.size(); ++d)
00125 {
00126 if (d) cout << ":";
00127 cout << pp.indoc[d];
00128 }
00129 cout << "]" << endl;
00130 if (++ctr == 5) break;
00131 }
00132 }
00133 }
00134 }
00135 }
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224 void
00225 interpret_args(int ac, char* av[])
00226 {
00227 po::variables_map vm;
00228 po::options_description o("Options");
00229 o.add_options()
00230
00231 ("help,h", "print this message")
00232 ("cache,C", po::value<size_t>(&cache_size)->default_value(0),
00233 "cache size")
00234
00235
00236
00237
00238 ;
00239
00240 po::options_description h("Hidden Options");
00241 h.add_options()
00242 ("bname", po::value<string>(&bname), "base name of corpus")
00243 ("L1", po::value<string>(&L1), "L1 tag")
00244 ("L2", po::value<string>(&L2), "L2 tag")
00245 ("ifile,i", po::value<string>(&ifile), "input file")
00246 ;
00247
00248 h.add(o);
00249 po::positional_options_description a;
00250 a.add("bname",1);
00251 a.add("L1",1);
00252 a.add("L2",1);
00253 a.add("ifile",1);
00254
00255 po::store(po::command_line_parser(ac,av)
00256 .options(h)
00257 .positional(a)
00258 .run(),vm);
00259 po::notify(vm);
00260 if (vm.count("help"))
00261 {
00262 cout << "\nusage:\n\t" << av[0]
00263 << " [options] [--q1=<L1string>] [--q2=<L2string>]" << endl;
00264 cout << o << endl;
00265 exit(0);
00266 }
00267 }