00001
00002 #include <boost/program_options.hpp>
00003 #include <boost/algorithm/string/predicate.hpp>
00004 #include <boost/format.hpp>
00005 #include "mm/ug_bitext.h"
00006 #include "mm/tpt_typedefs.h"
00007 #include "mm/ug_prime_sampling1.h"
00008 #include "generic/sorting/VectorIndexSorter.h"
00009 #include "generic/sorting/NBestList.h"
00010 #include <string>
00011
00012 using namespace std;
00013 using namespace Moses;
00014 using namespace Moses::bitext;
00015 namespace po=boost::program_options;
00016 using namespace boost::algorithm;
00017 typedef L2R_Token<SimpleWordId> Token;
00018 typedef mmBitext<Token> mmbitext;
00019 typedef Bitext<Token>::tsa tsa;
00020 typedef imTtrack<Token> imttrack;
00021 typedef imTSA<Token> imtsa;
00022
00023 string bname, bname1, bname2, L1, L2, Q1, Q2;
00024 size_t maxhits;
00025 void interpret_args(int ac, char* av[]);
00026
00027 TokenIndex V1;
00028 TokenIndex V2;
00029 sptr<mmTtrack<Token> > C1;
00030 sptr<mmTtrack<Token> > C2;
00031 mmTSA<Token> I1;
00032
00033 void
00034 open_bitext()
00035 {
00036 C1.reset(new mmTtrack<Token>);
00037 if (L2.size())
00038 {
00039 bname1 = bname + L1 + ".";
00040 bname2 = bname + L2 + ".";
00041 }
00042 else if (L1.size())
00043 {
00044 bname1 = bname;
00045 bname2 = L1;
00046 }
00047 else bname1 = bname;
00048
00049 if (bname2.size()) C2.reset(new mmTtrack<Token>);
00050
00051 C1->open(bname1+"mct");
00052 I1.open(bname1+"sfa", C1);
00053 V1.open(bname1+"tdx");
00054 V1.setDynamic(true);
00055
00056 if (bname2.size())
00057 {
00058 C2->open(bname2+"mct");
00059 V2.open(bname2+"tdx");
00060 }
00061
00062 }
00063
00064 sptr<imttrack>
00065 read_input()
00066 {
00067 sptr<vector<vector<Token> > > crp(new vector<vector<Token> >);
00068 crp->reserve(1000);
00069 string line;
00070 while (getline(cin,line))
00071 {
00072 crp->push_back(vector<Token>());
00073 fill_token_seq(V1, line, crp->back());
00074 }
00075 sptr<imttrack> ret(new imttrack (crp));
00076 return ret;
00077 }
00078
00079 sptr<NBestList<uint32_t, VectorIndexSorter<float> > >
00080 nbest(TSA<Token>::tree_iterator const& r, vector<float> const& hits,
00081 vector<float>& score, VectorIndexSorter<float>& sorter,
00082 size_t const nbest_size)
00083 {
00084 typedef NBestList<uint32_t, VectorIndexSorter<float> > nbest_list_t;
00085 sptr<nbest_list_t> ret(new nbest_list_t(nbest_size, sorter));
00086 bitvector mycheck(hits.size());
00087 tsa::ArrayEntry I(r.lower_bound(-1));
00088 char const* stop = r.upper_bound(-1);
00089 while (I.next < stop)
00090 {
00091 r.root->readEntry(I.next,I);
00092 if (mycheck[I.sid]) continue;
00093 score[I.sid] = hits[I.sid] / r.root->getCorpus()->sntLen(I.sid);
00094 ret->add(I.sid);
00095 mycheck.set(I.sid);
00096 }
00097 return ret;
00098 }
00099
00100 int main(int argc, char* argv[])
00101 {
00102 interpret_args(argc, argv);
00103 open_bitext();
00104 sptr<imttrack> icrp = read_input();
00105 imtsa newIdx(icrp,NULL);
00106 sptr<SentenceBias> hits = prime_sampling1(I1, newIdx, 1000);
00107 vector<float> score(hits->size());
00108 VectorIndexSorter<float> sorter(score);
00109 for (size_t s = 0; s < icrp->size(); ++s)
00110 {
00111 size_t stop = icrp->sntLen(s);
00112 Token const* t = icrp->sntStart(s);
00113 cout << string(80,'-') << "\n" << toString(V1, t, stop) << endl;
00114 for (size_t i = 0; i < stop; ++i)
00115 {
00116 TSA<Token>::tree_iterator r(&I1);
00117 for (size_t k = i; k < stop && r.extend(t[k].id()); ++k)
00118 {
00119 if (r.ca() < 3) continue;
00120 cout << "\n" << r.str(&V1) << " " << int(r.ca()) << endl;
00121 if (r.ca() > 10000) continue;
00122 sptr<NBestList<uint32_t, VectorIndexSorter<float> > > top;
00123 top = nbest(r, *hits, score, sorter, 5);
00124 for (size_t n = 0; n < top->size(); ++n)
00125 {
00126 cout << "[" << n << ": " << score[(*top)[n]]
00127 << " (" << (*hits)[(*top)[n]] << "/" << C1->sntLen((*top)[n]) << ")]\n"
00128 << toString(V1, C1->sntStart((*top)[n]), C1->sntLen((*top)[n])) << "\n";
00129 if (C2) cout << toString(V2, C2->sntStart((*top)[n]), C2->sntLen((*top)[n])) << "\n";
00130 cout << endl;
00131 }
00132 }
00133 }
00134
00135 }
00136 }
00137
00138 void
00139 interpret_args(int ac, char* av[])
00140 {
00141 po::variables_map vm;
00142 po::options_description o("Options");
00143 o.add_options()
00144
00145 ("help,h", "print this message")
00146 ("maxhits,n", po::value<size_t>(&maxhits)->default_value(25),
00147 "max. number of hits")
00148 ("q1", po::value<string>(&Q1), "query in L1")
00149 ("q2", po::value<string>(&Q2), "query in L2")
00150 ;
00151
00152 po::options_description h("Hidden Options");
00153 h.add_options()
00154 ("bname", po::value<string>(&bname), "base name of corpus")
00155 ("L1", po::value<string>(&L1), "L1 tag")
00156 ("L2", po::value<string>(&L2), "L2 tag")
00157 ;
00158
00159 h.add(o);
00160 po::positional_options_description a;
00161 a.add("bname",1);
00162 a.add("L1",1);
00163 a.add("L2",1);
00164
00165 po::store(po::command_line_parser(ac,av)
00166 .options(h)
00167 .positional(a)
00168 .run(),vm);
00169 po::notify(vm);
00170 if (vm.count("help"))
00171 {
00172 cout << "\nusage:\n\t" << av[0]
00173 << " [options] [--q1=<L1string>] [--q2=<L2string>]" << endl;
00174 cout << o << endl;
00175 exit(0);
00176 }
00177 }