00001 #include <boost/program_options.hpp>
00002 #include "mm/ug_bitext.h"
00003 #include <string>
00004
00005 using namespace std;
00006 using namespace Moses;
00007 using namespace sapt;
00008
00009 namespace po=boost::program_options;
00010 typedef L2R_Token<SimpleWordId> Token;
00011 typedef mmBitext<Token> mmbitext;
00012 typedef Bitext<Token>::tsa tsa;
00013
00014 string bname, L1, L2, Q1, Q2;
00015 size_t maxhits;
00016 void interpret_args(int ac, char* av[]);
00017
00018
00019 void
00020 write_sentence
00021 (Ttrack<Token> const& T, uint32_t const sid, TokenIndex const& V, ostream& out)
00022 {
00023 Token const* t = T.sntStart(sid);
00024 Token const* e = T.sntEnd(sid);
00025
00026 while (t < e)
00027 {
00028
00029 out << V[t->id()];
00030 if (++t < e) out << " ";
00031 }
00032 }
00033
00034 bool
00035 fill(string const& query, TSA<Token> const& tsa,
00036 TokenIndex const& V, bitvector& v)
00037 {
00038 v.resize(tsa.getCorpus()->size());
00039 Bitext<Token>::iter m(&tsa);
00040 istringstream buf(query); string w;
00041 while (buf >> w)
00042 if (!m.extend(V[w]))
00043 return false;
00044 m.markSentences(v);
00045 return true;
00046 }
00047
00048
00049
00050
00051 int main(int argc, char* argv[])
00052 {
00053 interpret_args(argc, argv);
00054 if (Q1.empty() && Q2.empty()) exit(0);
00055
00056 boost::shared_ptr<mmbitext> B(new mmbitext); string w;
00057 B->open(bname, L1, L2);
00058
00059 Bitext<Token>::iter m1(B->I1.get(), *B->V1, Q1);
00060 if (Q1.size() && m1.size() == 0) exit(0);
00061
00062 Bitext<Token>::iter m2(B->I2.get(), *B->V2, Q2);
00063 if (Q2.size() && m2.size() == 0) exit(0);
00064
00065 bitvector check(B->T1->size());
00066 if (Q1.size() == 0 || Q2.size() == 0) check.set();
00067 else (m2.markSentences(check));
00068
00069 Bitext<Token>::iter& m = m1.size() ? m1 : m2;
00070 char const* x = m.lower_bound(-1);
00071 char const* stop = m.upper_bound(-1);
00072 uint64_t sid;
00073 ushort off;
00074 boost::taus88 rnd;
00075 size_t N = m.approxOccurrenceCount();
00076 maxhits = min(N, maxhits);
00077 size_t k = 0;
00078 for (size_t i = 0; x < stop; ++i)
00079 {
00080 x = m.root->readSid(x,stop,sid);
00081 x = m.root->readOffset(x,stop,off);
00082
00083 if (!check[sid]) continue;
00084 size_t r = (N - i) * rnd()/(rnd.max()+1.) + k;
00085 if (maxhits != N && r >= maxhits) continue;
00086 ++k;
00087
00088 size_t s1,s2,e1,e2; int po_fwd=-1,po_bwd=-1;
00089 std::vector<unsigned char> caln;
00090
00091 if (!B->find_trg_phr_bounds(sid, off, off+m.size(),
00092 s1,s2,e1,e2,po_fwd,po_bwd,
00093 &caln, NULL, &m == &m2))
00094 {
00095
00096 }
00097
00098 std::cout << sid << " " << B->sid2docname(sid)
00099 << " dfwd=" << po_fwd << " dbwd=" << po_bwd
00100 << "\n";
00101
00102 write_sentence(*B->T1, sid, *B->V1, std::cout); std::cout << "\n";
00103 write_sentence(*B->T2, sid, *B->V2, std::cout); std::cout << "\n";
00104 B->write_yawat_alignment(sid,
00105 m1.size() ? &m1 : NULL,
00106 m2.size() ? &m2 : NULL, std::cout);
00107 std::cout << std::endl;
00108
00109 }
00110 }
00111
00112 void
00113 interpret_args(int ac, char* av[])
00114 {
00115 po::variables_map vm;
00116 po::options_description o("Options");
00117 o.add_options()
00118
00119 ("help,h", "print this message")
00120 ("maxhits,n", po::value<size_t>(&maxhits)->default_value(25),
00121 "max. number of hits")
00122 ("q1", po::value<string>(&Q1), "query in L1")
00123 ("q2", po::value<string>(&Q2), "query in L2")
00124 ;
00125
00126 po::options_description h("Hidden Options");
00127 h.add_options()
00128 ("bname", po::value<string>(&bname), "base name of corpus")
00129 ("L1", po::value<string>(&L1), "L1 tag")
00130 ("L2", po::value<string>(&L2), "L2 tag")
00131 ;
00132
00133 h.add(o);
00134 po::positional_options_description a;
00135 a.add("bname",1);
00136 a.add("L1",1);
00137 a.add("L2",1);
00138
00139 po::store(po::command_line_parser(ac,av)
00140 .options(h)
00141 .positional(a)
00142 .run(),vm);
00143 po::notify(vm);
00144 if (vm.count("help"))
00145 {
00146 std::cout << "\nusage:\n\t" << av[0]
00147 << " [options] [--q1=<L1string>] [--q2=<L2string>]" << std::endl;
00148 std::cout << o << std::endl;
00149 exit(0);
00150 }
00151 }