00001
#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
#include <boost/shared_ptr.hpp>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include "mm/ug_bitext.h"
#include "generic/file_io/ug_stream.h"
#include <string>
#include <sstream>
00012
00013 using namespace Moses;
00014 using namespace sapt;
00015 using namespace std;
00016 using namespace boost;
00017
// Token type used throughout this tool: sapt::L2R_Token instantiated
// with sapt::SimpleWordId.
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
// Bitext type opened in main(): mmBitext instantiated over Token.
// NOTE(review): "mm" presumably stands for memory-mapped -- confirm
// against mm/ug_bitext.h.
typedef mmBitext<Token> bitext_t;
00020
// Strict-weak-ordering functor for (string, count) pairs: an entry
// sorts before another when its count (the pair's second member) is
// larger, i.e. it yields descending order by count. The string member
// is not consulted, so entries with equal counts compare equivalent.
struct mycmp
{
  bool operator()(std::pair<std::string, uint32_t> const& lhs,
                  std::pair<std::string, uint32_t> const& rhs) const
  {
    return rhs.second < lhs.second;
  }
};
00029
/**
 * Return the file-name component of @p path with @p suffix removed.
 *
 * The file name is everything after the last '/' in @p path, or the
 * whole of @p path when it contains no '/'. If the file name ends in
 * @p suffix, that suffix is dropped; otherwise the file name is
 * returned unchanged.
 *
 * Differences from the previous implementation:
 *  - a path without '/' no longer throws std::out_of_range;
 *  - a suffix longer than the path no longer underflows the offset
 *    computation (which led to an out-of-range access);
 *  - the returned name no longer starts with the '/' separator.
 *
 * @param path   path to take the file name from
 * @param suffix trailing string to strip from the file name, if present
 * @return the file name without directory part and without @p suffix
 *
 * Side effect: writes two diagnostic lines to std::cout (kept from the
 * original implementation).
 */
std::string
basename(std::string const path, std::string const suffix)
{
  std::size_t const slash = path.find_last_of('/');

  // Offset of the last suffix.size() characters of path, clamped to 0
  // so that the diagnostic below cannot index past the beginning.
  std::size_t const tail =
    path.size() >= suffix.size() ? path.size() - suffix.size() : 0;

  // Diagnostic trace; substr(0, npos) yields the whole path when there
  // is no '/'.
  std::cout << path << " " << suffix << std::endl;
  std::cout << path.substr(0, slash) << " " << path.substr(tail) << std::endl;

  // First character after the last separator (or 0 if there is none).
  std::size_t const start = (slash == std::string::npos) ? 0 : slash + 1;
  std::size_t len = path.size() - start;

  // Only strip the suffix when it lies entirely within the file name.
  if (suffix.size() <= len &&
      path.compare(path.size() - suffix.size(), suffix.size(), suffix) == 0)
    len -= suffix.size();

  return path.substr(start, len);
}
00039
00040 int main(int argc, char* argv[])
00041 {
00042 bitext_t B;
00043 B.open(argv[1],argv[2],argv[3]);
00044 B.V1->setDynamic(true);
00045 string line;
00046 string ifile = argv[4];
00047 string docname = basename(ifile, string(".") + argv[2] + ".gz");
00048 boost::iostreams::filtering_istream in;
00049 ugdiss::open_input_stream(ifile,in);
00050 while(getline(in,line))
00051 {
00052 cout << line << " [" << docname << "]" << endl;
00053 vector<id_type> snt;
00054 B.V1->fillIdSeq(line,snt);
00055 vector<size_t> match(snt.size(),0);
00056 for (size_t i = 0; i < snt.size(); ++i)
00057 {
00058 bitext_t::iter m(B.I1.get());
00059 for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
00060 for (size_t j = 0; j < m.size(); ++j)
00061
00062 match[i+j] = max(match[i+j], m.size());
00063 }
00064 for (size_t i = 0; i < snt.size(); ++i)
00065 cout << setw(3) << match[i] << " " << (*B.V1)[snt[i]] << endl;
00066 }
00067 }