Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/UG/check-coverage2.cc Source File

00001 // for each word in the input, keep track of the longest matching ngram covering it
00002 #include <boost/foreach.hpp>
00003 #include <boost/format.hpp>
00004 #include <boost/tokenizer.hpp>
00005 #include <boost/shared_ptr.hpp>
00006 #include <algorithm>
00007 #include <iostream>
00008 #include "mm/ug_bitext.h"
00009 #include "generic/file_io/ug_stream.h"
00010 #include <string>
00011 #include <sstream>
00012 
00013 using namespace Moses;
00014 using namespace sapt;
00015 using namespace std;
00016 using namespace boost;
00017 
00018 typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
00019 typedef mmBitext<Token> bitext_t;
00020 
// Ordering functor for (string, uint32_t) pairs.
// Compares only the uint32_t member and ranks the larger value first, so
// sorting with it yields a descending order; the string member is ignored.
struct mycmp
{
  bool
  operator()(std::pair<std::string, uint32_t> const& lhs,
             std::pair<std::string, uint32_t> const& rhs) const
  {
    // lhs goes before rhs exactly when its number is strictly larger
    return rhs.second < lhs.second;
  }
};
00029 
// Return the file-name component of @p path with a trailing @p suffix removed.
//
// - The directory part (everything up to and including the last '/') is
//   dropped; a path without any '/' is treated as a bare file name.
// - @p suffix is stripped only when the file name actually ends with it;
//   otherwise the file name is returned unchanged.
//
// Examples: ("/a/b/doc.en.gz", ".en.gz") -> "doc"
//           ("doc.txt",        ".en.gz") -> "doc.txt"
std::string
basename(std::string const& path, std::string const& suffix)
{
  std::string::size_type const slash = path.find_last_of('/');
  // start of the file name: just past the last slash, or 0 if there is none
  std::string::size_type const start
    = (slash == std::string::npos) ? 0 : slash + 1;
  std::string::size_type len = path.size() - start;

  // Check the length first so that path.size() - suffix.size() cannot wrap
  // around and the suffix can never match across the directory separator.
  if (suffix.size() <= len &&
      path.compare(path.size() - suffix.size(), suffix.size(), suffix) == 0)
    len -= suffix.size();

  return path.substr(start, len);
}
00039 
00040 int main(int argc, char* argv[])
00041 {
00042   bitext_t B;
00043   B.open(argv[1],argv[2],argv[3]);
00044   B.V1->setDynamic(true);
00045   string line;
00046   string ifile = argv[4];
00047   string docname = basename(ifile, string(".") + argv[2] + ".gz");
00048   boost::iostreams::filtering_istream in;
00049   ugdiss::open_input_stream(ifile,in);
00050   while(getline(in,line))
00051     {
00052       cout << line << " [" << docname << "]" << endl;
00053       vector<id_type> snt;
00054       B.V1->fillIdSeq(line,snt);
00055       vector<size_t> match(snt.size(),0);
00056       for (size_t i = 0; i < snt.size(); ++i)
00057         {
00058           bitext_t::iter m(B.I1.get());
00059           for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
00060           for (size_t j = 0; j < m.size(); ++j) 
00061 
00062             match[i+j] = max(match[i+j], m.size());
00063         }
00064       for (size_t i = 0; i < snt.size(); ++i)
00065         cout << setw(3) << match[i] << " " << (*B.V1)[snt[i]] << endl;
00066     }
00067 }