Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/UG/check-coverage3.cc Source File

00001 // #include "mmsapt.h"
00002 // #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
00003 // #include "moses/TranslationTask.h"
00004 #include <boost/foreach.hpp>
00005 #include <boost/format.hpp>
00006 #include <boost/tokenizer.hpp>
00007 #include <boost/shared_ptr.hpp>
00008 #include <algorithm>
00009 #include <iostream>
00010 #include "mm/ug_bitext.h"
00011 #include "generic/file_io/ug_stream.h"
00012 #include <string>
00013 #include <sstream>
00014 #include "mm/ug_bitext_sampler.h"
00015 
00016 #include <boost/program_options.hpp>
00017 namespace po=boost::program_options;
00018 using namespace Moses;
00019 using namespace sapt;
00020 using namespace std;
00021 using namespace boost;
00022 
// Shorthands for the token type and for the bitext type instantiated over
// it; both are used by the functions below.
00023 typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
00024 typedef mmBitext<Token> bitext_t;
00025 
// File-level settings. interpret_args() binds the command-line options to
// these variables (all except docname); main() and the helpers read them.
std::size_t topN;            // --top/-n: max. number of entries to show
std::string docname;         // not referenced in the visible part of this file
std::string reference_file;  // --reference/-r: optional reference file
std::string domain_name;     // --domain/-D: domain name
std::string bname;           // 1st positional argument: base name of corpus
std::string L1;              // 2nd positional argument: L1 tag
std::string L2;              // 3rd positional argument: L2 tag
std::string ifile;           // 4th positional argument: input file
00032 
// Strict weak ordering on (label, count) pairs that puts larger counts
// first. Only the count (the pair's second member) is compared; the label
// never influences the order.
struct mycmp
{
  bool operator() (std::pair<std::string,uint32_t> const& lhs,
                   std::pair<std::string,uint32_t> const& rhs) const
  {
    return rhs.second < lhs.second;
  }
};
00041 
00042 
00043 
// Parses the command line into the file-level option variables; the
// definition follows main().
00044 void interpret_args(int ac, char* av[]);
00045 
00046 string 
00047 basename(string const path)
00048 {
00049   size_t p = path.find_last_of("/");
00050   string dot = ".";
00051   size_t k = path.find((dot + L1),p+1);
00052   if (k == string::npos) k = path.find(dot + L1 + ".gz");
00053   if (k == string::npos) return path.substr(p+1);
00054   return path.substr(p+1, k-p-1);
00055 }
00056 
00057 void 
00058 print_evidence_list(bitext_t const& B, std::map<uint32_t, uint32_t> const& indoc)
00059 {
00060   typedef std::map<uint32_t, uint32_t>::const_iterator iter;
00061   typedef pair<size_t,string> item;
00062   vector<item> where; 
00063   where.reserve(indoc.size());
00064   
00065   for (iter d = indoc.begin(); d != indoc.end(); ++d)
00066     where.push_back(item(d->second, B.docid2name(d->first)));
00067   sort(where.begin(),where.end(),greater<item>());
00068   BOOST_FOREACH(item const& doc, where)
00069     if (domain_name == doc.second)
00070       cout << (boost::format("\t\t%4d ! %s") % doc.first % doc.second) << endl;
00071     else
00072       cout << (boost::format("\t\t%4d   %s") % doc.first % doc.second) << endl;
00073 }
00074 
00075 int main(int argc, char* argv[])
00076 {
00077   boost::shared_ptr<bitext_t> B(new bitext_t);
00078   interpret_args(argc,argv);
00079 
00080   B->open(bname, L1, L2);
00081   string line, refline;
00082   if (domain_name == "" && ifile != "-")
00083     domain_name = basename(ifile);
00084   
00085   id_type docid = B->docname2docid(domain_name);
00086   boost::iostreams::filtering_istream in, ref;
00087   ugdiss::open_input_stream(ifile,in);
00088   if (reference_file.size()) 
00089     ugdiss::open_input_stream(reference_file,ref);
00090 
00091   while(getline(in,line))
00092     {
00093       if (reference_file.size()) getline(ref, refline);
00094       cout << string(80,'-') << endl;
00095       cout << " [" << domain_name << "]" << endl;
00096       cout << line << endl;
00097       if (refline.size()) cout << refline << endl;
00098       cout << string(80,'-') << endl;
00099       vector<id_type> snt;
00100       B->V1->fillIdSeq(line,snt);
00101       for (size_t i = 0; i < snt.size(); ++i)
00102         {
00103           bitext_t::iter m(B->I1.get());
00104           for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k);
00105           for (size_t num_occurrences = 0; m.size(); m.up())
00106             {
00107               if (size_t(m.ca()) == num_occurrences) continue;
00108               num_occurrences = m.ca();
00109               SPTR<SamplingBias const> zilch;
00110               BitextSampler<Token> s(B, m, zilch, 1000, 1000, 
00111                                      sapt::random_sampling);
00112               s();
00113               if (s.stats()->trg.size() == 0) continue;
00114               sapt::pstats::indoc_map_t::const_iterator d
00115                 = s.stats()->indoc.find(docid);
00116               size_t indoccnt = d != s.stats()->indoc.end() ? d->second : 0;
00117               cout << m.str(B->V1.get()) << " (" 
00118                    << s.stats()->trg.size() << " entries; " 
00119                    << indoccnt << "/" << s.stats()->good 
00120                    << " samples in domain; " << num_occurrences
00121                    << " occ.)" << endl;
00122               vector<PhrasePair<Token> > ppairs;
00123               PhrasePair<Token>::SortDescendingByJointCount sorter;
00124               expand(m,*B,*s.stats(),ppairs,NULL);
00125               sort(ppairs.begin(),ppairs.end(),sorter);
00126               boost::format fmt("%4d/%d/%d |%s| (%4.2f : %4.2f)"); 
00127               size_t ctr = 0;
00128               bool skipped_some = false;
00129               BOOST_FOREACH(PhrasePair<Token>& ppair, ppairs)
00130                 {
00131                   if (++ctr > topN && ppair.indoc.find(docid) == ppair.indoc.end())
00132                     {
00133                       skipped_some = true;
00134                       continue;
00135                     }
00136                   if (skipped_some) 
00137                     {
00138                       cout << string(17,' ') << "..." << endl;
00139                       skipped_some = false;
00140                     }
00141                   // if (ppair.joint * 100 < ppair.good1) break;
00142                   ppair.good2 = ppair.raw2 * float(ppair.good1)/ppair.raw1;
00143                   ppair.good2 = max(ppair.good2, ppair.joint);
00144 
00145 #if 1
00146                   cout << "\t" 
00147                        << (fmt % ppair.joint % ppair.good1 % ppair.good2
00148                            % B->T2->pid2str(B->V2.get(),ppair.p2)
00149                            % (float(ppair.joint)/ppair.good1)
00150                            % (float(ppair.joint)/ppair.good2)
00151                            ) << "\n";
00152                   print_evidence_list(*B, ppair.indoc);
00153                   cout << endl;
00154 #else
00155                   cout << "\t" 
00156                        << (fmt % ppair.joint % ppair.good1 % ppair.good2
00157                            % B->T2->pid2str(B->V2.get(),ppair.p2)
00158                            % (float(ppair.joint)/ppair.good1)
00159                            % (float(ppair.joint)/ppair.good2)
00160                            ) << " [";
00161                   typedef std::map<uint32_t, uint32_t>::const_iterator iter;
00162                   for (iter d = ppair.indoc.begin(); d != ppair.indoc.end(); ++d)
00163                     {
00164                       if (d != ppair.indoc.begin()) cout << "; ";
00165                       cout << (boost::format("%s: %d") % B->docid2name(d->first)
00166                                % d->second) ;
00167                     }
00168                   cout << "]" << endl;
00169 
00170 #endif
00171 
00172                 }
00173             }
00174         }
00175     }
00176 }
00177 
00178 void
00179 interpret_args(int ac, char* av[])
00180 {
00181   po::variables_map vm;
00182   po::options_description o("Options");
00183   o.add_options()
00184 
00185     ("help,h",  "print this message")
00186     ("top,n", po::value<size_t>(&topN)->default_value(5),
00187      "max. number of entries to show")
00188     ("domain,D", po::value<string>(&domain_name),
00189      "domain name (when reading from stdin)")
00190     ("reference,r", po::value<string>(&reference_file),
00191      "reference file")
00192     ;
00193 
00194   po::options_description h("Hidden Options");
00195   h.add_options()
00196     ("bname", po::value<string>(&bname), "base name of corpus")
00197     ("L1", po::value<string>(&L1), "L1 tag")
00198     ("L2", po::value<string>(&L2), "L2 tag")
00199     ("input", po::value<string>(&ifile), "input file")
00200     ;
00201 
00202   h.add(o);
00203   po::positional_options_description a;
00204   a.add("bname",1);
00205   a.add("L1",1);
00206   a.add("L2",1);
00207   a.add("input",1);
00208 
00209   po::store(po::command_line_parser(ac,av)
00210             .options(h)
00211             .positional(a)
00212             .run(),vm);
00213   po::notify(vm);
00214   if (vm.count("help"))
00215     {
00216       std::cout << "\nusage:\n\t" << av[0]
00217            << " [options] <model file stem> <L1> <L2> <input file>" << std::endl;
00218       std::cout << o << std::endl;
00219       exit(0);
00220     }
00221 }