00001
00002
00003
00004 #include <boost/foreach.hpp>
00005 #include <boost/format.hpp>
00006 #include <boost/tokenizer.hpp>
00007 #include <boost/shared_ptr.hpp>
00008 #include <algorithm>
00009 #include <iostream>
00010 #include "mm/ug_bitext.h"
00011 #include "generic/file_io/ug_stream.h"
00012 #include <string>
00013 #include <sstream>
00014
00015 using namespace Moses;
00016 using namespace sapt;
00017 using namespace std;
00018 using namespace boost;
00019
00020 typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
00021 typedef mmBitext<Token> bitext_t;
00022
// Strict ordering for (name, count) pairs: an entry comes first when its
// count (the pair's second member) is larger, so sorting with this
// functor lists the highest counts first. The name (first member) is
// not consulted.
struct mycmp
{
  bool operator() (pair<string,uint32_t> const& lhs,
                   pair<string,uint32_t> const& rhs) const
  {
    // "lhs before rhs" exactly when rhs has the strictly smaller count
    return rhs.second < lhs.second;
  }
};
00031
// Return the last component of @p path (the text after the final '/'),
// with @p suffix removed from its end when the component ends in it.
//
// - A path without any '/' is treated as consisting of the file name only.
// - The suffix is stripped only if it fits entirely inside the file name;
//   otherwise the file name is returned unchanged.
//
// Side effect: writes two trace lines to stdout (the arguments, then the
// directory part and the tail of the path that is compared to the suffix).
string
basename(string const& path, string const& suffix)
{
  size_t const slash = path.find_last_of('/');
  // First character of the file name: just past the last '/', or the
  // beginning of the path when there is no '/' at all.
  size_t const start = (slash == string::npos) ? 0 : slash + 1;
  // Offset of the candidate suffix; clamped to 0 so that a suffix longer
  // than the whole path cannot wrap the unsigned subtraction around.
  size_t const tail = (suffix.size() <= path.size()
                       ? path.size() - suffix.size() : 0);

  cout << path << " " << suffix << endl;
  cout << path.substr(0,slash) << " " << path.substr(tail) << endl;

  // Only strip when the suffix lies completely within the file name
  // (which implies tail >= start) and actually matches the end of the path.
  bool const strip = (suffix.size() <= path.size() - start
                      && path.compare(tail, string::npos, suffix) == 0);
  size_t const stop = strip ? tail : path.size();
  return path.substr(start, stop - start);
}
00041
00042 int main(int argc, char* argv[])
00043 {
00044 boost::shared_ptr<bitext_t> B(new bitext_t);
00045 B->open(argv[1],argv[2],argv[3]);
00046 string line;
00047 string ifile = argv[4];
00048 string docname = basename(ifile, string(".") + argv[2] + ".gz");
00049 boost::iostreams::filtering_istream in;
00050 ugdiss::open_input_stream(ifile,in);
00051 while(getline(in,line))
00052 {
00053 cout << line << " [" << docname << "]" << endl;
00054 vector<id_type> snt;
00055 B->V1->fillIdSeq(line,snt);
00056 for (size_t i = 0; i < snt.size(); ++i)
00057 {
00058 bitext_t::iter m(B->I1.get());
00059 for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
00060 {
00061 if (m.ca() > 500) continue;
00062 sapt::tsa::ArrayEntry I(m.lower_bound(-1));
00063 char const* stop = m.upper_bound(-1);
00064 map<string,uint32_t> cnt;
00065 while (I.next != stop)
00066 {
00067 m.root->readEntry(I.next,I);
00068 ++cnt[B->sid2docname(I.sid)];
00069 }
00070 cout << setw(8) << int(m.ca()) << " "
00071 << B->V1->toString(&snt[i],&snt[k+1]) << endl;
00072 typedef pair<string,uint32_t> entry;
00073 vector<entry> ranked; ranked.reserve(cnt.size());
00074 BOOST_FOREACH(entry const& e, cnt) ranked.push_back(e);
00075 sort(ranked.begin(),ranked.end(),mycmp());
00076 BOOST_FOREACH(entry const& e, ranked)
00077 cout << setw(12) << " " << e.second << " " << e.first << endl;
00078 cout << endl;
00079 }
00080 }
00081 }
00082 }