00001
00002
00003
00004 namespace sapt
00005 {
00006 template<typename TKN>
00007 class mmBitext : public Bitext<TKN>
00008 {
00009 void load_document_map(std::string const& fname);
00010 public:
00011 void open(std::string const base, std::string const L1, std::string L2);
00012 mmBitext();
00013 };
00014
00015 template<typename TKN>
00016 mmBitext<TKN>::
00017 mmBitext()
00018 : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(),
00019 new TokenIndex(), new TokenIndex(),
00020 new mmTSA<TKN>(), new mmTSA<TKN>())
00021 {};
00022
00023 template<typename TKN>
00024 void
00025 mmBitext<TKN>::
00026 load_document_map(std::string const& fname)
00027 {
00028 std::ifstream docmap(fname.c_str());
00029
00030
00031
00032
00033
00034
00035 std::string buffer,docname; size_t a=0,b;
00036 this->m_sid2docid.reset(new std::vector<id_type>(this->T1->size()));
00037 while(getline(docmap,buffer))
00038 {
00039 std::istringstream line(buffer);
00040 if (!(line>>docname)) continue;
00041 if (docname.size() && docname[0] == '#') continue;
00042 size_t docid = this->m_docname2docid.size();
00043 this->m_docname2docid[docname] = docid;
00044 this->m_docname.push_back(docname);
00045 line >> b;
00046 #ifndef NO_MOSES
00047 VERBOSE(3, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl);
00048 #endif
00049 for (b += a; a < b; ++a)
00050 (*this->m_sid2docid)[a] = docid;
00051 }
00052 UTIL_THROW_IF2(b != this->T1->size(),
00053 "Document map doesn't match corpus!");
00054 }
00055
00056 template<typename TKN>
00057 void
00058 mmBitext<TKN>::
00059 open(std::string const base, std::string const L1, std::string L2)
00060 {
00061 mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
00062 mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
00063 mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
00064 t1.open(base+L1+".mct");
00065 t2.open(base+L2+".mct");
00066 tx.open(base+L1+"-"+L2+".mam");
00067 this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
00068 this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
00069 mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
00070 mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
00071 i1.open(base+L1+".sfa", this->T1);
00072 i2.open(base+L2+".sfa", this->T2);
00073 assert(this->T1->size() == this->T2->size());
00074
00075 std::string docmapfile = base+"dmp";
00076 if (!access(docmapfile.c_str(),F_OK))
00077 load_document_map(docmapfile);
00078 }
00079
00080 }
00081