00001 
00002 
00003 
00004 namespace sapt
00005 {
00006   template<typename TKN>
00007   class mmBitext : public Bitext<TKN>
00008   {
00009     void load_document_map(std::string const& fname);
00010   public:
00011     void open(std::string const base, std::string const L1, std::string L2);
00012     mmBitext();
00013   };
00014 
00015   template<typename TKN>
00016   mmBitext<TKN>::
00017   mmBitext()
00018     : Bitext<TKN>(new mmTtrack<TKN>(), new mmTtrack<TKN>(), new mmTtrack<char>(),
00019                   new TokenIndex(), new TokenIndex(),
00020                   new mmTSA<TKN>(), new mmTSA<TKN>())
00021   {};
00022 
00023   template<typename TKN>
00024   void
00025   mmBitext<TKN>::
00026   load_document_map(std::string const& fname)
00027   {
00028     std::ifstream docmap(fname.c_str());
00029     
00030     
00031     
00032     
00033     
00034     
00035     std::string buffer,docname; size_t a=0,b;
00036     this->m_sid2docid.reset(new std::vector<id_type>(this->T1->size()));
00037     while(getline(docmap,buffer))
00038       {
00039         std::istringstream line(buffer);
00040         if (!(line>>docname)) continue; 
00041         if (docname.size() && docname[0] == '#') continue; 
00042         size_t docid = this->m_docname2docid.size();
00043         this->m_docname2docid[docname] = docid;
00044         this->m_docname.push_back(docname);
00045         line >> b;
00046 #ifndef NO_MOSES
00047         VERBOSE(3, "DOCUMENT MAP " << docname << " " << a << "-" << b+a << std::endl);
00048 #endif
00049         for (b += a; a < b; ++a)
00050           (*this->m_sid2docid)[a] = docid;
00051       }
00052     UTIL_THROW_IF2(b != this->T1->size(),
00053                    "Document map doesn't match corpus!");
00054   }
00055 
00056   template<typename TKN>
00057   void
00058   mmBitext<TKN>::
00059   open(std::string const base, std::string const L1, std::string L2)
00060   {
00061     mmTtrack<TKN>&  t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
00062     mmTtrack<TKN>&  t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
00063     mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
00064     t1.open(base+L1+".mct");
00065     t2.open(base+L2+".mct");
00066     tx.open(base+L1+"-"+L2+".mam");
00067     this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
00068     this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
00069     mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
00070     mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
00071     i1.open(base+L1+".sfa", this->T1);
00072     i2.open(base+L2+".sfa", this->T2);
00073     assert(this->T1->size() == this->T2->size());
00074 
00075     std::string docmapfile = base+"dmp";
00076     if (!access(docmapfile.c_str(),F_OK))
00077       load_document_map(docmapfile);
00078   }
00079 
00080 }
00081