00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 #include <cstdio>
00023 
00024 #include "PhraseTableCreator.h"
00025 #include "ConsistentPhrases.h"
00026 #include "ThrowingFwrite.h"
00027 #include "util/file.hh"
00028 #include "util/exception.hh"
00029 
00030 namespace Moses
00031 {
00032 
00033 bool operator<(const PackedItem &pi1, const PackedItem &pi2)
00034 {
00035   if(pi1.GetLine() < pi2.GetLine())
00036     return false;
00037   return true;
00038 }
00039 
00040 std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
00041 std::string PhraseTableCreator::m_separator = "|||";
00042 
00043 PhraseTableCreator::PhraseTableCreator(std::string inPath,
00044                                        std::string outPath,
00045                                        std::string tempfilePath,
00046                                        size_t numScoreComponent,
00047                                        size_t sortScoreIndex,
00048                                        Coding coding,
00049                                        size_t orderBits,
00050                                        size_t fingerPrintBits,
00051                                        bool useAlignmentInfo,
00052                                        bool multipleScoreTrees,
00053                                        size_t quantize,
00054                                        size_t maxRank,
00055                                        bool warnMe
00056 #ifdef WITH_THREADS
00057                                        , size_t threads
00058 #endif
00059                                       )
00060   : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
00061     m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
00062     m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
00063     m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
00064     m_useAlignmentInfo(useAlignmentInfo),
00065     m_multipleScoreTrees(multipleScoreTrees),
00066     m_quantize(quantize), m_maxRank(maxRank),
00067 #ifdef WITH_THREADS
00068     m_threads(threads),
00069     m_srcHash(m_orderBits, m_fingerPrintBits, 1),
00070     m_rnkHash(10, 24, m_threads),
00071 #else
00072     m_srcHash(m_orderBits, m_fingerPrintBits),
00073     m_rnkHash(m_orderBits, m_fingerPrintBits),
00074 #endif
00075     m_maxPhraseLength(0),
00076     m_lastFlushedLine(-1), m_lastFlushedSourceNum(0),
00077     m_lastFlushedSourcePhrase("")
00078 {
00079   PrintInfo();
00080 
00081   AddTargetSymbolId(m_phraseStopSymbol);
00082 
00083   size_t cur_pass = 1;
00084   size_t all_passes = 2;
00085   if(m_coding == PREnc)
00086     all_passes = 3;
00087 
00088   m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
00089   for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
00090       it != m_scoreCounters.end(); it++)
00091     *it = new ScoreCounter();
00092   m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
00093 
00094   
00095   if(m_coding == REnc) {
00096     size_t found = inPath.find_last_of("/\\");
00097     std::string path;
00098     if(found != std::string::npos)
00099       path = inPath.substr(0, found);
00100     else
00101       path = ".";
00102     LoadLexicalTable(path + "/lex.f2e");
00103   } else if(m_coding == PREnc) {
00104     std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl;
00105     cur_pass++;
00106     CreateRankHash();
00107   }
00108 
00109   
00110   std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
00111   m_srcHash.BeginSave(m_outFile);
00112 
00113   if(tempfilePath.size()) {
00114     MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
00115     m_encodedTargetPhrases = new StringVectorTemp<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
00116   } else {
00117     m_encodedTargetPhrases = new StringVectorTemp<unsigned char, unsigned long, MmapAllocator>();
00118   }
00119   EncodeTargetPhrases();
00120 
00121   cur_pass++;
00122 
00123   std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
00124   CalcHuffmanCodes();
00125 
00126   
00127   std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;
00128 
00129   if(tempfilePath.size()) {
00130     MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
00131     m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
00132   } else {
00133     m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
00134   }
00135   CompressTargetPhrases();
00136 
00137   std::cerr << "Saving to " << m_outPath << std::endl;
00138   Save();
00139   std::cerr << "Done" << std::endl;
00140   std::fclose(m_outFile);
00141 }
00142 
00143 PhraseTableCreator::~PhraseTableCreator()
00144 {
00145   delete m_symbolTree;
00146   if(m_useAlignmentInfo)
00147     delete m_alignTree;
00148   for(size_t i = 0; i < m_scoreTrees.size(); i++) {
00149     delete m_scoreTrees[i];
00150     delete m_scoreCounters[i];
00151   }
00152 
00153   delete m_encodedTargetPhrases;
00154   delete m_compressedTargetPhrases;
00155 }
00156 
00157 void PhraseTableCreator::PrintInfo()
00158 {
00159   std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"};
00160 
00161   std::cerr << "Used options:" << std::endl;
00162   std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl;
00163   std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl;
00164   std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
00165   std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
00166   std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl;
00167   if(m_coding == PREnc) {
00168     std::cerr << "\tMaxiumum allowed rank for PREnc: ";
00169     if(!m_maxRank)
00170       std::cerr << "unlimited" << std::endl;
00171     else
00172       std::cerr << m_maxRank << std::endl;
00173   }
00174   std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
00175   std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
00176   std::cerr << "\tUsing score quantization: ";
00177   if(m_quantize)
00178     std::cerr << m_quantize << " best" << std::endl;
00179   else
00180     std::cerr << "no" << std::endl;
00181   std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;
00182 
00183 #ifdef WITH_THREADS
00184   std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
00185 #endif
00186   std::cerr << std::endl;
00187 }
00188 
00189 void PhraseTableCreator::Save()
00190 {
00191   
00192   ThrowingFwrite(&m_coding, sizeof(m_coding), 1, m_outFile);
00193   ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
00194   ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile);
00195   ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile);
00196   ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile);
00197 
00198   if(m_coding == REnc) {
00199     
00200     std::vector<std::string> temp1;
00201     temp1.resize(m_sourceSymbolsMap.size());
00202     for(boost::unordered_map<std::string, unsigned>::iterator it
00203         = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
00204       temp1[it->second] = it->first;
00205     std::sort(temp1.begin(), temp1.end());
00206     StringVector<unsigned char, unsigned, std::allocator> sourceSymbols(true);
00207     for(std::vector<std::string>::iterator it = temp1.begin();
00208         it != temp1.end(); it++)
00209       sourceSymbols.push_back(*it);
00210     sourceSymbols.save(m_outFile);
00211 
00212     
00213     size_t size = m_lexicalTableIndex.size();
00214     ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
00215     ThrowingFwrite(&m_lexicalTableIndex[0], sizeof(size_t), size, m_outFile);
00216     size = m_lexicalTable.size();
00217     ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
00218     ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile);
00219   }
00220 
00221   
00222   std::vector<std::string> temp2;
00223   temp2.resize(m_targetSymbolsMap.size());
00224   for(boost::unordered_map<std::string, unsigned>::iterator it
00225       = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
00226     temp2[it->second] = it->first;
00227   StringVector<unsigned char, unsigned, std::allocator> targetSymbols(true);
00228   for(std::vector<std::string>::iterator it = temp2.begin();
00229       it != temp2.end(); it++)
00230     targetSymbols.push_back(*it);
00231   targetSymbols.save(m_outFile);
00232 
00233   
00234   m_symbolTree->Save(m_outFile);
00235 
00236   
00237   
00238   ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
00239   size_t numScoreTrees = m_scoreTrees.size();
00240   for(size_t i = 0; i < numScoreTrees; i++)
00241     m_scoreTrees[i]->Save(m_outFile);
00242 
00243   
00244   if(m_useAlignmentInfo)
00245     m_alignTree->Save(m_outFile);
00246 
00247   
00248   m_compressedTargetPhrases->save(m_outFile);
00249 }
00250 
00251 void PhraseTableCreator::LoadLexicalTable(std::string filePath)
00252 {
00253   std::vector<SrcTrgProb> t_lexTable;
00254 
00255   std::cerr << "Reading in lexical table for Rank Encoding" << std::endl;
00256   std::ifstream lexIn(filePath.c_str(), std::ifstream::in);
00257   std::string src, trg;
00258   float prob;
00259 
00260   
00261 
00262   std::cerr << "\tLoading from " << filePath << std::endl;
00263   while(lexIn >> trg >> src >> prob) {
00264     t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob));
00265     AddSourceSymbolId(src);
00266     AddTargetSymbolId(trg);
00267   }
00268 
00269   
00270   
00271 
00272   std::cerr << "\tSorting according to translation rank" << std::endl;
00273   std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter());
00274 
00275   
00276 
00277   std::vector<std::string> temp1;
00278   temp1.resize(m_sourceSymbolsMap.size());
00279   for(boost::unordered_map<std::string, unsigned>::iterator it
00280       = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
00281     temp1[it->second] = it->first;
00282 
00283   std::sort(temp1.begin(), temp1.end());
00284 
00285   for(size_t i = 0; i < temp1.size(); i++)
00286     m_sourceSymbolsMap[temp1[i]] = i;
00287 
00288   
00289 
00290   std::string srcWord = "";
00291   size_t srcIdx = 0;
00292   for(std::vector<SrcTrgProb>::iterator it = t_lexTable.begin();
00293       it != t_lexTable.end(); it++) {
00294     
00295     if(it->first.first != srcWord) {
00296       srcIdx = GetSourceSymbolId(it->first.first);
00297 
00298       
00299       if(srcIdx >= m_lexicalTableIndex.size())
00300         m_lexicalTableIndex.resize(srcIdx + 1);
00301       m_lexicalTableIndex[srcIdx] = m_lexicalTable.size();
00302     }
00303 
00304     
00305     size_t trgIdx = GetTargetSymbolId(it->first.second);
00306     m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx));
00307 
00308     srcWord = it->first.first;
00309   }
00310   std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl;
00311   std::cerr << std::endl;
00312 }
00313 
00314 void PhraseTableCreator::CreateRankHash()
00315 {
00316   InputFileStream inFile(m_inPath);
00317 
00318 #ifdef WITH_THREADS
00319   boost::thread_group threads;
00320   for (size_t i = 0; i < m_threads; ++i) {
00321     RankingTask* rt = new RankingTask(inFile, *this);
00322     threads.create_thread(*rt);
00323   }
00324   threads.join_all();
00325 #else
00326   RankingTask* rt = new RankingTask(inFile, *this);
00327   (*rt)();
00328   delete rt;
00329 #endif
00330   FlushRankedQueue(true);
00331 }
00332 
00333 inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
00334 {
00335   return source + " " + m_separator + " ";
00336 }
00337 
00338 inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
00339 {
00340   return source + " " + m_separator + " " + target + " " + m_separator + " ";
00341 }
00342 
00343 void PhraseTableCreator::EncodeTargetPhrases()
00344 {
00345   InputFileStream inFile(m_inPath);
00346 
00347 #ifdef WITH_THREADS
00348   boost::thread_group threads;
00349   for (size_t i = 0; i < m_threads; ++i) {
00350     EncodingTask* et = new EncodingTask(inFile, *this);
00351     threads.create_thread(*et);
00352   }
00353   threads.join_all();
00354 #else
00355   EncodingTask* et = new EncodingTask(inFile, *this);
00356   (*et)();
00357   delete et;
00358 #endif
00359   FlushEncodedQueue(true);
00360 }
00361 
00362 
00363 void PhraseTableCreator::CompressTargetPhrases()
00364 {
00365 #ifdef WITH_THREADS
00366   boost::thread_group threads;
00367   for (size_t i = 0; i < m_threads; ++i) {
00368     CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
00369     threads.create_thread(*ct);
00370   }
00371   threads.join_all();
00372 #else
00373   CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
00374   (*ct)();
00375   delete ct;
00376 #endif
00377   FlushCompressedQueue(true);
00378 }
00379 
00380 void PhraseTableCreator::CalcHuffmanCodes()
00381 {
00382   std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size()
00383             << " target phrase symbols" << std::endl;
00384 
00385   m_symbolTree = new SymbolTree(m_symbolCounter.Begin(),
00386                                 m_symbolCounter.End());
00387 
00388   std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
00389   for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
00390       it != m_scoreCounters.end(); it++) {
00391     if(m_quantize)
00392       (*it)->Quantize(m_quantize);
00393 
00394     std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
00395               << " scores" << std::endl;
00396 
00397     *treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
00398     treeIt++;
00399   }
00400 
00401   if(m_useAlignmentInfo) {
00402     std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size()
00403               << " alignment points" << std::endl;
00404     m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End());
00405   }
00406   std::cerr << std::endl;
00407 }
00408 
00409 
00410 void PhraseTableCreator::AddSourceSymbolId(std::string& symbol)
00411 {
00412 #ifdef WITH_THREADS
00413   boost::mutex::scoped_lock lock(m_mutex);
00414 #endif
00415 
00416   if(m_sourceSymbolsMap.count(symbol) == 0) {
00417     unsigned value = m_sourceSymbolsMap.size();
00418     m_sourceSymbolsMap[symbol] = value;
00419   }
00420 }
00421 
00422 void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
00423 {
00424 #ifdef WITH_THREADS
00425   boost::mutex::scoped_lock lock(m_mutex);
00426 #endif
00427   if(m_targetSymbolsMap.count(symbol) == 0) {
00428     unsigned value = m_targetSymbolsMap.size();
00429     m_targetSymbolsMap[symbol] = value;
00430   }
00431 }
00432 
00433 unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
00434 {
00435 #ifdef WITH_THREADS
00436   boost::mutex::scoped_lock lock(m_mutex);
00437 #endif
00438   boost::unordered_map<std::string, unsigned>::iterator it
00439   = m_sourceSymbolsMap.find(symbol);
00440 
00441   if(it != m_sourceSymbolsMap.end())
00442     return it->second;
00443   else
00444     return m_sourceSymbolsMap.size();
00445 }
00446 
00447 unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
00448 {
00449 #ifdef WITH_THREADS
00450   boost::mutex::scoped_lock lock(m_mutex);
00451 #endif
00452   boost::unordered_map<std::string, unsigned>::iterator it
00453   = m_targetSymbolsMap.find(symbol);
00454 
00455   UTIL_THROW_IF2(it == m_targetSymbolsMap.end(), "No id found for target symbol: " << symbol);
00456   return it->second;
00457 }
00458 
00459 unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
00460 {
00461 #ifdef WITH_THREADS
00462   boost::mutex::scoped_lock lock(m_mutex);
00463 #endif
00464   boost::unordered_map<std::string, unsigned>::iterator it
00465   = m_targetSymbolsMap.find(symbol);
00466 
00467   if(it != m_targetSymbolsMap.end())
00468     return it->second;
00469   else {
00470     unsigned value = m_targetSymbolsMap.size();
00471     m_targetSymbolsMap[symbol] = value;
00472     return value;
00473   }
00474 }
00475 
00476 unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx)
00477 {
00478   size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
00479   while(srcTrgIdx < m_lexicalTable.size()
00480         && srcIdx == m_lexicalTable[srcTrgIdx].first
00481         && m_lexicalTable[srcTrgIdx].second != trgIdx)
00482     srcTrgIdx++;
00483 
00484   if(srcTrgIdx < m_lexicalTable.size()
00485       && m_lexicalTable[srcTrgIdx].second == trgIdx)
00486     return srcTrgIdx - m_lexicalTableIndex[srcIdx];
00487   else
00488     return m_lexicalTable.size();
00489 }
00490 
00491 unsigned PhraseTableCreator::EncodeREncSymbol1(unsigned trgIdx)
00492 {
00493   assert((~(1 << 31)) > trgIdx);
00494   return trgIdx;
00495 }
00496 
00497 unsigned PhraseTableCreator::EncodeREncSymbol2(unsigned pos, unsigned rank)
00498 {
00499   unsigned symbol = rank;
00500   symbol |= 1 << 30;
00501   symbol |= pos << 24;
00502   return symbol;
00503 }
00504 
00505 unsigned PhraseTableCreator::EncodeREncSymbol3(unsigned rank)
00506 {
00507   unsigned symbol = rank;
00508   symbol |= 2 << 30;
00509   return symbol;
00510 }
00511 
00512 unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx)
00513 {
00514   assert((~(1 << 31)) > trgIdx);
00515   return trgIdx;
00516 }
00517 
00518 unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank)
00519 {
00520   
00521   
00522   left  = left  + 32;
00523   right = right + 32;
00524 
00525   assert(64 > left);
00526   assert(64 > right);
00527   assert(524288 > rank);
00528 
00529   unsigned symbol = 0;
00530   symbol |=    1  << 31;
00531   symbol |= left  << 25;
00532   symbol |= right << 19;
00533   symbol |= rank;
00534   return symbol;
00535 }
00536 
00537 void PhraseTableCreator::EncodeTargetPhraseNone(std::vector<std::string>& t,
00538     std::ostream& os)
00539 {
00540   std::stringstream encodedTargetPhrase;
00541   size_t j = 0;
00542   while(j < t.size()) {
00543     unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
00544 
00545     m_symbolCounter.Increase(targetSymbolId);
00546     os.write((char*)&targetSymbolId, sizeof(targetSymbolId));
00547     j++;
00548   }
00549 
00550   unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00551   os.write((char*)&stopSymbolId, sizeof(stopSymbolId));
00552   m_symbolCounter.Increase(stopSymbolId);
00553 }
00554 
00555 void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector<std::string>& s,
00556     std::vector<std::string>& t,
00557     std::set<AlignPoint>& a,
00558     std::ostream& os)
00559 {
00560   std::stringstream encodedTargetPhrase;
00561 
00562   std::vector<std::vector<size_t> > a2(t.size());
00563   for(std::set<AlignPoint>::iterator it = a.begin(); it != a.end(); it++)
00564     a2[it->second].push_back(it->first);
00565 
00566   for(size_t i = 0; i < t.size(); i++) {
00567     unsigned idxTarget = GetOrAddTargetSymbolId(t[i]);
00568     unsigned encodedSymbol = -1;
00569 
00570     unsigned bestSrcPos = s.size();
00571     unsigned bestDiff = s.size();
00572     unsigned bestRank = m_lexicalTable.size();
00573     unsigned badRank = m_lexicalTable.size();
00574 
00575     for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++) {
00576       unsigned idxSource = GetSourceSymbolId(s[*it]);
00577       size_t r = GetRank(idxSource, idxTarget);
00578       if(r != badRank) {
00579         if(r < bestRank) {
00580           bestRank = r;
00581           bestSrcPos = *it;
00582           bestDiff = abs(*it-i);
00583         } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) {
00584           bestSrcPos = *it;
00585           bestDiff = abs(*it-i);
00586         }
00587       }
00588     }
00589 
00590     if(bestRank != badRank && bestSrcPos < s.size()) {
00591       if(bestSrcPos == i)
00592         encodedSymbol = EncodeREncSymbol3(bestRank);
00593       else
00594         encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
00595       a.erase(AlignPoint(bestSrcPos, i));
00596     } else {
00597       encodedSymbol = EncodeREncSymbol1(idxTarget);
00598     }
00599 
00600     os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00601     m_symbolCounter.Increase(encodedSymbol);
00602   }
00603 
00604   unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00605   unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId);
00606   os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00607   m_symbolCounter.Increase(encodedSymbol);
00608 }
00609 
00610 void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector<std::string>& s,
00611     std::vector<std::string>& t,
00612     std::set<AlignPoint>& a,
00613     size_t ownRank,
00614     std::ostream& os)
00615 {
00616   std::vector<unsigned> encodedSymbols(t.size());
00617   std::vector<unsigned> encodedSymbolsLengths(t.size(), 0);
00618 
00619   ConsistentPhrases cp(s.size(), t.size(), a);
00620   while(!cp.Empty()) {
00621     ConsistentPhrases::Phrase p = cp.Pop();
00622 
00623     std::stringstream key1;
00624     key1 << s[p.i];
00625     for(int i = p.i+1; i < p.i+p.m; i++)
00626       key1 << " " << s[i];
00627 
00628     std::stringstream key2;
00629     key2 << t[p.j];
00630     for(int i = p.j+1; i < p.j+p.n; i++)
00631       key2 << " " << t[i];
00632 
00633     int rank = -1;
00634     std::string key1Str = key1.str(), key2Str = key2.str();
00635     size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)];
00636     if(idx != m_rnkHash.GetSize())
00637       rank = m_ranks[idx];
00638 
00639     if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) {
00640       if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) {
00641         std::stringstream encodedSymbol;
00642         encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank);
00643         encodedSymbolsLengths[p.j] = p.n;
00644 
00645         std::set<AlignPoint> tAlignment;
00646         for(std::set<AlignPoint>::iterator it = a.begin();
00647             it != a.end(); it++)
00648           if(it->first < p.i || it->first >= p.i + p.m
00649               || it->second < p.j || it->second >= p.j + p.n)
00650             tAlignment.insert(*it);
00651         a = tAlignment;
00652         cp.RemoveOverlap(p);
00653       }
00654     }
00655   }
00656 
00657   std::stringstream encodedTargetPhrase;
00658 
00659   size_t j = 0;
00660   while(j < t.size()) {
00661     if(encodedSymbolsLengths[j] > 0) {
00662       unsigned encodedSymbol = encodedSymbols[j];
00663       m_symbolCounter.Increase(encodedSymbol);
00664       os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00665       j += encodedSymbolsLengths[j];
00666     } else {
00667       unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
00668       unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId);
00669       m_symbolCounter.Increase(encodedSymbol);
00670       os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00671       j++;
00672     }
00673   }
00674 
00675   unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00676   unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId);
00677   os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00678   m_symbolCounter.Increase(encodedSymbol);
00679 }
00680 
00681 void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream& os)
00682 {
00683   size_t c = 0;
00684   float score;
00685 
00686   while(c < scores.size()) {
00687     score = scores[c];
00688     score = FloorScore(TransformScore(score));
00689     os.write((char*)&score, sizeof(score));
00690     m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
00691     c++;
00692   }
00693 }
00694 
00695 void PhraseTableCreator::EncodeAlignment(std::set<AlignPoint>& alignment,
00696     std::ostream& os)
00697 {
00698   for(std::set<AlignPoint>::iterator it = alignment.begin();
00699       it != alignment.end(); it++) {
00700     os.write((char*)&(*it), sizeof(AlignPoint));
00701     m_alignCounter.Increase(*it);
00702   }
00703   AlignPoint stop(-1, -1);
00704   os.write((char*) &stop, sizeof(AlignPoint));
00705   m_alignCounter.Increase(stop);
00706 }
00707 
00708 std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, size_t ownRank)
00709 {
00710   std::string sourcePhraseStr = tokens[0];
00711   std::string targetPhraseStr = tokens[1];
00712   std::string scoresStr = tokens[2];
00713 
00714   std::string alignmentStr = "";
00715   if(tokens.size() > 3)
00716     alignmentStr = tokens[3];
00717 
00718   std::vector<std::string> s = Tokenize(sourcePhraseStr);
00719 
00720   size_t phraseLength = s.size();
00721   if(m_maxPhraseLength < phraseLength)
00722     m_maxPhraseLength = phraseLength;
00723 
00724   std::vector<std::string> t = Tokenize(targetPhraseStr);
00725   std::vector<float> scores = Tokenize<float>(scoresStr);
00726 
00727   if(scores.size() != m_numScoreComponent) {
00728     std::stringstream strme;
00729     strme << "Error: Wrong number of scores detected ("
00730           << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
00731     strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[2] << " ..." << std::endl;
00732     UTIL_THROW2(strme.str());
00733   }
00734 
00735   std::set<AlignPoint> a;
00736   if(m_coding != None || m_useAlignmentInfo) {
00737     std::vector<size_t> positions = Tokenize<size_t>(alignmentStr, " \t-");
00738     for(size_t i = 0; i < positions.size(); i += 2) {
00739       a.insert(AlignPoint(positions[i], positions[i+1]));
00740     }
00741   }
00742 
00743   std::stringstream encodedTargetPhrase;
00744 
00745   if(m_coding == PREnc) {
00746     EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase);
00747   } else if(m_coding == REnc) {
00748     EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase);
00749   } else {
00750     EncodeTargetPhraseNone(t, encodedTargetPhrase);
00751   }
00752 
00753   EncodeScores(scores, encodedTargetPhrase);
00754 
00755   if(m_useAlignmentInfo)
00756     EncodeAlignment(a, encodedTargetPhrase);
00757 
00758   return encodedTargetPhrase.str();
00759 }
00760 
00761 std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection)
00762 {
00763   enum EncodeState {
00764     ReadSymbol, ReadScore, ReadAlignment,
00765     EncodeSymbol, EncodeScore, EncodeAlignment
00766   };
00767   EncodeState state = ReadSymbol;
00768 
00769   unsigned phraseStopSymbolId;
00770   if(m_coding == REnc)
00771     phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
00772   else if(m_coding == PREnc)
00773     phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
00774   else
00775     phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00776   AlignPoint alignStopSymbol(-1, -1);
00777 
00778   std::stringstream encodedStream(encodedCollection);
00779   encodedStream.unsetf(std::ios::skipws);
00780 
00781   std::string compressedEncodedCollection;
00782   BitWrapper<> bitStream(compressedEncodedCollection);
00783 
00784   unsigned symbol;
00785   float score;
00786   size_t currScore = 0;
00787   AlignPoint alignPoint;
00788 
00789   while(encodedStream) {
00790     switch(state) {
00791     case ReadSymbol:
00792       encodedStream.read((char*) &symbol, sizeof(unsigned));
00793       state = EncodeSymbol;
00794       break;
00795     case ReadScore:
00796       if(currScore == m_numScoreComponent) {
00797         currScore = 0;
00798         if(m_useAlignmentInfo)
00799           state = ReadAlignment;
00800         else
00801           state = ReadSymbol;
00802       } else {
00803         encodedStream.read((char*) &score, sizeof(float));
00804         currScore++;
00805         state = EncodeScore;
00806       }
00807       break;
00808     case ReadAlignment:
00809       encodedStream.read((char*) &alignPoint, sizeof(AlignPoint));
00810       state = EncodeAlignment;
00811       break;
00812 
00813     case EncodeSymbol:
00814       state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol;
00815       m_symbolTree->Put(bitStream, symbol);
00816       break;
00817     case EncodeScore: {
00818       state = ReadScore;
00819       size_t idx = m_multipleScoreTrees ? currScore-1 : 0;
00820       if(m_quantize)
00821         score = m_scoreCounters[idx]->LowerBound(score);
00822       m_scoreTrees[idx]->Put(bitStream, score);
00823     }
00824     break;
00825     case EncodeAlignment:
00826       state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment;
00827       m_alignTree->Put(bitStream, alignPoint);
00828       break;
00829     }
00830   }
00831 
00832   return compressedEncodedCollection;
00833 }
00834 
00835 void PhraseTableCreator::AddRankedLine(PackedItem& pi)
00836 {
00837   m_queue.push(pi);
00838 }
00839 
00840 void PhraseTableCreator::FlushRankedQueue(bool force)
00841 {
00842   size_t step = 1ul << 10;
00843 
00844   while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00845     m_lastFlushedLine++;
00846 
00847     PackedItem pi = m_queue.top();
00848     m_queue.pop();
00849 
00850     if(m_lastSourceRange.size() == step) {
00851       m_rnkHash.AddRange(m_lastSourceRange);
00852       m_lastSourceRange.clear();
00853     }
00854 
00855     if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
00856       if(m_rankQueue.size()) {
00857         m_lastFlushedSourceNum++;
00858         if(m_lastFlushedSourceNum % 100000 == 0) {
00859           std::cerr << ".";
00860         }
00861         if(m_lastFlushedSourceNum % 5000000 == 0) {
00862           std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
00863         }
00864 
00865         m_ranks.resize(m_lastFlushedLine + 1);
00866         int r = 0;
00867         while(!m_rankQueue.empty()) {
00868           m_ranks[m_rankQueue.top().second] = r++;
00869           m_rankQueue.pop();
00870         }
00871       }
00872     }
00873 
00874     m_lastSourceRange.push_back(pi.GetTrg());
00875 
00876     m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine()));
00877     m_lastFlushedSourcePhrase = pi.GetSrc();
00878   }
00879 
00880   if(force) {
00881     if(!m_lastSourceRange.empty()) {
00882       m_rnkHash.AddRange(m_lastSourceRange);
00883       m_lastSourceRange.clear();
00884     }
00885 
00886 #ifdef WITH_THREADS
00887     m_rnkHash.WaitAll();
00888 #endif
00889 
00890     m_ranks.resize(m_lastFlushedLine + 1);
00891     int r = 0;
00892     while(!m_rankQueue.empty()) {
00893       m_ranks[m_rankQueue.top().second] = r++;
00894       m_rankQueue.pop();
00895     }
00896 
00897     m_lastFlushedLine = -1;
00898     m_lastFlushedSourceNum = 0;
00899 
00900     std::cerr << std::endl << std::endl;
00901   }
00902 }
00903 
00904 
00905 void PhraseTableCreator::AddEncodedLine(PackedItem& pi)
00906 {
00907   m_queue.push(pi);
00908 }
00909 
00910 void PhraseTableCreator::FlushEncodedQueue(bool force)
00911 {
00912   while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00913     PackedItem pi = m_queue.top();
00914     m_queue.pop();
00915     m_lastFlushedLine++;
00916 
00917     if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
00918       if(m_lastCollection.size()) {
00919         std::stringstream targetPhraseCollection;
00920         for(std::vector<std::string>::iterator it =
00921               m_lastCollection.begin(); it != m_lastCollection.end(); it++)
00922           targetPhraseCollection << *it;
00923 
00924         m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
00925         m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
00926 
00927         m_lastFlushedSourceNum++;
00928         if(m_lastFlushedSourceNum % 100000 == 0)
00929           std::cerr << ".";
00930         if(m_lastFlushedSourceNum % 5000000 == 0)
00931           std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
00932 
00933         m_lastCollection.clear();
00934       }
00935     }
00936 
00937     if(m_lastSourceRange.size() == (1ul << m_orderBits)) {
00938       m_srcHash.AddRange(m_lastSourceRange);
00939       m_srcHash.SaveLastRange();
00940       m_srcHash.DropLastRange();
00941       m_lastSourceRange.clear();
00942     }
00943 
00944     m_lastFlushedSourcePhrase = pi.GetSrc();
00945     if(m_coding == PREnc) {
00946       if(m_lastCollection.size() <= pi.GetRank())
00947         m_lastCollection.resize(pi.GetRank() + 1);
00948       m_lastCollection[pi.GetRank()] = pi.GetTrg();
00949     } else {
00950       m_lastCollection.push_back(pi.GetTrg());
00951     }
00952   }
00953 
00954   if(force) {
00955     if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase)
00956       m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
00957 
00958     if(m_lastCollection.size()) {
00959       std::stringstream targetPhraseCollection;
00960       for(std::vector<std::string>::iterator it =
00961             m_lastCollection.begin(); it != m_lastCollection.end(); it++)
00962         targetPhraseCollection << *it;
00963 
00964       m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
00965       m_lastCollection.clear();
00966     }
00967 
00968     if(!m_lastSourceRange.empty()) {
00969       m_srcHash.AddRange(m_lastSourceRange);
00970       m_lastSourceRange.clear();
00971     }
00972 
00973 #ifdef WITH_THREADS
00974     m_srcHash.WaitAll();
00975 #endif
00976 
00977     m_srcHash.SaveLastRange();
00978     m_srcHash.DropLastRange();
00979     m_srcHash.FinalizeSave();
00980 
00981     m_lastFlushedLine = -1;
00982     m_lastFlushedSourceNum = 0;
00983 
00984     std::cerr << std::endl << std::endl;
00985   }
00986 }
00987 
00988 void PhraseTableCreator::AddCompressedCollection(PackedItem& pi)
00989 {
00990   m_queue.push(pi);
00991 }
00992 
00993 void PhraseTableCreator::FlushCompressedQueue(bool force)
00994 {
00995   if(force || m_queue.size() > 10000) {
00996     while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00997       PackedItem pi = m_queue.top();
00998       m_queue.pop();
00999       m_lastFlushedLine++;
01000 
01001       m_compressedTargetPhrases->push_back(pi.GetTrg());
01002 
01003       if((pi.GetLine()+1) % 100000 == 0)
01004         std::cerr << ".";
01005       if((pi.GetLine()+1) % 5000000 == 0)
01006         std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
01007     }
01008   }
01009 
01010   if(force) {
01011     m_lastFlushedLine = -1;
01012     std::cerr << std::endl << std::endl;
01013   }
01014 }
01015 
01016 
01017 
01018 size_t RankingTask::m_lineNum = 0;
01019 #ifdef WITH_THREADS
01020 boost::mutex RankingTask::m_mutex;
01021 boost::mutex RankingTask::m_fileMutex;
01022 #endif
01023 
01024 RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator)
01025   : m_inFile(inFile), m_creator(creator) {}
01026 
01027 void RankingTask::operator()()
01028 {
01029   size_t lineNum = 0;
01030 
01031   std::vector<std::string> lines;
01032   size_t max_lines = 1000;
01033   lines.reserve(max_lines);
01034 
01035   {
01036 #ifdef WITH_THREADS
01037     boost::mutex::scoped_lock lock(m_fileMutex);
01038 #endif
01039     std::string line;
01040     while(lines.size() < max_lines && std::getline(m_inFile, line))
01041       lines.push_back(line);
01042     lineNum = m_lineNum;
01043     m_lineNum += lines.size();
01044   }
01045 
01046   std::vector<PackedItem> result;
01047   result.reserve(max_lines);
01048 
01049   while(lines.size()) {
01050     for(size_t i = 0; i < lines.size(); i++) {
01051       std::vector<std::string> tokens;
01052       Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
01053 
01054       for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
01055         *it = Moses::Trim(*it);
01056 
01057       if(tokens.size() < 4) {
01058         std::stringstream strme;
01059         strme << "Error: It seems the following line has a wrong format:" << std::endl;
01060         strme << "Line " << i << ": " << lines[i] << std::endl;
01061         UTIL_THROW2(strme.str());
01062       }
01063 
01064       if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
01065         std::stringstream strme;
01066         strme << "Error: It seems the following line contains no alignment information, " << std::endl;
01067         strme << "but you are using ";
01068         strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
01069         strme << " encoding which makes use of alignment data. " << std::endl;
01070         strme << "Use -encoding None" << std::endl;
01071         strme << "Line " << i << ": " << lines[i] << std::endl;
01072         UTIL_THROW2(strme.str());
01073       }
01074 
01075       std::vector<float> scores = Tokenize<float>(tokens[2]);
01076       if(scores.size() != m_creator.m_numScoreComponent) {
01077         std::stringstream strme;
01078         strme << "Error: It seems the following line has a wrong number of scores ("
01079               << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
01080         strme << "Line " << i << ": " << lines[i] << std::endl;
01081         UTIL_THROW2(strme.str());
01082       }
01083 
01084       float sortScore = scores[m_creator.m_sortScoreIndex];
01085 
01086       std::string key1 = m_creator.MakeSourceKey(tokens[0]);
01087       std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]);
01088 
01089       PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore);
01090       result.push_back(packedItem);
01091     }
01092     lines.clear();
01093 
01094     {
01095 #ifdef WITH_THREADS
01096       boost::mutex::scoped_lock lock(m_mutex);
01097 #endif
01098       for(size_t i = 0; i < result.size(); i++)
01099         m_creator.AddRankedLine(result[i]);
01100       m_creator.FlushRankedQueue();
01101     }
01102 
01103     result.clear();
01104     lines.reserve(max_lines);
01105     result.reserve(max_lines);
01106 
01107 #ifdef WITH_THREADS
01108     boost::mutex::scoped_lock lock(m_fileMutex);
01109 #endif
01110     std::string line;
01111     while(lines.size() < max_lines && std::getline(m_inFile, line))
01112       lines.push_back(line);
01113     lineNum = m_lineNum;
01114     m_lineNum += lines.size();
01115   }
01116 }
01117 
01118 size_t EncodingTask::m_lineNum = 0;
01119 #ifdef WITH_THREADS
01120 boost::mutex EncodingTask::m_mutex;
01121 boost::mutex EncodingTask::m_fileMutex;
01122 #endif
01123 
01124 EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator)
01125   : m_inFile(inFile), m_creator(creator) {}
01126 
01127 void EncodingTask::operator()()
01128 {
01129   size_t lineNum = 0;
01130 
01131   std::vector<std::string> lines;
01132   size_t max_lines = 1000;
01133   lines.reserve(max_lines);
01134 
01135   {
01136 #ifdef WITH_THREADS
01137     boost::mutex::scoped_lock lock(m_fileMutex);
01138 #endif
01139     std::string line;
01140     while(lines.size() < max_lines && std::getline(m_inFile, line))
01141       lines.push_back(line);
01142     lineNum = m_lineNum;
01143     m_lineNum += lines.size();
01144   }
01145 
01146   std::vector<PackedItem> result;
01147   result.reserve(max_lines);
01148 
01149   while(lines.size()) {
01150     for(size_t i = 0; i < lines.size(); i++) {
01151       std::vector<std::string> tokens;
01152       Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
01153 
01154       for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
01155         *it = Moses::Trim(*it);
01156 
01157       if(tokens.size() < 3) {
01158         std::stringstream strme;
01159         strme << "Error: It seems the following line has a wrong format:" << std::endl;
01160         strme << "Line " << i << ": " << lines[i] << std::endl;
01161         UTIL_THROW2(strme.str());
01162       }
01163 
01164       if(tokens.size() > 3 && tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
01165         std::stringstream strme;
01166         strme << "Error: It seems the following line contains no alignment information, " << std::endl;
01167         strme << "but you are using ";
01168         strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
01169         strme << " encoding which makes use of alignment data. " << std::endl;
01170         strme << "Use -encoding None" << std::endl;
01171         strme << "Line " << i << ": " << lines[i] << std::endl;
01172         UTIL_THROW2(strme.str());
01173       }
01174 
01175       size_t ownRank = 0;
01176       if(m_creator.m_coding == PhraseTableCreator::PREnc)
01177         ownRank = m_creator.m_ranks[lineNum + i];
01178 
01179       std::string encodedLine = m_creator.EncodeLine(tokens, ownRank);
01180 
01181       PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank);
01182       result.push_back(packedItem);
01183     }
01184     lines.clear();
01185 
01186     {
01187 #ifdef WITH_THREADS
01188       boost::mutex::scoped_lock lock(m_mutex);
01189 #endif
01190       for(size_t i = 0; i < result.size(); i++)
01191         m_creator.AddEncodedLine(result[i]);
01192       m_creator.FlushEncodedQueue();
01193     }
01194 
01195     result.clear();
01196     lines.reserve(max_lines);
01197     result.reserve(max_lines);
01198 
01199 #ifdef WITH_THREADS
01200     boost::mutex::scoped_lock lock(m_fileMutex);
01201 #endif
01202     std::string line;
01203     while(lines.size() < max_lines && std::getline(m_inFile, line))
01204       lines.push_back(line);
01205     lineNum = m_lineNum;
01206     m_lineNum += lines.size();
01207   }
01208 }
01209 
01210 
01211 
01212 size_t CompressionTask::m_collectionNum = 0;
01213 #ifdef WITH_THREADS
01214 boost::mutex CompressionTask::m_mutex;
01215 #endif
01216 
01217 CompressionTask::CompressionTask(StringVectorTemp<unsigned char, unsigned long,
01218                                  MmapAllocator>& encodedCollections,
01219                                  PhraseTableCreator& creator)
01220   : m_encodedCollections(encodedCollections), m_creator(creator) {}
01221 
01222 void CompressionTask::operator()()
01223 {
01224   size_t collectionNum;
01225   {
01226 #ifdef WITH_THREADS
01227     boost::mutex::scoped_lock lock(m_mutex);
01228 #endif
01229     collectionNum = m_collectionNum;
01230     m_collectionNum++;
01231   }
01232 
01233   while(collectionNum < m_encodedCollections.size()) {
01234     std::string collection = m_encodedCollections[collectionNum];
01235     std::string compressedCollection
01236     = m_creator.CompressEncodedCollection(collection);
01237 
01238     std::string dummy;
01239     PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);
01240 
01241 #ifdef WITH_THREADS
01242     boost::mutex::scoped_lock lock(m_mutex);
01243 #endif
01244     m_creator.AddCompressedCollection(packedItem);
01245     m_creator.FlushCompressedQueue();
01246 
01247     collectionNum = m_collectionNum;
01248     m_collectionNum++;
01249   }
01250 }
01251 
01252 
01253 
01254 PackedItem::PackedItem(long line, std::string sourcePhrase,
01255                        std::string packedTargetPhrase, size_t rank,
01256                        float score)
01257   : m_line(line), m_sourcePhrase(sourcePhrase),
01258     m_packedTargetPhrase(packedTargetPhrase), m_rank(rank),
01259     m_score(score) {}
01260 
01261 long PackedItem::GetLine() const
01262 {
01263   return m_line;
01264 }
01265 
01266 const std::string& PackedItem::GetSrc() const
01267 {
01268   return m_sourcePhrase;
01269 }
01270 
01271 const std::string& PackedItem::GetTrg() const
01272 {
01273   return m_packedTargetPhrase;
01274 }
01275 
01276 size_t PackedItem::GetRank() const
01277 {
01278   return m_rank;
01279 }
01280 
01281 float PackedItem::GetScore() const
01282 {
01283   return m_score;
01284 }
01285 
01286 }