00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <cstdio>
00023
00024 #include "PhraseTableCreator.h"
00025 #include "ConsistentPhrases.h"
00026 #include "ThrowingFwrite.h"
00027 #include "util/file.hh"
00028 #include "util/exception.hh"
00029
00030 namespace Moses
00031 {
00032
00033 bool operator<(const PackedItem &pi1, const PackedItem &pi2)
00034 {
00035 if(pi1.GetLine() < pi2.GetLine())
00036 return false;
00037 return true;
00038 }
00039
00040 std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
00041 std::string PhraseTableCreator::m_separator = "|||";
00042
00043 PhraseTableCreator::PhraseTableCreator(std::string inPath,
00044 std::string outPath,
00045 std::string tempfilePath,
00046 size_t numScoreComponent,
00047 size_t sortScoreIndex,
00048 Coding coding,
00049 size_t orderBits,
00050 size_t fingerPrintBits,
00051 bool useAlignmentInfo,
00052 bool multipleScoreTrees,
00053 size_t quantize,
00054 size_t maxRank,
00055 bool warnMe
00056 #ifdef WITH_THREADS
00057 , size_t threads
00058 #endif
00059 )
00060 : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
00061 m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
00062 m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
00063 m_coding(coding), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
00064 m_useAlignmentInfo(useAlignmentInfo),
00065 m_multipleScoreTrees(multipleScoreTrees),
00066 m_quantize(quantize), m_maxRank(maxRank),
00067 #ifdef WITH_THREADS
00068 m_threads(threads),
00069 m_srcHash(m_orderBits, m_fingerPrintBits, 1),
00070 m_rnkHash(10, 24, m_threads),
00071 #else
00072 m_srcHash(m_orderBits, m_fingerPrintBits),
00073 m_rnkHash(m_orderBits, m_fingerPrintBits),
00074 #endif
00075 m_maxPhraseLength(0),
00076 m_lastFlushedLine(-1), m_lastFlushedSourceNum(0),
00077 m_lastFlushedSourcePhrase("")
00078 {
00079 PrintInfo();
00080
00081 AddTargetSymbolId(m_phraseStopSymbol);
00082
00083 size_t cur_pass = 1;
00084 size_t all_passes = 2;
00085 if(m_coding == PREnc)
00086 all_passes = 3;
00087
00088 m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
00089 for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
00090 it != m_scoreCounters.end(); it++)
00091 *it = new ScoreCounter();
00092 m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
00093
00094
00095 if(m_coding == REnc) {
00096 size_t found = inPath.find_last_of("/\\");
00097 std::string path;
00098 if(found != std::string::npos)
00099 path = inPath.substr(0, found);
00100 else
00101 path = ".";
00102 LoadLexicalTable(path + "/lex.f2e");
00103 } else if(m_coding == PREnc) {
00104 std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl;
00105 cur_pass++;
00106 CreateRankHash();
00107 }
00108
00109
00110 std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
00111 m_srcHash.BeginSave(m_outFile);
00112
00113 if(tempfilePath.size()) {
00114 MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
00115 m_encodedTargetPhrases = new StringVectorTemp<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
00116 } else {
00117 m_encodedTargetPhrases = new StringVectorTemp<unsigned char, unsigned long, MmapAllocator>();
00118 }
00119 EncodeTargetPhrases();
00120
00121 cur_pass++;
00122
00123 std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
00124 CalcHuffmanCodes();
00125
00126
00127 std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;
00128
00129 if(tempfilePath.size()) {
00130 MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
00131 m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
00132 } else {
00133 m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
00134 }
00135 CompressTargetPhrases();
00136
00137 std::cerr << "Saving to " << m_outPath << std::endl;
00138 Save();
00139 std::cerr << "Done" << std::endl;
00140 std::fclose(m_outFile);
00141 }
00142
00143 PhraseTableCreator::~PhraseTableCreator()
00144 {
00145 delete m_symbolTree;
00146 if(m_useAlignmentInfo)
00147 delete m_alignTree;
00148 for(size_t i = 0; i < m_scoreTrees.size(); i++) {
00149 delete m_scoreTrees[i];
00150 delete m_scoreCounters[i];
00151 }
00152
00153 delete m_encodedTargetPhrases;
00154 delete m_compressedTargetPhrases;
00155 }
00156
00157 void PhraseTableCreator::PrintInfo()
00158 {
00159 std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"};
00160
00161 std::cerr << "Used options:" << std::endl;
00162 std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl;
00163 std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl;
00164 std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
00165 std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
00166 std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl;
00167 if(m_coding == PREnc) {
00168 std::cerr << "\tMaxiumum allowed rank for PREnc: ";
00169 if(!m_maxRank)
00170 std::cerr << "unlimited" << std::endl;
00171 else
00172 std::cerr << m_maxRank << std::endl;
00173 }
00174 std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
00175 std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
00176 std::cerr << "\tUsing score quantization: ";
00177 if(m_quantize)
00178 std::cerr << m_quantize << " best" << std::endl;
00179 else
00180 std::cerr << "no" << std::endl;
00181 std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;
00182
00183 #ifdef WITH_THREADS
00184 std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
00185 #endif
00186 std::cerr << std::endl;
00187 }
00188
00189 void PhraseTableCreator::Save()
00190 {
00191
00192 ThrowingFwrite(&m_coding, sizeof(m_coding), 1, m_outFile);
00193 ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
00194 ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile);
00195 ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile);
00196 ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile);
00197
00198 if(m_coding == REnc) {
00199
00200 std::vector<std::string> temp1;
00201 temp1.resize(m_sourceSymbolsMap.size());
00202 for(boost::unordered_map<std::string, unsigned>::iterator it
00203 = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
00204 temp1[it->second] = it->first;
00205 std::sort(temp1.begin(), temp1.end());
00206 StringVector<unsigned char, unsigned, std::allocator> sourceSymbols(true);
00207 for(std::vector<std::string>::iterator it = temp1.begin();
00208 it != temp1.end(); it++)
00209 sourceSymbols.push_back(*it);
00210 sourceSymbols.save(m_outFile);
00211
00212
00213 size_t size = m_lexicalTableIndex.size();
00214 ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
00215 ThrowingFwrite(&m_lexicalTableIndex[0], sizeof(size_t), size, m_outFile);
00216 size = m_lexicalTable.size();
00217 ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
00218 ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile);
00219 }
00220
00221
00222 std::vector<std::string> temp2;
00223 temp2.resize(m_targetSymbolsMap.size());
00224 for(boost::unordered_map<std::string, unsigned>::iterator it
00225 = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
00226 temp2[it->second] = it->first;
00227 StringVector<unsigned char, unsigned, std::allocator> targetSymbols(true);
00228 for(std::vector<std::string>::iterator it = temp2.begin();
00229 it != temp2.end(); it++)
00230 targetSymbols.push_back(*it);
00231 targetSymbols.save(m_outFile);
00232
00233
00234 m_symbolTree->Save(m_outFile);
00235
00236
00237
00238 ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
00239 size_t numScoreTrees = m_scoreTrees.size();
00240 for(size_t i = 0; i < numScoreTrees; i++)
00241 m_scoreTrees[i]->Save(m_outFile);
00242
00243
00244 if(m_useAlignmentInfo)
00245 m_alignTree->Save(m_outFile);
00246
00247
00248 m_compressedTargetPhrases->save(m_outFile);
00249 }
00250
00251 void PhraseTableCreator::LoadLexicalTable(std::string filePath)
00252 {
00253 std::vector<SrcTrgProb> t_lexTable;
00254
00255 std::cerr << "Reading in lexical table for Rank Encoding" << std::endl;
00256 std::ifstream lexIn(filePath.c_str(), std::ifstream::in);
00257 std::string src, trg;
00258 float prob;
00259
00260
00261
00262 std::cerr << "\tLoading from " << filePath << std::endl;
00263 while(lexIn >> trg >> src >> prob) {
00264 t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob));
00265 AddSourceSymbolId(src);
00266 AddTargetSymbolId(trg);
00267 }
00268
00269
00270
00271
00272 std::cerr << "\tSorting according to translation rank" << std::endl;
00273 std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter());
00274
00275
00276
00277 std::vector<std::string> temp1;
00278 temp1.resize(m_sourceSymbolsMap.size());
00279 for(boost::unordered_map<std::string, unsigned>::iterator it
00280 = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
00281 temp1[it->second] = it->first;
00282
00283 std::sort(temp1.begin(), temp1.end());
00284
00285 for(size_t i = 0; i < temp1.size(); i++)
00286 m_sourceSymbolsMap[temp1[i]] = i;
00287
00288
00289
00290 std::string srcWord = "";
00291 size_t srcIdx = 0;
00292 for(std::vector<SrcTrgProb>::iterator it = t_lexTable.begin();
00293 it != t_lexTable.end(); it++) {
00294
00295 if(it->first.first != srcWord) {
00296 srcIdx = GetSourceSymbolId(it->first.first);
00297
00298
00299 if(srcIdx >= m_lexicalTableIndex.size())
00300 m_lexicalTableIndex.resize(srcIdx + 1);
00301 m_lexicalTableIndex[srcIdx] = m_lexicalTable.size();
00302 }
00303
00304
00305 size_t trgIdx = GetTargetSymbolId(it->first.second);
00306 m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx));
00307
00308 srcWord = it->first.first;
00309 }
00310 std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl;
00311 std::cerr << std::endl;
00312 }
00313
00314 void PhraseTableCreator::CreateRankHash()
00315 {
00316 InputFileStream inFile(m_inPath);
00317
00318 #ifdef WITH_THREADS
00319 boost::thread_group threads;
00320 for (size_t i = 0; i < m_threads; ++i) {
00321 RankingTask* rt = new RankingTask(inFile, *this);
00322 threads.create_thread(*rt);
00323 }
00324 threads.join_all();
00325 #else
00326 RankingTask* rt = new RankingTask(inFile, *this);
00327 (*rt)();
00328 delete rt;
00329 #endif
00330 FlushRankedQueue(true);
00331 }
00332
00333 inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
00334 {
00335 return source + " " + m_separator + " ";
00336 }
00337
00338 inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
00339 {
00340 return source + " " + m_separator + " " + target + " " + m_separator + " ";
00341 }
00342
00343 void PhraseTableCreator::EncodeTargetPhrases()
00344 {
00345 InputFileStream inFile(m_inPath);
00346
00347 #ifdef WITH_THREADS
00348 boost::thread_group threads;
00349 for (size_t i = 0; i < m_threads; ++i) {
00350 EncodingTask* et = new EncodingTask(inFile, *this);
00351 threads.create_thread(*et);
00352 }
00353 threads.join_all();
00354 #else
00355 EncodingTask* et = new EncodingTask(inFile, *this);
00356 (*et)();
00357 delete et;
00358 #endif
00359 FlushEncodedQueue(true);
00360 }
00361
00362
00363 void PhraseTableCreator::CompressTargetPhrases()
00364 {
00365 #ifdef WITH_THREADS
00366 boost::thread_group threads;
00367 for (size_t i = 0; i < m_threads; ++i) {
00368 CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
00369 threads.create_thread(*ct);
00370 }
00371 threads.join_all();
00372 #else
00373 CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
00374 (*ct)();
00375 delete ct;
00376 #endif
00377 FlushCompressedQueue(true);
00378 }
00379
00380 void PhraseTableCreator::CalcHuffmanCodes()
00381 {
00382 std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size()
00383 << " target phrase symbols" << std::endl;
00384
00385 m_symbolTree = new SymbolTree(m_symbolCounter.Begin(),
00386 m_symbolCounter.End());
00387
00388 std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
00389 for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
00390 it != m_scoreCounters.end(); it++) {
00391 if(m_quantize)
00392 (*it)->Quantize(m_quantize);
00393
00394 std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
00395 << " scores" << std::endl;
00396
00397 *treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
00398 treeIt++;
00399 }
00400
00401 if(m_useAlignmentInfo) {
00402 std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size()
00403 << " alignment points" << std::endl;
00404 m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End());
00405 }
00406 std::cerr << std::endl;
00407 }
00408
00409
00410 void PhraseTableCreator::AddSourceSymbolId(std::string& symbol)
00411 {
00412 #ifdef WITH_THREADS
00413 boost::mutex::scoped_lock lock(m_mutex);
00414 #endif
00415
00416 if(m_sourceSymbolsMap.count(symbol) == 0) {
00417 unsigned value = m_sourceSymbolsMap.size();
00418 m_sourceSymbolsMap[symbol] = value;
00419 }
00420 }
00421
00422 void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
00423 {
00424 #ifdef WITH_THREADS
00425 boost::mutex::scoped_lock lock(m_mutex);
00426 #endif
00427 if(m_targetSymbolsMap.count(symbol) == 0) {
00428 unsigned value = m_targetSymbolsMap.size();
00429 m_targetSymbolsMap[symbol] = value;
00430 }
00431 }
00432
00433 unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
00434 {
00435 #ifdef WITH_THREADS
00436 boost::mutex::scoped_lock lock(m_mutex);
00437 #endif
00438 boost::unordered_map<std::string, unsigned>::iterator it
00439 = m_sourceSymbolsMap.find(symbol);
00440
00441 if(it != m_sourceSymbolsMap.end())
00442 return it->second;
00443 else
00444 return m_sourceSymbolsMap.size();
00445 }
00446
00447 unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
00448 {
00449 #ifdef WITH_THREADS
00450 boost::mutex::scoped_lock lock(m_mutex);
00451 #endif
00452 boost::unordered_map<std::string, unsigned>::iterator it
00453 = m_targetSymbolsMap.find(symbol);
00454
00455 UTIL_THROW_IF2(it == m_targetSymbolsMap.end(), "No id found for target symbol: " << symbol);
00456 return it->second;
00457 }
00458
00459 unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
00460 {
00461 #ifdef WITH_THREADS
00462 boost::mutex::scoped_lock lock(m_mutex);
00463 #endif
00464 boost::unordered_map<std::string, unsigned>::iterator it
00465 = m_targetSymbolsMap.find(symbol);
00466
00467 if(it != m_targetSymbolsMap.end())
00468 return it->second;
00469 else {
00470 unsigned value = m_targetSymbolsMap.size();
00471 m_targetSymbolsMap[symbol] = value;
00472 return value;
00473 }
00474 }
00475
00476 unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx)
00477 {
00478 size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
00479 while(srcTrgIdx < m_lexicalTable.size()
00480 && srcIdx == m_lexicalTable[srcTrgIdx].first
00481 && m_lexicalTable[srcTrgIdx].second != trgIdx)
00482 srcTrgIdx++;
00483
00484 if(srcTrgIdx < m_lexicalTable.size()
00485 && m_lexicalTable[srcTrgIdx].second == trgIdx)
00486 return srcTrgIdx - m_lexicalTableIndex[srcIdx];
00487 else
00488 return m_lexicalTable.size();
00489 }
00490
00491 unsigned PhraseTableCreator::EncodeREncSymbol1(unsigned trgIdx)
00492 {
00493 assert((~(1 << 31)) > trgIdx);
00494 return trgIdx;
00495 }
00496
00497 unsigned PhraseTableCreator::EncodeREncSymbol2(unsigned pos, unsigned rank)
00498 {
00499 unsigned symbol = rank;
00500 symbol |= 1 << 30;
00501 symbol |= pos << 24;
00502 return symbol;
00503 }
00504
00505 unsigned PhraseTableCreator::EncodeREncSymbol3(unsigned rank)
00506 {
00507 unsigned symbol = rank;
00508 symbol |= 2 << 30;
00509 return symbol;
00510 }
00511
00512 unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx)
00513 {
00514 assert((~(1 << 31)) > trgIdx);
00515 return trgIdx;
00516 }
00517
00518 unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank)
00519 {
00520
00521
00522 left = left + 32;
00523 right = right + 32;
00524
00525 assert(64 > left);
00526 assert(64 > right);
00527 assert(524288 > rank);
00528
00529 unsigned symbol = 0;
00530 symbol |= 1 << 31;
00531 symbol |= left << 25;
00532 symbol |= right << 19;
00533 symbol |= rank;
00534 return symbol;
00535 }
00536
00537 void PhraseTableCreator::EncodeTargetPhraseNone(std::vector<std::string>& t,
00538 std::ostream& os)
00539 {
00540 std::stringstream encodedTargetPhrase;
00541 size_t j = 0;
00542 while(j < t.size()) {
00543 unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
00544
00545 m_symbolCounter.Increase(targetSymbolId);
00546 os.write((char*)&targetSymbolId, sizeof(targetSymbolId));
00547 j++;
00548 }
00549
00550 unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00551 os.write((char*)&stopSymbolId, sizeof(stopSymbolId));
00552 m_symbolCounter.Increase(stopSymbolId);
00553 }
00554
00555 void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector<std::string>& s,
00556 std::vector<std::string>& t,
00557 std::set<AlignPoint>& a,
00558 std::ostream& os)
00559 {
00560 std::stringstream encodedTargetPhrase;
00561
00562 std::vector<std::vector<size_t> > a2(t.size());
00563 for(std::set<AlignPoint>::iterator it = a.begin(); it != a.end(); it++)
00564 a2[it->second].push_back(it->first);
00565
00566 for(size_t i = 0; i < t.size(); i++) {
00567 unsigned idxTarget = GetOrAddTargetSymbolId(t[i]);
00568 unsigned encodedSymbol = -1;
00569
00570 unsigned bestSrcPos = s.size();
00571 unsigned bestDiff = s.size();
00572 unsigned bestRank = m_lexicalTable.size();
00573 unsigned badRank = m_lexicalTable.size();
00574
00575 for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++) {
00576 unsigned idxSource = GetSourceSymbolId(s[*it]);
00577 size_t r = GetRank(idxSource, idxTarget);
00578 if(r != badRank) {
00579 if(r < bestRank) {
00580 bestRank = r;
00581 bestSrcPos = *it;
00582 bestDiff = abs(*it-i);
00583 } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) {
00584 bestSrcPos = *it;
00585 bestDiff = abs(*it-i);
00586 }
00587 }
00588 }
00589
00590 if(bestRank != badRank && bestSrcPos < s.size()) {
00591 if(bestSrcPos == i)
00592 encodedSymbol = EncodeREncSymbol3(bestRank);
00593 else
00594 encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
00595 a.erase(AlignPoint(bestSrcPos, i));
00596 } else {
00597 encodedSymbol = EncodeREncSymbol1(idxTarget);
00598 }
00599
00600 os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00601 m_symbolCounter.Increase(encodedSymbol);
00602 }
00603
00604 unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00605 unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId);
00606 os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00607 m_symbolCounter.Increase(encodedSymbol);
00608 }
00609
00610 void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector<std::string>& s,
00611 std::vector<std::string>& t,
00612 std::set<AlignPoint>& a,
00613 size_t ownRank,
00614 std::ostream& os)
00615 {
00616 std::vector<unsigned> encodedSymbols(t.size());
00617 std::vector<unsigned> encodedSymbolsLengths(t.size(), 0);
00618
00619 ConsistentPhrases cp(s.size(), t.size(), a);
00620 while(!cp.Empty()) {
00621 ConsistentPhrases::Phrase p = cp.Pop();
00622
00623 std::stringstream key1;
00624 key1 << s[p.i];
00625 for(int i = p.i+1; i < p.i+p.m; i++)
00626 key1 << " " << s[i];
00627
00628 std::stringstream key2;
00629 key2 << t[p.j];
00630 for(int i = p.j+1; i < p.j+p.n; i++)
00631 key2 << " " << t[i];
00632
00633 int rank = -1;
00634 std::string key1Str = key1.str(), key2Str = key2.str();
00635 size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)];
00636 if(idx != m_rnkHash.GetSize())
00637 rank = m_ranks[idx];
00638
00639 if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) {
00640 if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) {
00641 std::stringstream encodedSymbol;
00642 encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank);
00643 encodedSymbolsLengths[p.j] = p.n;
00644
00645 std::set<AlignPoint> tAlignment;
00646 for(std::set<AlignPoint>::iterator it = a.begin();
00647 it != a.end(); it++)
00648 if(it->first < p.i || it->first >= p.i + p.m
00649 || it->second < p.j || it->second >= p.j + p.n)
00650 tAlignment.insert(*it);
00651 a = tAlignment;
00652 cp.RemoveOverlap(p);
00653 }
00654 }
00655 }
00656
00657 std::stringstream encodedTargetPhrase;
00658
00659 size_t j = 0;
00660 while(j < t.size()) {
00661 if(encodedSymbolsLengths[j] > 0) {
00662 unsigned encodedSymbol = encodedSymbols[j];
00663 m_symbolCounter.Increase(encodedSymbol);
00664 os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00665 j += encodedSymbolsLengths[j];
00666 } else {
00667 unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
00668 unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId);
00669 m_symbolCounter.Increase(encodedSymbol);
00670 os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00671 j++;
00672 }
00673 }
00674
00675 unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00676 unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId);
00677 os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
00678 m_symbolCounter.Increase(encodedSymbol);
00679 }
00680
00681 void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream& os)
00682 {
00683 size_t c = 0;
00684 float score;
00685
00686 while(c < scores.size()) {
00687 score = scores[c];
00688 score = FloorScore(TransformScore(score));
00689 os.write((char*)&score, sizeof(score));
00690 m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
00691 c++;
00692 }
00693 }
00694
00695 void PhraseTableCreator::EncodeAlignment(std::set<AlignPoint>& alignment,
00696 std::ostream& os)
00697 {
00698 for(std::set<AlignPoint>::iterator it = alignment.begin();
00699 it != alignment.end(); it++) {
00700 os.write((char*)&(*it), sizeof(AlignPoint));
00701 m_alignCounter.Increase(*it);
00702 }
00703 AlignPoint stop(-1, -1);
00704 os.write((char*) &stop, sizeof(AlignPoint));
00705 m_alignCounter.Increase(stop);
00706 }
00707
00708 std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, size_t ownRank)
00709 {
00710 std::string sourcePhraseStr = tokens[0];
00711 std::string targetPhraseStr = tokens[1];
00712 std::string scoresStr = tokens[2];
00713
00714 std::string alignmentStr = "";
00715 if(tokens.size() > 3)
00716 alignmentStr = tokens[3];
00717
00718 std::vector<std::string> s = Tokenize(sourcePhraseStr);
00719
00720 size_t phraseLength = s.size();
00721 if(m_maxPhraseLength < phraseLength)
00722 m_maxPhraseLength = phraseLength;
00723
00724 std::vector<std::string> t = Tokenize(targetPhraseStr);
00725 std::vector<float> scores = Tokenize<float>(scoresStr);
00726
00727 if(scores.size() != m_numScoreComponent) {
00728 std::stringstream strme;
00729 strme << "Error: Wrong number of scores detected ("
00730 << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
00731 strme << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[2] << " ..." << std::endl;
00732 UTIL_THROW2(strme.str());
00733 }
00734
00735 std::set<AlignPoint> a;
00736 if(m_coding != None || m_useAlignmentInfo) {
00737 std::vector<size_t> positions = Tokenize<size_t>(alignmentStr, " \t-");
00738 for(size_t i = 0; i < positions.size(); i += 2) {
00739 a.insert(AlignPoint(positions[i], positions[i+1]));
00740 }
00741 }
00742
00743 std::stringstream encodedTargetPhrase;
00744
00745 if(m_coding == PREnc) {
00746 EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase);
00747 } else if(m_coding == REnc) {
00748 EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase);
00749 } else {
00750 EncodeTargetPhraseNone(t, encodedTargetPhrase);
00751 }
00752
00753 EncodeScores(scores, encodedTargetPhrase);
00754
00755 if(m_useAlignmentInfo)
00756 EncodeAlignment(a, encodedTargetPhrase);
00757
00758 return encodedTargetPhrase.str();
00759 }
00760
00761 std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection)
00762 {
00763 enum EncodeState {
00764 ReadSymbol, ReadScore, ReadAlignment,
00765 EncodeSymbol, EncodeScore, EncodeAlignment
00766 };
00767 EncodeState state = ReadSymbol;
00768
00769 unsigned phraseStopSymbolId;
00770 if(m_coding == REnc)
00771 phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
00772 else if(m_coding == PREnc)
00773 phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
00774 else
00775 phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
00776 AlignPoint alignStopSymbol(-1, -1);
00777
00778 std::stringstream encodedStream(encodedCollection);
00779 encodedStream.unsetf(std::ios::skipws);
00780
00781 std::string compressedEncodedCollection;
00782 BitWrapper<> bitStream(compressedEncodedCollection);
00783
00784 unsigned symbol;
00785 float score;
00786 size_t currScore = 0;
00787 AlignPoint alignPoint;
00788
00789 while(encodedStream) {
00790 switch(state) {
00791 case ReadSymbol:
00792 encodedStream.read((char*) &symbol, sizeof(unsigned));
00793 state = EncodeSymbol;
00794 break;
00795 case ReadScore:
00796 if(currScore == m_numScoreComponent) {
00797 currScore = 0;
00798 if(m_useAlignmentInfo)
00799 state = ReadAlignment;
00800 else
00801 state = ReadSymbol;
00802 } else {
00803 encodedStream.read((char*) &score, sizeof(float));
00804 currScore++;
00805 state = EncodeScore;
00806 }
00807 break;
00808 case ReadAlignment:
00809 encodedStream.read((char*) &alignPoint, sizeof(AlignPoint));
00810 state = EncodeAlignment;
00811 break;
00812
00813 case EncodeSymbol:
00814 state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol;
00815 m_symbolTree->Put(bitStream, symbol);
00816 break;
00817 case EncodeScore: {
00818 state = ReadScore;
00819 size_t idx = m_multipleScoreTrees ? currScore-1 : 0;
00820 if(m_quantize)
00821 score = m_scoreCounters[idx]->LowerBound(score);
00822 m_scoreTrees[idx]->Put(bitStream, score);
00823 }
00824 break;
00825 case EncodeAlignment:
00826 state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment;
00827 m_alignTree->Put(bitStream, alignPoint);
00828 break;
00829 }
00830 }
00831
00832 return compressedEncodedCollection;
00833 }
00834
00835 void PhraseTableCreator::AddRankedLine(PackedItem& pi)
00836 {
00837 m_queue.push(pi);
00838 }
00839
00840 void PhraseTableCreator::FlushRankedQueue(bool force)
00841 {
00842 size_t step = 1ul << 10;
00843
00844 while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00845 m_lastFlushedLine++;
00846
00847 PackedItem pi = m_queue.top();
00848 m_queue.pop();
00849
00850 if(m_lastSourceRange.size() == step) {
00851 m_rnkHash.AddRange(m_lastSourceRange);
00852 m_lastSourceRange.clear();
00853 }
00854
00855 if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
00856 if(m_rankQueue.size()) {
00857 m_lastFlushedSourceNum++;
00858 if(m_lastFlushedSourceNum % 100000 == 0) {
00859 std::cerr << ".";
00860 }
00861 if(m_lastFlushedSourceNum % 5000000 == 0) {
00862 std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
00863 }
00864
00865 m_ranks.resize(m_lastFlushedLine + 1);
00866 int r = 0;
00867 while(!m_rankQueue.empty()) {
00868 m_ranks[m_rankQueue.top().second] = r++;
00869 m_rankQueue.pop();
00870 }
00871 }
00872 }
00873
00874 m_lastSourceRange.push_back(pi.GetTrg());
00875
00876 m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine()));
00877 m_lastFlushedSourcePhrase = pi.GetSrc();
00878 }
00879
00880 if(force) {
00881 if(!m_lastSourceRange.empty()) {
00882 m_rnkHash.AddRange(m_lastSourceRange);
00883 m_lastSourceRange.clear();
00884 }
00885
00886 #ifdef WITH_THREADS
00887 m_rnkHash.WaitAll();
00888 #endif
00889
00890 m_ranks.resize(m_lastFlushedLine + 1);
00891 int r = 0;
00892 while(!m_rankQueue.empty()) {
00893 m_ranks[m_rankQueue.top().second] = r++;
00894 m_rankQueue.pop();
00895 }
00896
00897 m_lastFlushedLine = -1;
00898 m_lastFlushedSourceNum = 0;
00899
00900 std::cerr << std::endl << std::endl;
00901 }
00902 }
00903
00904
00905 void PhraseTableCreator::AddEncodedLine(PackedItem& pi)
00906 {
00907 m_queue.push(pi);
00908 }
00909
00910 void PhraseTableCreator::FlushEncodedQueue(bool force)
00911 {
00912 while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00913 PackedItem pi = m_queue.top();
00914 m_queue.pop();
00915 m_lastFlushedLine++;
00916
00917 if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
00918 if(m_lastCollection.size()) {
00919 std::stringstream targetPhraseCollection;
00920 for(std::vector<std::string>::iterator it =
00921 m_lastCollection.begin(); it != m_lastCollection.end(); it++)
00922 targetPhraseCollection << *it;
00923
00924 m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
00925 m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
00926
00927 m_lastFlushedSourceNum++;
00928 if(m_lastFlushedSourceNum % 100000 == 0)
00929 std::cerr << ".";
00930 if(m_lastFlushedSourceNum % 5000000 == 0)
00931 std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
00932
00933 m_lastCollection.clear();
00934 }
00935 }
00936
00937 if(m_lastSourceRange.size() == (1ul << m_orderBits)) {
00938 m_srcHash.AddRange(m_lastSourceRange);
00939 m_srcHash.SaveLastRange();
00940 m_srcHash.DropLastRange();
00941 m_lastSourceRange.clear();
00942 }
00943
00944 m_lastFlushedSourcePhrase = pi.GetSrc();
00945 if(m_coding == PREnc) {
00946 if(m_lastCollection.size() <= pi.GetRank())
00947 m_lastCollection.resize(pi.GetRank() + 1);
00948 m_lastCollection[pi.GetRank()] = pi.GetTrg();
00949 } else {
00950 m_lastCollection.push_back(pi.GetTrg());
00951 }
00952 }
00953
00954 if(force) {
00955 if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase)
00956 m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
00957
00958 if(m_lastCollection.size()) {
00959 std::stringstream targetPhraseCollection;
00960 for(std::vector<std::string>::iterator it =
00961 m_lastCollection.begin(); it != m_lastCollection.end(); it++)
00962 targetPhraseCollection << *it;
00963
00964 m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
00965 m_lastCollection.clear();
00966 }
00967
00968 if(!m_lastSourceRange.empty()) {
00969 m_srcHash.AddRange(m_lastSourceRange);
00970 m_lastSourceRange.clear();
00971 }
00972
00973 #ifdef WITH_THREADS
00974 m_srcHash.WaitAll();
00975 #endif
00976
00977 m_srcHash.SaveLastRange();
00978 m_srcHash.DropLastRange();
00979 m_srcHash.FinalizeSave();
00980
00981 m_lastFlushedLine = -1;
00982 m_lastFlushedSourceNum = 0;
00983
00984 std::cerr << std::endl << std::endl;
00985 }
00986 }
00987
00988 void PhraseTableCreator::AddCompressedCollection(PackedItem& pi)
00989 {
00990 m_queue.push(pi);
00991 }
00992
00993 void PhraseTableCreator::FlushCompressedQueue(bool force)
00994 {
00995 if(force || m_queue.size() > 10000) {
00996 while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00997 PackedItem pi = m_queue.top();
00998 m_queue.pop();
00999 m_lastFlushedLine++;
01000
01001 m_compressedTargetPhrases->push_back(pi.GetTrg());
01002
01003 if((pi.GetLine()+1) % 100000 == 0)
01004 std::cerr << ".";
01005 if((pi.GetLine()+1) % 5000000 == 0)
01006 std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
01007 }
01008 }
01009
01010 if(force) {
01011 m_lastFlushedLine = -1;
01012 std::cerr << std::endl << std::endl;
01013 }
01014 }
01015
01016
01017
01018 size_t RankingTask::m_lineNum = 0;
01019 #ifdef WITH_THREADS
01020 boost::mutex RankingTask::m_mutex;
01021 boost::mutex RankingTask::m_fileMutex;
01022 #endif
01023
01024 RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator)
01025 : m_inFile(inFile), m_creator(creator) {}
01026
01027 void RankingTask::operator()()
01028 {
01029 size_t lineNum = 0;
01030
01031 std::vector<std::string> lines;
01032 size_t max_lines = 1000;
01033 lines.reserve(max_lines);
01034
01035 {
01036 #ifdef WITH_THREADS
01037 boost::mutex::scoped_lock lock(m_fileMutex);
01038 #endif
01039 std::string line;
01040 while(lines.size() < max_lines && std::getline(m_inFile, line))
01041 lines.push_back(line);
01042 lineNum = m_lineNum;
01043 m_lineNum += lines.size();
01044 }
01045
01046 std::vector<PackedItem> result;
01047 result.reserve(max_lines);
01048
01049 while(lines.size()) {
01050 for(size_t i = 0; i < lines.size(); i++) {
01051 std::vector<std::string> tokens;
01052 Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
01053
01054 for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
01055 *it = Moses::Trim(*it);
01056
01057 if(tokens.size() < 4) {
01058 std::stringstream strme;
01059 strme << "Error: It seems the following line has a wrong format:" << std::endl;
01060 strme << "Line " << i << ": " << lines[i] << std::endl;
01061 UTIL_THROW2(strme.str());
01062 }
01063
01064 if(tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
01065 std::stringstream strme;
01066 strme << "Error: It seems the following line contains no alignment information, " << std::endl;
01067 strme << "but you are using ";
01068 strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
01069 strme << " encoding which makes use of alignment data. " << std::endl;
01070 strme << "Use -encoding None" << std::endl;
01071 strme << "Line " << i << ": " << lines[i] << std::endl;
01072 UTIL_THROW2(strme.str());
01073 }
01074
01075 std::vector<float> scores = Tokenize<float>(tokens[2]);
01076 if(scores.size() != m_creator.m_numScoreComponent) {
01077 std::stringstream strme;
01078 strme << "Error: It seems the following line has a wrong number of scores ("
01079 << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
01080 strme << "Line " << i << ": " << lines[i] << std::endl;
01081 UTIL_THROW2(strme.str());
01082 }
01083
01084 float sortScore = scores[m_creator.m_sortScoreIndex];
01085
01086 std::string key1 = m_creator.MakeSourceKey(tokens[0]);
01087 std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]);
01088
01089 PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore);
01090 result.push_back(packedItem);
01091 }
01092 lines.clear();
01093
01094 {
01095 #ifdef WITH_THREADS
01096 boost::mutex::scoped_lock lock(m_mutex);
01097 #endif
01098 for(size_t i = 0; i < result.size(); i++)
01099 m_creator.AddRankedLine(result[i]);
01100 m_creator.FlushRankedQueue();
01101 }
01102
01103 result.clear();
01104 lines.reserve(max_lines);
01105 result.reserve(max_lines);
01106
01107 #ifdef WITH_THREADS
01108 boost::mutex::scoped_lock lock(m_fileMutex);
01109 #endif
01110 std::string line;
01111 while(lines.size() < max_lines && std::getline(m_inFile, line))
01112 lines.push_back(line);
01113 lineNum = m_lineNum;
01114 m_lineNum += lines.size();
01115 }
01116 }
01117
01118 size_t EncodingTask::m_lineNum = 0;
01119 #ifdef WITH_THREADS
01120 boost::mutex EncodingTask::m_mutex;
01121 boost::mutex EncodingTask::m_fileMutex;
01122 #endif
01123
01124 EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator)
01125 : m_inFile(inFile), m_creator(creator) {}
01126
01127 void EncodingTask::operator()()
01128 {
01129 size_t lineNum = 0;
01130
01131 std::vector<std::string> lines;
01132 size_t max_lines = 1000;
01133 lines.reserve(max_lines);
01134
01135 {
01136 #ifdef WITH_THREADS
01137 boost::mutex::scoped_lock lock(m_fileMutex);
01138 #endif
01139 std::string line;
01140 while(lines.size() < max_lines && std::getline(m_inFile, line))
01141 lines.push_back(line);
01142 lineNum = m_lineNum;
01143 m_lineNum += lines.size();
01144 }
01145
01146 std::vector<PackedItem> result;
01147 result.reserve(max_lines);
01148
01149 while(lines.size()) {
01150 for(size_t i = 0; i < lines.size(); i++) {
01151 std::vector<std::string> tokens;
01152 Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
01153
01154 for(std::vector<std::string>::iterator it = tokens.begin(); it != tokens.end(); it++)
01155 *it = Moses::Trim(*it);
01156
01157 if(tokens.size() < 3) {
01158 std::stringstream strme;
01159 strme << "Error: It seems the following line has a wrong format:" << std::endl;
01160 strme << "Line " << i << ": " << lines[i] << std::endl;
01161 UTIL_THROW2(strme.str());
01162 }
01163
01164 if(tokens.size() > 3 && tokens[3].size() <= 1 && m_creator.m_coding != PhraseTableCreator::None) {
01165 std::stringstream strme;
01166 strme << "Error: It seems the following line contains no alignment information, " << std::endl;
01167 strme << "but you are using ";
01168 strme << (m_creator.m_coding == PhraseTableCreator::PREnc ? "PREnc" : "REnc");
01169 strme << " encoding which makes use of alignment data. " << std::endl;
01170 strme << "Use -encoding None" << std::endl;
01171 strme << "Line " << i << ": " << lines[i] << std::endl;
01172 UTIL_THROW2(strme.str());
01173 }
01174
01175 size_t ownRank = 0;
01176 if(m_creator.m_coding == PhraseTableCreator::PREnc)
01177 ownRank = m_creator.m_ranks[lineNum + i];
01178
01179 std::string encodedLine = m_creator.EncodeLine(tokens, ownRank);
01180
01181 PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank);
01182 result.push_back(packedItem);
01183 }
01184 lines.clear();
01185
01186 {
01187 #ifdef WITH_THREADS
01188 boost::mutex::scoped_lock lock(m_mutex);
01189 #endif
01190 for(size_t i = 0; i < result.size(); i++)
01191 m_creator.AddEncodedLine(result[i]);
01192 m_creator.FlushEncodedQueue();
01193 }
01194
01195 result.clear();
01196 lines.reserve(max_lines);
01197 result.reserve(max_lines);
01198
01199 #ifdef WITH_THREADS
01200 boost::mutex::scoped_lock lock(m_fileMutex);
01201 #endif
01202 std::string line;
01203 while(lines.size() < max_lines && std::getline(m_inFile, line))
01204 lines.push_back(line);
01205 lineNum = m_lineNum;
01206 m_lineNum += lines.size();
01207 }
01208 }
01209
01210
01211
01212 size_t CompressionTask::m_collectionNum = 0;
01213 #ifdef WITH_THREADS
01214 boost::mutex CompressionTask::m_mutex;
01215 #endif
01216
01217 CompressionTask::CompressionTask(StringVectorTemp<unsigned char, unsigned long,
01218 MmapAllocator>& encodedCollections,
01219 PhraseTableCreator& creator)
01220 : m_encodedCollections(encodedCollections), m_creator(creator) {}
01221
01222 void CompressionTask::operator()()
01223 {
01224 size_t collectionNum;
01225 {
01226 #ifdef WITH_THREADS
01227 boost::mutex::scoped_lock lock(m_mutex);
01228 #endif
01229 collectionNum = m_collectionNum;
01230 m_collectionNum++;
01231 }
01232
01233 while(collectionNum < m_encodedCollections.size()) {
01234 std::string collection = m_encodedCollections[collectionNum];
01235 std::string compressedCollection
01236 = m_creator.CompressEncodedCollection(collection);
01237
01238 std::string dummy;
01239 PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);
01240
01241 #ifdef WITH_THREADS
01242 boost::mutex::scoped_lock lock(m_mutex);
01243 #endif
01244 m_creator.AddCompressedCollection(packedItem);
01245 m_creator.FlushCompressedQueue();
01246
01247 collectionNum = m_collectionNum;
01248 m_collectionNum++;
01249 }
01250 }
01251
01252
01253
01254 PackedItem::PackedItem(long line, std::string sourcePhrase,
01255 std::string packedTargetPhrase, size_t rank,
01256 float score)
01257 : m_line(line), m_sourcePhrase(sourcePhrase),
01258 m_packedTargetPhrase(packedTargetPhrase), m_rank(rank),
01259 m_score(score) {}
01260
01261 long PackedItem::GetLine() const
01262 {
01263 return m_line;
01264 }
01265
01266 const std::string& PackedItem::GetSrc() const
01267 {
01268 return m_sourcePhrase;
01269 }
01270
01271 const std::string& PackedItem::GetTrg() const
01272 {
01273 return m_packedTargetPhrase;
01274 }
01275
01276 size_t PackedItem::GetRank() const
01277 {
01278 return m_rank;
01279 }
01280
01281 float PackedItem::GetScore() const
01282 {
01283 return m_score;
01284 }
01285
01286 }