00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <sstream>
00023 #include "LexicalReorderingTableCreator.h"
00024 #include "ThrowingFwrite.h"
00025 #include "moses/Util.h"
00026 #include "util/file.hh"
00027 #include "util/exception.hh"
00028
00029 namespace Moses
00030 {
00031
00032 LexicalReorderingTableCreator::LexicalReorderingTableCreator(
00033 std::string inPath, std::string outPath, std::string tempfilePath,
00034 size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
00035 size_t quantize
00036 #ifdef WITH_THREADS
00037 , size_t threads
00038 #endif
00039 )
00040 : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
00041 m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
00042 m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
00043 m_quantize(quantize), m_separator(" ||| "),
00044 m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
00045 #ifdef WITH_THREADS
00046 , m_threads(threads)
00047 #endif
00048 {
00049 PrintInfo();
00050
00051 m_outFile = std::fopen(m_outPath.c_str(), "w");
00052
00053 std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
00054 m_hash.BeginSave(m_outFile);
00055
00056 if(tempfilePath.size()) {
00057 MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
00058 m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
00059 } else {
00060 m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
00061 }
00062
00063 EncodeScores();
00064
00065 std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
00066 CalcHuffmanCodes();
00067
00068 std::cerr << "Pass 2/2: Compressing scores" << std::endl;
00069
00070
00071 if(tempfilePath.size()) {
00072 MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
00073 m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
00074 } else {
00075 m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
00076 }
00077 CompressScores();
00078
00079 std::cerr << "Saving to " << m_outPath << std::endl;
00080 Save();
00081 std::cerr << "Done" << std::endl;
00082 std::fclose(m_outFile);
00083 }
00084
00085 void LexicalReorderingTableCreator::PrintInfo()
00086 {
00087 std::cerr << "Used options:" << std::endl;
00088 std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
00089 std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl;
00090 std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
00091 std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
00092 std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
00093 std::cerr << "\tUsing score quantization: ";
00094 if(m_quantize)
00095 std::cerr << m_quantize << " best" << std::endl;
00096 else
00097 std::cerr << "no" << std::endl;
00098
00099 #ifdef WITH_THREADS
00100 std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
00101 #endif
00102 std::cerr << std::endl;
00103 }
00104
00105 LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
00106 {
00107 for(size_t i = 0; i < m_scoreTrees.size(); i++) {
00108 delete m_scoreTrees[i];
00109 delete m_scoreCounters[i];
00110 }
00111
00112 delete m_encodedScores;
00113 delete m_compressedScores;
00114 }
00115
00116
00117 void LexicalReorderingTableCreator::EncodeScores()
00118 {
00119 InputFileStream inFile(m_inPath);
00120
00121 #ifdef WITH_THREADS
00122 boost::thread_group threads;
00123 for (size_t i = 0; i < m_threads; ++i) {
00124 EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
00125 threads.create_thread(*et);
00126 }
00127 threads.join_all();
00128 #else
00129 EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
00130 (*et)();
00131 delete et;
00132 #endif
00133 FlushEncodedQueue(true);
00134 }
00135
00136 void LexicalReorderingTableCreator::CalcHuffmanCodes()
00137 {
00138 std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
00139 for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
00140 it != m_scoreCounters.end(); it++) {
00141 if(m_quantize)
00142 (*it)->Quantize(m_quantize);
00143
00144 std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
00145 << " scores" << std::endl;
00146
00147 *treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
00148 treeIt++;
00149 }
00150 std::cerr << std::endl;
00151 }
00152
00153 void LexicalReorderingTableCreator::CompressScores()
00154 {
00155 #ifdef WITH_THREADS
00156 boost::thread_group threads;
00157 for (size_t i = 0; i < m_threads; ++i) {
00158 CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
00159 threads.create_thread(*ct);
00160 }
00161 threads.join_all();
00162 #else
00163 CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
00164 (*ct)();
00165 delete ct;
00166 #endif
00167 FlushCompressedQueue(true);
00168 }
00169
00170 void LexicalReorderingTableCreator::Save()
00171 {
00172 ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
00173 ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
00174 for(size_t i = 0; i < m_scoreTrees.size(); i++)
00175 m_scoreTrees[i]->Save(m_outFile);
00176
00177 m_compressedScores->save(m_outFile);
00178 }
00179
00180 std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
00181 {
00182 std::string key = source + m_separator;
00183 if(!target.empty())
00184 key += target + m_separator;
00185 return key;
00186 }
00187
00188 std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
00189 {
00190 std::string scoresString = tokens.back();
00191 std::stringstream scoresStream;
00192
00193 std::vector<float> scores;
00194 Tokenize<float>(scores, scoresString);
00195
00196 if(!m_numScoreComponent) {
00197 m_numScoreComponent = scores.size();
00198 m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
00199 for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
00200 it != m_scoreCounters.end(); it++)
00201 *it = new ScoreCounter();
00202 m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
00203 }
00204
00205 if(m_numScoreComponent != scores.size()) {
00206 std::stringstream strme;
00207 strme << "Error: Wrong number of scores detected ("
00208 << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
00209 strme << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
00210 UTIL_THROW2(strme.str());
00211 }
00212
00213 size_t c = 0;
00214 float score;
00215 while(c < m_numScoreComponent) {
00216 score = scores[c];
00217 score = FloorScore(TransformScore(score));
00218 scoresStream.write((char*)&score, sizeof(score));
00219
00220 m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
00221 c++;
00222 }
00223
00224 return scoresStream.str();
00225 }
00226
00227 void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)
00228 {
00229 m_queue.push(pi);
00230 }
00231
00232 void LexicalReorderingTableCreator::FlushEncodedQueue(bool force)
00233 {
00234 if(force || m_queue.size() > 10000) {
00235 while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00236 PackedItem pi = m_queue.top();
00237 m_queue.pop();
00238 m_lastFlushedLine++;
00239
00240 m_lastRange.push_back(pi.GetSrc());
00241 m_encodedScores->push_back(pi.GetTrg());
00242
00243 if((pi.GetLine()+1) % 100000 == 0)
00244 std::cerr << ".";
00245 if((pi.GetLine()+1) % 5000000 == 0)
00246 std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
00247
00248 if(m_lastRange.size() == (1ul << m_orderBits)) {
00249 m_hash.AddRange(m_lastRange);
00250 m_hash.SaveLastRange();
00251 m_hash.DropLastRange();
00252 m_lastRange.clear();
00253 }
00254 }
00255 }
00256
00257 if(force) {
00258 m_lastFlushedLine = -1;
00259
00260 if(!m_lastRange.empty()) {
00261 m_hash.AddRange(m_lastRange);
00262 m_lastRange.clear();
00263 }
00264
00265 #ifdef WITH_THREADS
00266 m_hash.WaitAll();
00267 #endif
00268
00269 m_hash.SaveLastRange();
00270 m_hash.DropLastRange();
00271 m_hash.FinalizeSave();
00272
00273 std::cerr << std::endl << std::endl;
00274 }
00275 }
00276
00277 std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores)
00278 {
00279 std::stringstream encodedScoresStream(encodedScores);
00280 encodedScoresStream.unsetf(std::ios::skipws);
00281
00282 std::string compressedScores;
00283 BitWrapper<> compressedScoresStream(compressedScores);
00284
00285 size_t currScore = 0;
00286 float score;
00287 encodedScoresStream.read((char*) &score, sizeof(score));
00288
00289 while(encodedScoresStream) {
00290 size_t index = currScore % m_scoreTrees.size();
00291
00292 if(m_quantize)
00293 score = m_scoreCounters[index]->LowerBound(score);
00294
00295 m_scoreTrees[index]->Put(compressedScoresStream, score);
00296 encodedScoresStream.read((char*) &score, sizeof(score));
00297 currScore++;
00298 }
00299
00300 return compressedScores;
00301 }
00302
00303 void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi)
00304 {
00305 m_queue.push(pi);
00306 }
00307
00308 void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
00309 {
00310 if(force || m_queue.size() > 10000) {
00311 while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
00312 PackedItem pi = m_queue.top();
00313 m_queue.pop();
00314 m_lastFlushedLine++;
00315
00316 m_compressedScores->push_back(pi.GetTrg());
00317
00318 if((pi.GetLine()+1) % 100000 == 0)
00319 std::cerr << ".";
00320 if((pi.GetLine()+1) % 5000000 == 0)
00321 std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
00322 }
00323 }
00324
00325 if(force) {
00326 m_lastFlushedLine = -1;
00327 std::cerr << std::endl << std::endl;
00328 }
00329 }
00330
00331
00332
00333 size_t EncodingTaskReordering::m_lineNum = 0;
00334 #ifdef WITH_THREADS
00335 boost::mutex EncodingTaskReordering::m_mutex;
00336 boost::mutex EncodingTaskReordering::m_fileMutex;
00337 #endif
00338
00339 EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)
00340 : m_inFile(inFile), m_creator(creator) {}
00341
00342 void EncodingTaskReordering::operator()()
00343 {
00344 size_t lineNum = 0;
00345
00346 std::vector<std::string> lines;
00347 size_t max_lines = 1000;
00348 lines.reserve(max_lines);
00349
00350 {
00351 #ifdef WITH_THREADS
00352 boost::mutex::scoped_lock lock(m_fileMutex);
00353 #endif
00354 std::string line;
00355 while(lines.size() < max_lines && std::getline(m_inFile, line))
00356 lines.push_back(line);
00357 lineNum = m_lineNum;
00358 m_lineNum += lines.size();
00359 }
00360
00361 std::vector<PackedItem> result;
00362 result.reserve(max_lines);
00363
00364 while(lines.size()) {
00365 for(size_t i = 0; i < lines.size(); i++) {
00366 std::vector<std::string> tokens;
00367 Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
00368
00369 std::string encodedLine = m_creator.EncodeLine(tokens);
00370
00371 std::string f = tokens[0];
00372
00373 std::string e;
00374 if(tokens.size() > 2)
00375 e = tokens[1];
00376
00377 PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(f, e),
00378 encodedLine, i);
00379 result.push_back(packedItem);
00380 }
00381
00382 {
00383 #ifdef WITH_THREADS
00384 boost::mutex::scoped_lock lock(m_mutex);
00385 #endif
00386 for(size_t i = 0; i < result.size(); i++)
00387 m_creator.AddEncodedLine(result[i]);
00388 m_creator.FlushEncodedQueue();
00389 }
00390
00391 lines.clear();
00392 result.clear();
00393 lines.reserve(max_lines);
00394 result.reserve(max_lines);
00395
00396 #ifdef WITH_THREADS
00397 boost::mutex::scoped_lock lock(m_fileMutex);
00398 #endif
00399 std::string line;
00400 while(lines.size() < max_lines && std::getline(m_inFile, line))
00401 lines.push_back(line);
00402 lineNum = m_lineNum;
00403 m_lineNum += lines.size();
00404 }
00405 }
00406
00407
00408
00409 size_t CompressionTaskReordering::m_scoresNum = 0;
00410 #ifdef WITH_THREADS
00411 boost::mutex CompressionTaskReordering::m_mutex;
00412 #endif
00413
00414 CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long,
00415 MmapAllocator>& encodedScores,
00416 LexicalReorderingTableCreator& creator)
00417 : m_encodedScores(encodedScores), m_creator(creator)
00418 { }
00419
00420 void CompressionTaskReordering::operator()()
00421 {
00422 size_t scoresNum;
00423 {
00424 #ifdef WITH_THREADS
00425 boost::mutex::scoped_lock lock(m_mutex);
00426 #endif
00427 scoresNum = m_scoresNum;
00428 m_scoresNum++;
00429 }
00430
00431 while(scoresNum < m_encodedScores.size()) {
00432 std::string scores = m_encodedScores[scoresNum];
00433 std::string compressedScores
00434 = m_creator.CompressEncodedScores(scores);
00435
00436 std::string dummy;
00437 PackedItem packedItem(scoresNum, dummy, compressedScores, 0);
00438
00439 #ifdef WITH_THREADS
00440 boost::mutex::scoped_lock lock(m_mutex);
00441 #endif
00442 m_creator.AddCompressedScores(packedItem);
00443 m_creator.FlushCompressedQueue();
00444
00445 scoresNum = m_scoresNum;
00446 m_scoresNum++;
00447 }
00448 }
00449
00450 }