00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <iostream>
00010 #include "FuzzyMatchWrapper.h"
00011 #include "SentenceAlignment.h"
00012 #include "Match.h"
00013 #include "create_xml.h"
00014 #include "moses/Util.h"
00015 #include "moses/StaticData.h"
00016 #include "util/file.hh"
00017
00018 using namespace std;
00019
00020 namespace tmmt
00021 {
00022
// Builds the in-memory translation memory for fuzzy matching:
// a suffix array over the source side of the TM, plus (per source
// sentence) the target alternatives and their word alignments.
// NOTE(review): suffixArray is allocated with raw new and never freed
// anywhere in this file — presumably it lives for the whole process;
// confirm the destructor (not visible here) releases it.
FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
  :basic_flag(false)          // plain whole-sentence SED strategy disabled
  ,lsed_flag(true)            // letter-based SED used for final tie-breaking
  ,refined_flag(true)         // enables short (single-word) match refinement
  ,length_filter_flag(true)   // prune TM sentences by length difference
  ,parse_flag(true)           // combine matches via parse_matches()
  ,min_match(70)              // minimum fuzzy-match score, in percent
  ,multiple_flag(true)        // report all best TM hits, not just one
  ,multiple_slack(0)
  ,multiple_max(100)
{
  cerr << "creating suffix array" << endl;
  suffixArray = new tmmt::SuffixArray( sourcePath );

  cerr << "loading target data" << endl;
  load_target(targetPath, targetAndAlignment);

  // load_alignment fills the alignment fields of the SentenceAlignment
  // objects created by load_target, so the order of these calls matters
  cerr << "loading alignment" << endl;
  load_alignment(alignmentPath, targetAndAlignment);

  cerr << "loading completed" << endl;
}
00051
00052 string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
00053 {
00054 const Moses::StaticData &staticData = Moses::StaticData::Instance();
00055
00056 WordIndex wordIndex;
00057
00058 string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
00059
00060
00061 create_xml(fuzzyMatchFile);
00062
00063
00064 string cmd;
00065 cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
00066 + fuzzyMatchFile + ".extract.sorted.gz";
00067 system(cmd.c_str());
00068 cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
00069 + fuzzyMatchFile + ".extract.inv.sorted.gz";
00070 system(cmd.c_str());
00071
00072 #ifdef IS_XCODE
00073 cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
00074 #elif IS_ECLIPSE
00075 cmd = "/home/hieu/workspace/github/moses-smt/bin";
00076 #else
00077 cmd = staticData.GetBinDirectory();
00078 #endif
00079
00080 cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
00081 + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
00082 + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
00083 system(cmd.c_str());
00084
00085
00086 return fuzzyMatchFile + ".pt.gz";
00087 }
00088
// Core TM lookup for one input sentence.
// Pipeline:
//   1. for every input start position, find all maximal substring
//      matches in the suffix array (match_range);
//   2. turn those ranges into per-TM-sentence Match objects, pruned by
//      optimistic/pessimistic cost bounds against best_cost;
//   3. validate candidate TM sentences (short-match refinement, match
//      pruning, then either full SED or parse_matches combination);
//   4. write extract records for the best TM sentence(s).
// Returns the path of the extract file written into dirNameStr.
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
  const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();

  string inputPath = dirNameStr + "/in";
  string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
  ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());

  vector< vector< WORD_ID > > input;
  load_corpus(inputPath, input);

  // exactly one sentence is expected in the per-request input file
  assert(input.size() == 1);
  size_t sentenceInd = 0;

  clock_t start_clock = clock();

  int input_length = input[sentenceInd].size();
  // maximum edit cost that still satisfies the min_match threshold;
  // tightened as better matches are found below
  int best_cost = input_length * (100-min_match) / 100 + 1;

  int match_count = 0;

  // Step 1: for each start position, grow the matched substring word by
  // word; match_range[start][len-1] is the suffix-array interval of
  // occurrences of input[start..start+len-1].
  vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
  for(int start=0; start<input[sentenceInd].size(); start++) {
    SuffixArray::INDEX prior_first_match = 0;
    SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
    vector< string > substring;
    bool stillMatched = true;
    vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;

    for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
      substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );

      // narrow the search to the interval of the previous (shorter) match
      SuffixArray::INDEX first_match, last_match;
      stillMatched = false;
      if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
        stillMatched = true;
        matchedAtThisStart.push_back( make_pair( first_match, last_match ) );

        prior_first_match = first_match;
        prior_last_match = last_match;
      }

    }

    match_range.push_back( matchedAtThisStart );
  }

  clock_t clock_range = clock();

  // Step 2: collect Match objects per TM sentence id
  map< int, vector< Match > > sentence_match;
  map< int, int > sentence_match_word_count;

  // longest matches first, so best_cost tightens as early as possible
  for(int length = input[sentenceInd].size(); length >= 1; length--) {

    // very short matches are handled separately by add_short_matches()
    if (length <= short_match_max_length( input_length ) ) {
      continue;
    }

    unsigned int count = 0;
    for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
      if (match_range[start].size() >= length) {
        pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];

        count += range.second - range.first + 1;

        // each suffix-array position in the interval is one occurrence
        for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
          size_t position = suffixArray->GetPosition( i );

          size_t sentence_id = suffixArray->GetSentence( position );
          int sentence_length = suffixArray->GetSentenceLength( sentence_id );
          int diff = abs( (int)sentence_length - (int)input_length );

          // length difference alone is a lower bound on the edit cost
          if (diff > best_cost)
            continue;

          int start_pos = suffixArray->GetWordInSentence( position );
          int end_pos = start_pos + length-1;

          // optimistic cost of the words before the match
          int min_cost = abs( start - start_pos );

          // same non-zero offset on both sides still needs >= 1 edit
          if (start == start_pos && start>0)
            min_cost++;

          // optimistic cost of the words after the match
          min_cost += abs( ( sentence_length-1 - end_pos ) -
                           ( input_length-1 - (start+length-1) ) );

          if ( sentence_length-1 - end_pos ==
               input_length-1 - (start+length-1)
               && end_pos != sentence_length-1 )
            min_cost++;

          // even the best case is too expensive -> skip
          if (min_cost > best_cost)
            continue;

          match_count++;

          // pessimistic cost: everything outside the match is an edit
          int max_cost = max( start, start_pos )
                         + max( sentence_length-1 - end_pos,
                                input_length-1 - (start+length-1) );

          Match m = Match( start, start+length-1,
                           start_pos, start_pos+length-1,
                           min_cost, max_cost, 0);
          sentence_match[ sentence_id ].push_back( m );
          sentence_match_word_count[ sentence_id ] += length;

          // a guaranteed cost bound tightens the global threshold
          if (max_cost < best_cost) {
            best_cost = max_cost;
            if (best_cost == 0) break;
          }

        }
      }

      if (best_cost == 0) break;

    }

    // a perfect match was found, nothing can beat it
    if (best_cost == 0) break;

  }
  cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;

  clock_t clock_matches = clock();

  // Step 3: validate the candidate TM sentences
  int old_best_cost = best_cost;
  int tm_count_word_match = 0;
  int tm_count_word_match2 = 0;
  int pruned_match_count = 0;
  if (short_match_max_length( input_length )) {
    init_short_matches(wordIndex, translationId, input[sentenceInd] );
  }
  vector< int > best_tm;
  typedef map< int, vector< Match > >::iterator I;

  clock_t clock_validation_sum = 0;

  for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
    int tmID = tm->first;
    int tm_length = suffixArray->GetSentenceLength(tmID);
    vector< Match > &match = tm->second;
    add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );

    // filter 1: not enough words covered by matches at all
    int words_matched = 0;
    for(size_t m=0; m<match.size(); m++) {

      if (match[m].min_cost <= best_cost)
        words_matched += match[m].input_end - match[m].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match++;

    // filter 2: same check after removing subsumed matches
    vector< Match > pruned = prune_matches( match, best_cost );
    words_matched = 0;
    for(size_t p=0; p<pruned.size(); p++) {
      words_matched += pruned[p].input_end - pruned[p].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match2++;

    pruned_match_count += pruned.size();
    int prior_best_cost = best_cost;
    int cost;

    clock_t clock_validation_start = clock();
    // with many surviving matches the combination search explodes,
    // so fall back to a plain word-level edit distance
    if (! parse_flag ||
        pruned.size()>=10) {
      string path;
      cost = sed( input[sentenceInd], source[tmID], path, false );
      if (cost < best_cost) {
        best_cost = cost;
      }
    }

    else {
      cost = parse_matches( pruned, input_length, tm_length, best_cost );
      // best_cost improved -> earlier "best" TMs are no longer best
      if (prior_best_cost != best_cost) {
        best_tm.clear();
      }
    }
    clock_validation_sum += clock() - clock_validation_start;
    if (cost == best_cost) {
      best_tm.push_back( tmID );
    }
  }
  cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
  cerr << "tm considered: " << sentence_match.size()
       << " word-matched: " << tm_count_word_match
       << " word-matched2: " << tm_count_word_match2
       << " best: " << best_tm.size() << endl;

  cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;

  // Step 4: emit extract records
  // NOTE(review): sourceStr is declared but never used in this function
  string inputStr, sourceStr;
  for (size_t pos = 0; pos < input_length; ++pos) {
    inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
  }

  // either report every best TM hit ...
  if (multiple_flag) {
    for(size_t si=0; si<best_tm.size(); si++) {
      int s = best_tm[si];
      string path;
      sed( input[sentenceInd], source[s], path, true );
      const vector<WORD_ID> &sourceSentence = source[s];
      vector<SentenceAlignment> &targets = targetAndAlignment[s];
      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);

    }
  }
  // ... or pick a single best one
  else {

    string best_path = "";
    int best_match = -1;
    unsigned int best_letter_cost;
    if (lsed_flag) {
      // break ties between equal word-level costs by letter-level SED
      best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
      for(size_t si=0; si<best_tm.size(); si++) {
        int s = best_tm[si];
        string path;
        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
        if (letter_cost < best_letter_cost) {
          best_letter_cost = letter_cost;
          best_path = path;
          best_match = s;
        }
      }
    }

    else {
      if (best_tm.size() > 0) {
        string path;
        sed( input[sentenceInd], source[best_tm[0]], path, false );
        best_path = path;
        best_match = best_tm[0];
      }
    }
    cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
         << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
         << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
         << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
         << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
         << " )" << endl;
    // leftover debug-output stubs, intentionally empty
    if (lsed_flag) {

    }

    if (lsed_flag) {

    }

    // no TM sentence survived the filters: fall back to sentence 0
    if (best_match == -1) {
      UTIL_THROW_IF2(source.size() == 0, "Empty source phrase");
      best_match = 0;
    }

    const vector<WORD_ID> &sourceSentence = source[best_match];
    vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
    create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);

  }

  fuzzyMatchStream.close();

  return fuzzyMatchFile;
}
00395
00396 void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
00397 {
00398
00399 ifstream fileStream;
00400 fileStream.open(fileName.c_str());
00401 if (!fileStream) {
00402 cerr << "file not found: " << fileName << endl;
00403 exit(1);
00404 }
00405 cerr << "loading " << fileName << endl;
00406
00407 istream *fileStreamP = &fileStream;
00408
00409 string line;
00410 while(getline(*fileStreamP, line)) {
00411 corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
00412 }
00413 }
00414
00415 void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
00416 {
00417 ifstream fileStream;
00418 fileStream.open(fileName.c_str());
00419 if (!fileStream) {
00420 cerr << "file not found: " << fileName << endl;
00421 exit(1);
00422 }
00423 cerr << "loading " << fileName << endl;
00424
00425 istream *fileStreamP = &fileStream;
00426
00427 WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
00428
00429 int lineNum = 0;
00430 string line;
00431 while(getline(*fileStreamP, line)) {
00432 vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
00433
00434 corpus.push_back(vector< SentenceAlignment >());
00435 vector< SentenceAlignment > &vec = corpus.back();
00436
00437 vec.push_back(SentenceAlignment());
00438 SentenceAlignment *sentence = &vec.back();
00439
00440 const WORD &countStr = GetVocabulary().GetWord(toks[0]);
00441 sentence->count = atoi(countStr.c_str());
00442
00443 for (size_t i = 1; i < toks.size(); ++i) {
00444 WORD_ID wordId = toks[i];
00445
00446 if (wordId == delimiter) {
00447
00448 vec.push_back(SentenceAlignment());
00449 sentence = &vec.back();
00450
00451
00452 ++i;
00453
00454 const WORD &countStr = GetVocabulary().GetWord(toks[i]);
00455 sentence->count = atoi(countStr.c_str());
00456 } else {
00457
00458 sentence->target.push_back(wordId);
00459 }
00460 }
00461
00462 ++lineNum;
00463
00464 }
00465
00466 }
00467
00468
00469 void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
00470 {
00471 ifstream fileStream;
00472 fileStream.open(fileName.c_str());
00473 if (!fileStream) {
00474 cerr << "file not found: " << fileName << endl;
00475 exit(1);
00476 }
00477 cerr << "loading " << fileName << endl;
00478
00479 istream *fileStreamP = &fileStream;
00480
00481 string delimiter = "|||";
00482
00483 int lineNum = 0;
00484 string line;
00485 while(getline(*fileStreamP, line)) {
00486 vector< SentenceAlignment > &vec = corpus[lineNum];
00487 size_t targetInd = 0;
00488 SentenceAlignment *sentence = &vec[targetInd];
00489
00490 vector<string> toks = Moses::Tokenize(line);
00491
00492 for (size_t i = 0; i < toks.size(); ++i) {
00493 string &tok = toks[i];
00494
00495 if (tok == delimiter) {
00496
00497 ++targetInd;
00498 sentence = &vec[targetInd];
00499
00500 ++i;
00501 } else {
00502
00503 vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
00504 assert(alignPoint.size() == 2);
00505 sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
00506 }
00507 }
00508
00509 ++lineNum;
00510
00511 }
00512 }
00513
00514 bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
00515 {
00516 #ifdef WITH_THREADS
00517 boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
00518 #endif
00519 map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
00520 if (lookup != m_lsed.end()) {
00521 value = lookup->second;
00522 return true;
00523 }
00524
00525 return false;
00526 }
00527
00528 void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
00529 {
00530 #ifdef WITH_THREADS
00531 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00532 #endif
00533 m_lsed[ key ] = value;
00534 }
00535
00536
00537
00538 unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
00539 {
00540
00541 pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
00542 unsigned int value;
00543 bool ret = GetLSEDCache(pIdx, value);
00544 if (ret) {
00545 return value;
00546 }
00547
00548
00549 const string &a = GetVocabulary().GetWord( aIdx );
00550 const string &b = GetVocabulary().GetWord( bIdx );
00551
00552
00553 unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
00554 for( unsigned int i=0; i<=a.size(); i++ ) {
00555 cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
00556 cost[i][0] = i;
00557 }
00558 for( unsigned int j=0; j<=b.size(); j++ ) {
00559 cost[0][j] = j;
00560 }
00561
00562
00563 for( unsigned int i=1; i<=a.size(); i++ ) {
00564 for( unsigned int j=1; j<=b.size(); j++ ) {
00565
00566 unsigned int ins = cost[i-1][j] + 1;
00567 unsigned int del = cost[i][j-1] + 1;
00568 bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
00569 unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
00570
00571 unsigned int min = (ins < del) ? ins : del;
00572 min = (diag < min) ? diag : min;
00573
00574 cost[i][j] = min;
00575 }
00576 }
00577
00578
00579 unsigned int final = cost[a.size()][b.size()];
00580 for( unsigned int i=0; i<=a.size(); i++ ) {
00581 free( cost[i] );
00582 }
00583 free( cost );
00584
00585
00586 SetLSEDCache(pIdx, final);
00587 return final;
00588 }
00589
00590
00591
00592 unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
00593 {
00594
00595
00596 unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
00597 char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
00598
00599 for( unsigned int i=0; i<=a.size(); i++ ) {
00600 cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
00601 path[i] = (char*) calloc( sizeof(char), b.size()+1 );
00602 if (i>0) {
00603 cost[i][0] = cost[i-1][0];
00604 if (use_letter_sed) {
00605 cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
00606 } else {
00607 cost[i][0]++;
00608 }
00609 } else {
00610 cost[i][0] = 0;
00611 }
00612 path[i][0] = 'I';
00613 }
00614
00615 for( unsigned int j=0; j<=b.size(); j++ ) {
00616 if (j>0) {
00617 cost[0][j] = cost[0][j-1];
00618 if (use_letter_sed) {
00619 cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
00620 } else {
00621 cost[0][j]++;
00622 }
00623 } else {
00624 cost[0][j] = 0;
00625 }
00626 path[0][j] = 'D';
00627 }
00628
00629
00630 for( unsigned int i=1; i<=a.size(); i++ ) {
00631 for( unsigned int j=1; j<=b.size(); j++ ) {
00632 unsigned int ins = cost[i-1][j];
00633 unsigned int del = cost[i][j-1];
00634 unsigned int match;
00635 if (use_letter_sed) {
00636 ins += GetVocabulary().GetWord( a[i-1] ).size();
00637 del += GetVocabulary().GetWord( b[j-1] ).size();
00638 match = letter_sed( a[i-1], b[j-1] );
00639 } else {
00640 ins++;
00641 del++;
00642 match = ( a[i-1] == b[j-1] ) ? 0 : 1;
00643 }
00644 unsigned int diag = cost[i-1][j-1] + match;
00645
00646 char action = (ins < del) ? 'I' : 'D';
00647 unsigned int min = (ins < del) ? ins : del;
00648 if (diag < min) {
00649 action = (match>0) ? 'S' : 'M';
00650 min = diag;
00651 }
00652
00653 cost[i][j] = min;
00654 path[i][j] = action;
00655 }
00656 }
00657
00658
00659 unsigned int i = a.size();
00660 unsigned int j = b.size();
00661 best_path = "";
00662 while( i>0 || j>0 ) {
00663 best_path = path[i][j] + best_path;
00664 if (path[i][j] == 'I') {
00665 i--;
00666 } else if (path[i][j] == 'D') {
00667 j--;
00668 } else {
00669 i--;
00670 j--;
00671 }
00672 }
00673
00674
00675
00676 unsigned int final = cost[a.size()][b.size()];
00677
00678 for( unsigned int i=0; i<=a.size(); i++ ) {
00679 free( cost[i] );
00680 free( path[i] );
00681 }
00682 free( cost );
00683 free( path );
00684
00685
00686 return final;
00687 }
00688
00689
00690
00691
00692 unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
00693 {
00694 unsigned int length = 0;
00695 for( unsigned int i=0; i<sentence.size(); i++ ) {
00696 length += GetVocabulary().GetWord( sentence[i] ).size();
00697 }
00698 return length;
00699 }
00700
00701
00702
00703 void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
00704 vector< vector< WORD_ID > > input )
00705 {
00706
00707 for(unsigned int i=0; i<input.size(); i++) {
00708 bool use_letter_sed = false;
00709
00710
00711 unsigned int input_length;
00712 if (use_letter_sed) {
00713 input_length = compute_length( input[i] );
00714 } else {
00715 input_length = input[i].size();
00716 }
00717 unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
00718 string best_path = "";
00719
00720
00721
00722 for(unsigned int s=0; s<source.size(); s++) {
00723 int source_length;
00724 if (use_letter_sed) {
00725 source_length = compute_length( source[s] );
00726 } else {
00727 source_length = source[s].size();
00728 }
00729 int diff = abs((int)source_length - (int)input_length);
00730 if (length_filter_flag && (diff >= best_cost)) {
00731 continue;
00732 }
00733
00734
00735 string path;
00736 unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
00737
00738
00739 if (cost < best_cost) {
00740 best_cost = cost;
00741 best_path = path;
00742
00743 }
00744 }
00745
00746 }
00747 }
00748
00749
00750
00751
00752
00753
00754 int FuzzyMatchWrapper::short_match_max_length( int input_length )
00755 {
00756 if ( ! refined_flag )
00757 return 0;
00758 if ( input_length >= 5 )
00759 return 1;
00760 return 0;
00761 }
00762
00763
00764
00765
00766
00767
00768
00769
00770 void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
00771 {
00772 int max_length = short_match_max_length( input.size() );
00773 if (max_length == 0)
00774 return;
00775
00776 wordIndex.clear();
00777
00778
00779 for(size_t i=0; i<input.size(); i++) {
00780 if (wordIndex.find( input[i] ) == wordIndex.end()) {
00781 vector< int > position_vector;
00782 wordIndex[ input[i] ] = position_vector;
00783 }
00784 wordIndex[ input[i] ].push_back( i );
00785 }
00786 }
00787
00788
00789
// Adds single-word matches between the input sentence (via the inverted
// index built by init_short_matches) and TM sentence tm to match,
// computing the same optimistic (min_cost) / pessimistic (max_cost)
// bounds as the long-match loop in ExtractTM. Only matches whose
// optimistic cost is within best_cost are kept.
// translationId is unused in the visible body.
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  int max_length = short_match_max_length( input_length );
  if (max_length == 0)
    return;

  int tm_length = tm.size();
  map< WORD_ID,vector< int > >::iterator input_word_hit;
  for(int t_pos=0; t_pos<tm.size(); t_pos++) {
    // does this TM word occur anywhere in the input?
    input_word_hit = wordIndex.find( tm[t_pos] );
    if (input_word_hit != wordIndex.end()) {
      vector< int > &position_vector = input_word_hit->second;
      for(size_t j=0; j<position_vector.size(); j++) {
        int &i_pos = position_vector[j];

        // cost of the words before the matched word
        int max_cost = max( i_pos , t_pos );
        int min_cost = abs( i_pos - t_pos );
        // equal non-zero offsets still need at least one edit
        if ( i_pos>0 && i_pos == t_pos )
          min_cost++;

        // cost of the words after the matched word
        max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
        min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
        if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
          min_cost++;

        // keep only matches that could still beat the threshold
        if (min_cost <= best_cost) {
          Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
          match.push_back( new_match );
        }
      }
    }
  }
}
00825
00826
00827
00828 vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
00829 {
00830
00831 vector< Match > pruned;
00832 for(int i=match.size()-1; i>=0; i--) {
00833
00834
00835
00836
00837
00838
00839
00840 bool subsumed = false;
00841 for(int j=match.size()-1; j>=0; j--) {
00842 if (i!=j
00843 && ( match[i].input_end - match[i].input_start <=
00844 match[j].input_end - match[j].input_start )
00845 && ((match[i].input_start == match[j].input_start &&
00846 match[i].tm_start == match[j].tm_start ) ||
00847 (match[i].input_end == match[j].input_end &&
00848 match[i].tm_end == match[j].tm_end) ) ) {
00849 subsumed = true;
00850 }
00851 }
00852 if (! subsumed && match[i].min_cost <= best_cost) {
00853
00854 pruned.push_back( match[i] );
00855 }
00856 }
00857
00858 return pruned;
00859 }
00860
00861
00862
// Combines pairwise-compatible matches into ever larger multi-matches
// and returns the best pessimistic (guaranteed) cost found for this TM
// sentence. multi_match[k] holds combinations of k+1 elementary
// matches; level k is built by joining a level-f and a level-(k-f-1)
// combination whose input and TM spans are disjoint and in the same
// order. best_cost (in/out) is tightened whenever a combination
// guarantees a lower cost.
int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  // trivial cases: one match -> its own bound; none -> worst case
  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  int this_best_cost = input_length + tm_length;
  for(size_t i=0; i<match.size(); i++) {
    this_best_cost = min( this_best_cost, match[i].max_cost );
  }

  // level 0 = the elementary matches
  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  int match_level = 1;
  while(multi_match[ match_level-1 ].size()>0) {

    vector< Match > empty;
    multi_match.push_back( empty );

    // join a first_level-combination with a second_level-combination;
    // first_level <= second_level avoids building each pair twice
    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
      int second_level = match_level - first_level -1;

      vector< Match > &first_match = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(size_t i1 = 0; i1 < first_match.size(); i1++) {
        for(size_t i2 = 0; i2 < second_match.size(); i2++) {

          // same level: only consider unordered pairs once
          if (first_level == second_level && i2 <= i1) {
            continue;
          }

          // order the pair by input start position
          Match *first, *second;
          if (first_match[i1].input_start < second_match[i2].input_start ) {
            first = &first_match[i1];
            second = &second_match[i2];
          } else {
            second = &first_match[i1];
            first = &second_match[i2];
          }

          // input spans must not overlap
          if (first->input_end >= second->input_start) {
            continue;
          }

          // TM spans must not overlap (and stay in the same order)
          if (first->tm_end >= second->tm_start) {
            continue;
          }

          int min_cost = 0;
          int max_cost = 0;

          // cost of the words before the first match
          min_cost += abs( first->input_start - first->tm_start );
          max_cost += max( first->input_start, first->tm_start );

          // equal non-zero leading offsets still need >= 1 edit
          if (first->input_start == first->tm_start && first->input_start > 0) {
            min_cost++;
          }

          // cost of the gap between the two matches
          int skipped_words = second->input_start - first->input_end -1;
          int skipped_words_tm = second->tm_start - first->tm_end -1;
          int internal_cost = max( skipped_words, skipped_words_tm );
          internal_cost += first->internal_cost + second->internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          // cost of the words after the second match
          min_cost += abs( (tm_length-1 - second->tm_end) -
                           (input_length-1 - second->input_end) );
          max_cost += max( (tm_length-1 - second->tm_end),
                           (input_length-1 - second->input_end) );

          // equal non-zero trailing remainders still need >= 1 edit
          if ( ( input_length-1 - second->input_end
                 == tm_length-1 - second->tm_end )
               && input_length-1 != second->input_end ) {
            min_cost++;
          }

          // even the optimistic bound is too expensive -> prune
          if (min_cost > best_cost) {
            continue;
          }

          // record the combined match for the next level
          Match new_match( first->input_start,
                           second->input_end,
                           first->tm_start,
                           second->tm_end,
                           min_cost,
                           max_cost,
                           internal_cost);
          multi_match[ match_level ].push_back( new_match );

          // a guaranteed cost bound tightens both thresholds
          if (max_cost < this_best_cost) {

            this_best_cost = max_cost;

            if (max_cost < best_cost) {

              best_cost = max_cost;
            }
          }
        }
      }
    }
    match_level++;
  }
  return this_best_cost;
}
01001
01002
01003 void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path, ofstream &outputFile)
01004 {
01005 string sourceStr;
01006 for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
01007 WORD_ID wordId = sourceSentence[pos];
01008 sourceStr += GetVocabulary().GetWord(wordId) + " ";
01009 }
01010
01011 for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
01012 const SentenceAlignment &sentenceAlignment = targets[targetInd];
01013 string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
01014 string alignStr = sentenceAlignment.getAlignmentString();
01015
01016 outputFile
01017 << sentenceInd << endl
01018 << cost << endl
01019 << sourceStr << endl
01020 << inputStr << endl
01021 << targetStr << endl
01022 << alignStr << endl
01023 << path << endl
01024 << sentenceAlignment.count << endl;
01025
01026 }
01027 }
01028
01029 }