00001
00002
00003
00004
00005
00006
00007
00008
00009 #include <iostream>
00010 #include "FuzzyMatchWrapper.h"
00011 #include "SentenceAlignment.h"
00012 #include "Match.h"
00013 #include "create_xml.h"
00014 #include "moses/Util.h"
00015 #include "moses/StaticData.h"
00016 #include "util/file.hh"
00017
00018 using namespace std;
00019
00020 namespace tmmt
00021 {
00022
// Builds the in-memory translation memory for fuzzy matching:
// a suffix array over the source side of the TM, plus (per source
// sentence) the target alternatives and their word alignments.
// NOTE(review): suffixArray is allocated with raw new and never freed
// anywhere in this file — presumably it lives for the whole process;
// confirm the destructor (not visible here) releases it.
FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
  :basic_flag(false)          // plain whole-sentence SED strategy disabled
  ,lsed_flag(true)            // letter-based SED used for final tie-breaking
  ,refined_flag(true)         // enables short (single-word) match refinement
  ,length_filter_flag(true)   // prune TM sentences by length difference
  ,parse_flag(true)           // combine matches via parse_matches()
  ,min_match(70)              // minimum fuzzy-match score, in percent
  ,multiple_flag(true)        // report all best TM hits, not just one
  ,multiple_slack(0)
  ,multiple_max(100)
{
  cerr << "creating suffix array" << endl;
  suffixArray = new tmmt::SuffixArray( sourcePath );

  cerr << "loading target data" << endl;
  load_target(targetPath, targetAndAlignment);

  // load_alignment fills the alignment fields of the SentenceAlignment
  // objects created by load_target, so the order of these calls matters
  cerr << "loading alignment" << endl;
  load_alignment(alignmentPath, targetAndAlignment);

  cerr << "loading completed" << endl;
}
00051
00052 string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
00053 {
00054 const Moses::StaticData &staticData = Moses::StaticData::Instance();
00055
00056 WordIndex wordIndex;
00057
00058 string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
00059
00060
00061 create_xml(fuzzyMatchFile);
00062
00063
00064 string cmd;
00065 cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
00066 + fuzzyMatchFile + ".extract.sorted.gz";
00067 system(cmd.c_str());
00068 cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
00069 + fuzzyMatchFile + ".extract.inv.sorted.gz";
00070 system(cmd.c_str());
00071
00072 #ifdef IS_XCODE
00073 cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
00074 #elif IS_ECLIPSE
00075 cmd = "/home/hieu/workspace/github/moses-smt/bin";
00076 #else
00077 cmd = staticData.GetBinDirectory();
00078 #endif
00079
00080 cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
00081 + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
00082 + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
00083 system(cmd.c_str());
00084
00085
00086 return fuzzyMatchFile + ".pt.gz";
00087 }
00088
// Core TM lookup for one input sentence.
// Pipeline:
//   1. for every input start position, find all maximal substring
//      matches in the suffix array (match_range);
//   2. turn those ranges into per-TM-sentence Match objects, pruned by
//      optimistic/pessimistic cost bounds against best_cost;
//   3. validate candidate TM sentences (short-match refinement, match
//      pruning, then either full SED or parse_matches combination);
//   4. write extract records for the best TM sentence(s).
// Returns the path of the extract file written into dirNameStr.
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
  const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();

  string inputPath = dirNameStr + "/in";
  string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
  ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());

  vector< vector< WORD_ID > > input;
  load_corpus(inputPath, input);

  // exactly one sentence is expected in the per-request input file
  assert(input.size() == 1);
  size_t sentenceInd = 0;

  clock_t start_clock = clock();

  int input_length = input[sentenceInd].size();
  // maximum edit cost that still satisfies the min_match threshold;
  // tightened as better matches are found below
  int best_cost = input_length * (100-min_match) / 100 + 1;

  int match_count = 0;

  // Step 1: for each start position, grow the matched substring word by
  // word; match_range[start][len-1] is the suffix-array interval of
  // occurrences of input[start..start+len-1].
  vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
  for(int start=0; start<input[sentenceInd].size(); start++) {
    SuffixArray::INDEX prior_first_match = 0;
    SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
    vector< string > substring;
    bool stillMatched = true;
    vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;

    for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
      substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );

      // narrow the search to the interval of the previous (shorter) match
      SuffixArray::INDEX first_match, last_match;
      stillMatched = false;
      if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
        stillMatched = true;
        matchedAtThisStart.push_back( make_pair( first_match, last_match ) );

        prior_first_match = first_match;
        prior_last_match = last_match;
      }

    }

    match_range.push_back( matchedAtThisStart );
  }

  clock_t clock_range = clock();

  // Step 2: collect Match objects per TM sentence id
  map< int, vector< Match > > sentence_match;
  map< int, int > sentence_match_word_count;

  // longest matches first, so best_cost tightens as early as possible
  for(int length = input[sentenceInd].size(); length >= 1; length--) {

    // very short matches are handled separately by add_short_matches()
    if (length <= short_match_max_length( input_length ) ) {
      continue;
    }

    unsigned int count = 0;
    for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
      if (match_range[start].size() >= length) {
        pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];

        count += range.second - range.first + 1;

        // each suffix-array position in the interval is one occurrence
        for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
          size_t position = suffixArray->GetPosition( i );

          size_t sentence_id = suffixArray->GetSentence( position );
          int sentence_length = suffixArray->GetSentenceLength( sentence_id );
          int diff = abs( (int)sentence_length - (int)input_length );

          // length difference alone is a lower bound on the edit cost
          if (diff > best_cost)
            continue;

          int start_pos = suffixArray->GetWordInSentence( position );
          int end_pos = start_pos + length-1;

          // optimistic cost of the words before the match
          int min_cost = abs( start - start_pos );

          // same non-zero offset on both sides still needs >= 1 edit
          if (start == start_pos && start>0)
            min_cost++;

          // optimistic cost of the words after the match
          min_cost += abs( ( sentence_length-1 - end_pos ) -
                           ( input_length-1 - (start+length-1) ) );

          if ( sentence_length-1 - end_pos ==
               input_length-1 - (start+length-1)
               && end_pos != sentence_length-1 )
            min_cost++;

          // even the best case is too expensive -> skip
          if (min_cost > best_cost)
            continue;

          match_count++;

          // pessimistic cost: everything outside the match is an edit
          int max_cost = max( start, start_pos )
                         + max( sentence_length-1 - end_pos,
                                input_length-1 - (start+length-1) );

          Match m = Match( start, start+length-1,
                           start_pos, start_pos+length-1,
                           min_cost, max_cost, 0);
          sentence_match[ sentence_id ].push_back( m );
          sentence_match_word_count[ sentence_id ] += length;

          // a guaranteed cost bound tightens the global threshold
          if (max_cost < best_cost) {
            best_cost = max_cost;
            if (best_cost == 0) break;
          }

        }
      }

      if (best_cost == 0) break;

    }

    // a perfect match was found, nothing can beat it
    if (best_cost == 0) break;

  }
  cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;

  clock_t clock_matches = clock();

  // Step 3: validate the candidate TM sentences
  int old_best_cost = best_cost;
  int tm_count_word_match = 0;
  int tm_count_word_match2 = 0;
  int pruned_match_count = 0;
  if (short_match_max_length( input_length )) {
    init_short_matches(wordIndex, translationId, input[sentenceInd] );
  }
  vector< int > best_tm;
  typedef map< int, vector< Match > >::iterator I;

  clock_t clock_validation_sum = 0;

  for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
    int tmID = tm->first;
    int tm_length = suffixArray->GetSentenceLength(tmID);
    vector< Match > &match = tm->second;
    add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );

    // filter 1: not enough words covered by matches at all
    int words_matched = 0;
    for(size_t m=0; m<match.size(); m++) {

      if (match[m].min_cost <= best_cost)
        words_matched += match[m].input_end - match[m].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match++;

    // filter 2: same check after removing subsumed matches
    vector< Match > pruned = prune_matches( match, best_cost );
    words_matched = 0;
    for(size_t p=0; p<pruned.size(); p++) {
      words_matched += pruned[p].input_end - pruned[p].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match2++;

    pruned_match_count += pruned.size();
    int prior_best_cost = best_cost;
    int cost;

    clock_t clock_validation_start = clock();
    // with many surviving matches the combination search explodes,
    // so fall back to a plain word-level edit distance
    if (! parse_flag ||
        pruned.size()>=10) {
      string path;
      cost = sed( input[sentenceInd], source[tmID], path, false );
      if (cost < best_cost) {
        best_cost = cost;
      }
    }

    else {
      cost = parse_matches( pruned, input_length, tm_length, best_cost );
      // best_cost improved -> earlier "best" TMs are no longer best
      if (prior_best_cost != best_cost) {
        best_tm.clear();
      }
    }
    clock_validation_sum += clock() - clock_validation_start;
    if (cost == best_cost) {
      best_tm.push_back( tmID );
    }
  }
  cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
  cerr << "tm considered: " << sentence_match.size()
       << " word-matched: " << tm_count_word_match
       << " word-matched2: " << tm_count_word_match2
       << " best: " << best_tm.size() << endl;

  cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;

  // Step 4: emit extract records
  // NOTE(review): sourceStr is declared but never used in this function
  string inputStr, sourceStr;
  for (size_t pos = 0; pos < input_length; ++pos) {
    inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
  }

  // either report every best TM hit ...
  if (multiple_flag) {
    for(size_t si=0; si<best_tm.size(); si++) {
      int s = best_tm[si];
      string path;
      sed( input[sentenceInd], source[s], path, true );
      const vector<WORD_ID> &sourceSentence = source[s];
      vector<SentenceAlignment> &targets = targetAndAlignment[s];
      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);

    }
  }
  // ... or pick a single best one
  else {

    string best_path = "";
    int best_match = -1;
    unsigned int best_letter_cost;
    if (lsed_flag) {
      // break ties between equal word-level costs by letter-level SED
      best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
      for(size_t si=0; si<best_tm.size(); si++) {
        int s = best_tm[si];
        string path;
        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
        if (letter_cost < best_letter_cost) {
          best_letter_cost = letter_cost;
          best_path = path;
          best_match = s;
        }
      }
    }

    else {
      if (best_tm.size() > 0) {
        string path;
        sed( input[sentenceInd], source[best_tm[0]], path, false );
        best_path = path;
        best_match = best_tm[0];
      }
    }
    cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
         << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
         << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
         << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
         << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
         << " )" << endl;
    // leftover debug-output stubs, intentionally empty
    if (lsed_flag) {

    }

    if (lsed_flag) {

    }

    // no TM sentence survived the filters: fall back to sentence 0
    if (best_match == -1) {
      UTIL_THROW_IF2(source.size() == 0, "Empty source phrase");
      best_match = 0;
    }

    const vector<WORD_ID> &sourceSentence = source[best_match];
    vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
    create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);

  }

  fuzzyMatchStream.close();

  return fuzzyMatchFile;
}
00395
00396 void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
00397 {
00398
00399 ifstream fileStream;
00400 fileStream.open(fileName.c_str());
00401 if (!fileStream) {
00402 cerr << "file not found: " << fileName << endl;
00403 exit(1);
00404 }
00405 cerr << "loading " << fileName << endl;
00406
00407 istream *fileStreamP = &fileStream;
00408
00409 string line;
00410 while(getline(*fileStreamP, line)) {
00411 corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
00412 }
00413 }
00414
00415 void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
00416 {
00417 ifstream fileStream;
00418 fileStream.open(fileName.c_str());
00419 if (!fileStream) {
00420 cerr << "file not found: " << fileName << endl;
00421 exit(1);
00422 }
00423 cerr << "loading " << fileName << endl;
00424
00425 istream *fileStreamP = &fileStream;
00426
00427 WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
00428
00429 int lineNum = 0;
00430 string line;
00431 while(getline(*fileStreamP, line)) {
00432 vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
00433
00434 corpus.push_back(vector< SentenceAlignment >());
00435 vector< SentenceAlignment > &vec = corpus.back();
00436
00437 vec.push_back(SentenceAlignment());
00438 SentenceAlignment *sentence = &vec.back();
00439
00440 const WORD &countStr = GetVocabulary().GetWord(toks[0]);
00441 sentence->count = atoi(countStr.c_str());
00442
00443 for (size_t i = 1; i < toks.size(); ++i) {
00444 WORD_ID wordId = toks[i];
00445
00446 if (wordId == delimiter) {
00447
00448 vec.push_back(SentenceAlignment());
00449 sentence = &vec.back();
00450
00451
00452 ++i;
00453
00454 const WORD &countStr = GetVocabulary().GetWord(toks[i]);
00455 sentence->count = atoi(countStr.c_str());
00456 } else {
00457
00458 sentence->target.push_back(wordId);
00459 }
00460 }
00461
00462 ++lineNum;
00463
00464 }
00465
00466 }
00467
00468
00469 void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
00470 {
00471 ifstream fileStream;
00472 fileStream.open(fileName.c_str());
00473 if (!fileStream) {
00474 cerr << "file not found: " << fileName << endl;
00475 exit(1);
00476 }
00477 cerr << "loading " << fileName << endl;
00478
00479 istream *fileStreamP = &fileStream;
00480
00481 string delimiter = "|||";
00482
00483 int lineNum = 0;
00484 string line;
00485 while(getline(*fileStreamP, line)) {
00486 vector< SentenceAlignment > &vec = corpus[lineNum];
00487 size_t targetInd = 0;
00488 SentenceAlignment *sentence = &vec[targetInd];
00489
00490 vector<string> toks = Moses::Tokenize(line);
00491
00492 for (size_t i = 0; i < toks.size(); ++i) {
00493 string &tok = toks[i];
00494
00495 if (tok == delimiter) {
00496
00497 ++targetInd;
00498 sentence = &vec[targetInd];
00499
00500 ++i;
00501 } else {
00502
00503 vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
00504 assert(alignPoint.size() == 2);
00505 sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
00506 }
00507 }
00508
00509 ++lineNum;
00510
00511 }
00512 }
00513
00514 bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
00515 {
00516 #ifdef WITH_THREADS
00517 boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
00518 #endif
00519 map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
00520 if (lookup != m_lsed.end()) {
00521 value = lookup->second;
00522 return true;
00523 }
00524
00525 return false;
00526 }
00527
00528 void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
00529 {
00530 #ifdef WITH_THREADS
00531 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00532 #endif
00533 m_lsed[ key ] = value;
00534 }
00535
00536
00537
00538 unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
00539 {
00540
00541 pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
00542 unsigned int value;
00543 bool ret = GetLSEDCache(pIdx, value);
00544 if (ret) {
00545 return value;
00546 }
00547
00548
00549 const string &a = GetVocabulary().GetWord( aIdx );
00550 const string &b = GetVocabulary().GetWord( bIdx );
00551
00552
00553 unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
00554 for( unsigned int i=0; i<=a.size(); i++ ) {
00555 cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
00556 cost[i][0] = i;
00557 }
00558 for( unsigned int j=0; j<=b.size(); j++ ) {
00559 cost[0][j] = j;
00560 }
00561
00562
00563 for( unsigned int i=1; i<=a.size(); i++ ) {
00564 for( unsigned int j=1; j<=b.size(); j++ ) {
00565
00566 unsigned int ins = cost[i-1][j] + 1;
00567 unsigned int del = cost[i][j-1] + 1;
00568 bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
00569 unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
00570
00571 unsigned int min = (ins < del) ? ins : del;
00572 min = (diag < min) ? diag : min;
00573
00574 cost[i][j] = min;
00575 }
00576 }
00577
00578
00579 unsigned int final = cost[a.size()][b.size()];
00580 for( unsigned int i=0; i<=a.size(); i++ ) {
00581 free( cost[i] );
00582 }
00583 free( cost );
00584
00585
00586 SetLSEDCache(pIdx, final);
00587 return final;
00588 }
00589
00590
00591
00592 unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
00593 {
00594
00595
00596 unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
00597 char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
00598
00599 for( unsigned int i=0; i<=a.size(); i++ ) {
00600 cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
00601 path[i] = (char*) calloc( sizeof(char), b.size()+1 );
00602 if (i>0) {
00603 cost[i][0] = cost[i-1][0];
00604 if (use_letter_sed) {
00605 cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
00606 } else {
00607 cost[i][0]++;
00608 }
00609 } else {
00610 cost[i][0] = 0;
00611 }
00612 path[i][0] = 'I';
00613 }
00614
00615 for( unsigned int j=0; j<=b.size(); j++ ) {
00616 if (j>0) {
00617 cost[0][j] = cost[0][j-1];
00618 if (use_letter_sed) {
00619 cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
00620 } else {
00621 cost[0][j]++;
00622 }
00623 } else {
00624 cost[0][j] = 0;
00625 }
00626 path[0][j] = 'D';
00627 }
00628
00629
00630 for( unsigned int i=1; i<=a.size(); i++ ) {
00631 for( unsigned int j=1; j<=b.size(); j++ ) {
00632 unsigned int ins = cost[i-1][j];
00633 unsigned int del = cost[i][j-1];
00634 unsigned int match;
00635 if (use_letter_sed) {
00636 ins += GetVocabulary().GetWord( a[i-1] ).size();
00637 del += GetVocabulary().GetWord( b[j-1] ).size();
00638 match = letter_sed( a[i-1], b[j-1] );
00639 } else {
00640 ins++;
00641 del++;
00642 match = ( a[i-1] == b[j-1] ) ? 0 : 1;
00643 }
00644 unsigned int diag = cost[i-1][j-1] + match;
00645
00646 char action = (ins < del) ? 'I' : 'D';
00647 unsigned int min = (ins < del) ? ins : del;
00648 if (diag < min) {
00649 action = (match>0) ? 'S' : 'M';
00650 min = diag;
00651 }
00652
00653 cost[i][j] = min;
00654 path[i][j] = action;
00655 }
00656 }
00657
00658
00659 unsigned int i = a.size();
00660 unsigned int j = b.size();
00661 best_path = "";
00662 while( i>0 || j>0 ) {
00663 best_path = path[i][j] + best_path;
00664 if (path[i][j] == 'I') {
00665 i--;
00666 } else if (path[i][j] == 'D') {
00667 j--;
00668 } else {
00669 i--;
00670 j--;
00671 }
00672 }
00673
00674
00675
00676 unsigned int final = cost[a.size()][b.size()];
00677
00678 for( unsigned int i=0; i<=a.size(); i++ ) {
00679 free( cost[i] );
00680 free( path[i] );
00681 }
00682 free( cost );
00683 free( path );
00684
00685
00686 return final;
00687 }
00688
00689
00690
00691
00692 unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
00693 {
00694 unsigned int length = 0;
00695 for( unsigned int i=0; i<sentence.size(); i++ ) {
00696 length += GetVocabulary().GetWord( sentence[i] ).size();
00697 }
00698 return length;
00699 }
00700
00701
00702
00703 void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
00704 vector< vector< WORD_ID > > input )
00705 {
00706
00707 for(unsigned int i=0; i<input.size(); i++) {
00708 bool use_letter_sed = false;
00709
00710
00711 unsigned int input_length;
00712 if (use_letter_sed) {
00713 input_length = compute_length( input[i] );
00714 } else {
00715 input_length = input[i].size();
00716 }
00717 unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
00718 string best_path = "";
00719
00720
00721
00722 for(unsigned int s=0; s<source.size(); s++) {
00723 int source_length;
00724 if (use_letter_sed) {
00725 source_length = compute_length( source[s] );
00726 } else {
00727 source_length = source[s].size();
00728 }
00729 int diff = abs((int)source_length - (int)input_length);
00730 if (length_filter_flag && (diff >= best_cost)) {
00731 continue;
00732 }
00733
00734
00735 string path;
00736 unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
00737
00738
00739 if (cost < best_cost) {
00740 best_cost = cost;
00741 best_path = path;
00742
00743 }
00744 }
00745
00746 }
00747 }
00748
00749
00750
00751
00752
00753
00754 int FuzzyMatchWrapper::short_match_max_length( int input_length )
00755 {
00756 if ( ! refined_flag )
00757 return 0;
00758 if ( input_length >= 5 )
00759 return 1;
00760 return 0;
00761 }
00762
00763
00764
00765
00766
00767
00768
00769
00770 void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
00771 {
00772 int max_length = short_match_max_length( input.size() );
00773 if (max_length == 0)
00774 return;
00775
00776 wordIndex.clear();
00777
00778
00779 for(size_t i=0; i<input.size(); i++) {
00780 if (wordIndex.find( input[i] ) == wordIndex.end()) {
00781 vector< int > position_vector;
00782 wordIndex[ input[i] ] = position_vector;
00783 }
00784 wordIndex[ input[i] ].push_back( i );
00785 }
00786 }
00787
00788
00789
// Adds single-word matches between the input sentence (via the inverted
// index built by init_short_matches) and TM sentence tm to match,
// computing the same optimistic (min_cost) / pessimistic (max_cost)
// bounds as the long-match loop in ExtractTM. Only matches whose
// optimistic cost is within best_cost are kept.
// translationId is unused in the visible body.
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  int max_length = short_match_max_length( input_length );
  if (max_length == 0)
    return;

  int tm_length = tm.size();
  map< WORD_ID,vector< int > >::iterator input_word_hit;
  for(int t_pos=0; t_pos<tm.size(); t_pos++) {
    // does this TM word occur anywhere in the input?
    input_word_hit = wordIndex.find( tm[t_pos] );
    if (input_word_hit != wordIndex.end()) {
      vector< int > &position_vector = input_word_hit->second;
      for(size_t j=0; j<position_vector.size(); j++) {
        int &i_pos = position_vector[j];

        // cost of the words before the matched word
        int max_cost = max( i_pos , t_pos );
        int min_cost = abs( i_pos - t_pos );
        // equal non-zero offsets still need at least one edit
        if ( i_pos>0 && i_pos == t_pos )
          min_cost++;

        // cost of the words after the matched word
        max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
        min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
        if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
          min_cost++;

        // keep only matches that could still beat the threshold
        if (min_cost <= best_cost) {
          Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
          match.push_back( new_match );
        }
      }
    }
  }
}
00825
00826
00827
00828 vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
00829 {
00830
00831 vector< Match > pruned;
00832 for(int i=match.size()-1; i>=0; i--) {
00833
00834
00835
00836
00837
00838
00839
00840 bool subsumed = false;
00841 for(int j=match.size()-1; j>=0; j--) {
00842 if (i!=j
00843 && ( match[i].input_end - match[i].input_start <=
00844 match[j].input_end - match[j].input_start )
00845 && ((match[i].input_start == match[j].input_start &&
00846 match[i].tm_start == match[j].tm_start ) ||
00847 (match[i].input_end == match[j].input_end &&
00848 match[i].tm_end == match[j].tm_end) ) ) {
00849 subsumed = true;
00850 }
00851 }
00852 if (! subsumed && match[i].min_cost <= best_cost) {
00853
00854 pruned.push_back( match[i] );
00855 }
00856 }
00857
00858 return pruned;
00859 }
00860
00861
00862
// Combines pairwise-compatible matches into ever larger multi-matches
// and returns the best pessimistic (guaranteed) cost found for this TM
// sentence. multi_match[k] holds combinations of k+1 elementary
// matches; level k is built by joining a level-f and a level-(k-f-1)
// combination whose input and TM spans are disjoint and in the same
// order. best_cost (in/out) is tightened whenever a combination
// guarantees a lower cost.
int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  // trivial cases: one match -> its own bound; none -> worst case
  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  int this_best_cost = input_length + tm_length;
  for(size_t i=0; i<match.size(); i++) {
    this_best_cost = min( this_best_cost, match[i].max_cost );
  }

  // level 0 = the elementary matches
  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  int match_level = 1;
  while(multi_match[ match_level-1 ].size()>0) {

    vector< Match > empty;
    multi_match.push_back( empty );

    // join a first_level-combination with a second_level-combination;
    // first_level <= second_level avoids building each pair twice
    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
      int second_level = match_level - first_level -1;

      vector< Match > &first_match = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(size_t i1 = 0; i1 < first_match.size(); i1++) {
        for(size_t i2 = 0; i2 < second_match.size(); i2++) {

          // same level: only consider unordered pairs once
          if (first_level == second_level && i2 <= i1) {
            continue;
          }

          // order the pair by input start position
          Match *first, *second;
          if (first_match[i1].input_start < second_match[i2].input_start ) {
            first = &first_match[i1];
            second = &second_match[i2];
          } else {
            second = &first_match[i1];
            first = &second_match[i2];
          }

          // input spans must not overlap
          if (first->input_end >= second->input_start) {
            continue;
          }

          // TM spans must not overlap (and stay in the same order)
          if (first->tm_end >= second->tm_start) {
            continue;
          }

          int min_cost = 0;
          int max_cost = 0;

          // cost of the words before the first match
          min_cost += abs( first->input_start - first->tm_start );
          max_cost += max( first->input_start, first->tm_start );

          // equal non-zero leading offsets still need >= 1 edit
          if (first->input_start == first->tm_start && first->input_start > 0) {
            min_cost++;
          }

          // cost of the gap between the two matches
          int skipped_words = second->input_start - first->input_end -1;
          int skipped_words_tm = second->tm_start - first->tm_end -1;
          int internal_cost = max( skipped_words, skipped_words_tm );
          internal_cost += first->internal_cost + second->internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          // cost of the words after the second match
          min_cost += abs( (tm_length-1 - second->tm_end) -
                           (input_length-1 - second->input_end) );
          max_cost += max( (tm_length-1 - second->tm_end),
                           (input_length-1 - second->input_end) );

          // equal non-zero trailing remainders still need >= 1 edit
          if ( ( input_length-1 - second->input_end
                 == tm_length-1 - second->tm_end )
               && input_length-1 != second->input_end ) {
            min_cost++;
          }

          // even the optimistic bound is too expensive -> prune
          if (min_cost > best_cost) {
            continue;
          }

          // record the combined match for the next level
          Match new_match( first->input_start,
                           second->input_end,
                           first->tm_start,
                           second->tm_end,
                           min_cost,
                           max_cost,
                           internal_cost);
          multi_match[ match_level ].push_back( new_match );

          // a guaranteed cost bound tightens both thresholds
          if (max_cost < this_best_cost) {

            this_best_cost = max_cost;

            if (max_cost < best_cost) {

              best_cost = max_cost;
            }
          }
        }
      }
    }
    match_level++;
  }
  return this_best_cost;
}
01001
01002
01003 void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path, ofstream &outputFile)
01004 {
01005 string sourceStr;
01006 for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
01007 WORD_ID wordId = sourceSentence[pos];
01008 sourceStr += GetVocabulary().GetWord(wordId) + " ";
01009 }
01010
01011 for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
01012 const SentenceAlignment &sentenceAlignment = targets[targetInd];
01013 string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
01014 string alignStr = sentenceAlignment.getAlignmentString();
01015
01016 outputFile
01017 << sentenceInd << endl
01018 << cost << endl
01019 << sourceStr << endl
01020 << inputStr << endl
01021 << targetStr << endl
01022 << alignStr << endl
01023 << path << endl
01024 << sentenceAlignment.count << endl;
01025
01026 }
01027 }
01028
01029 }