00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032 #include "tercalc.h"
00033 using namespace std;
00034 using namespace TERCPPNS_Tools;
00035 namespace TERCPPNS_TERCpp
00036 {
00037
00038 terCalc::terCalc()
00039 {
00040 TAILLE_PERMUT_MAX = 10;
00041 NBR_PERMUT_MAX = 10;
00042 infinite = 99999.0;
00043 shift_cost = 1.0;
00044 insert_cost = 1.0;
00045 delete_cost = 1.0;
00046 substitute_cost = 1.0;
00047 match_cost = 0.0;
00048 NBR_SEGS_EVALUATED = 0;
00049 NBR_PERMUTS_CONSID = 0;
00050 NBR_BS_APPELS = 0;
00051 TAILLE_BEAM = 10;
00052 DIST_MAX_PERMUT = 25;
00053 PRINT_DEBUG = false;
00054 hypSpans.clear();
00055 refSpans.clear();
00056 CALL_TER_ALIGN=0;
00057 CALL_CALC_PERMUT=0;
00058 CALL_FIND_BSHIFT=0;
00059 MAX_LENGTH_SENTENCE=10;
00060 S = new vector < vector < double > >(MAX_LENGTH_SENTENCE, std::vector<double>(MAX_LENGTH_SENTENCE,0.0));
00061 P = new vector < vector < char > >(MAX_LENGTH_SENTENCE, std::vector<char>(MAX_LENGTH_SENTENCE,' '));
00062 }
00063
00064 terCalc::~terCalc()
00065 {
00066 delete(S);
00067 delete(P);
00068 }
00069
00070
00071 terAlignment terCalc::WERCalculation ( vector< string >& hyp , vector< string >& ref )
00072 {
00073
00074 return minimizeDistanceEdition ( hyp, ref, hypSpans );
00075
00076 }
00077
00078 terAlignment terCalc::TER ( vector< int >& hyp, vector< int >& ref )
00079 {
00080 stringstream s;
00081 s.str ( "" );
00082 string stringRef ( "" );
00083 string stringHyp ( "" );
00084 for ( vector<int>::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) {
00085 if ( l_it == ref.begin() ) {
00086 s << ( *l_it );
00087 } else {
00088 s << " " << ( *l_it );
00089 }
00090 }
00091 stringRef = s.str();
00092 s.str ( "" );
00093 for ( vector<int>::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) {
00094 if ( l_itHyp == hyp.begin() ) {
00095 s << ( *l_itHyp );
00096 } else {
00097 s << " " << ( *l_itHyp );
00098 }
00099 }
00100 stringHyp = s.str();
00101 s.str ( "" );
00102 vector<string> l_vref=stringToVector ( stringRef , " " );
00103 vector<string> l_vhyp=stringToVector ( stringHyp , " " );
00104 return TER ( l_vhyp , l_vref);
00105 }
00106
00107
00108 hashMapInfos terCalc::createConcordMots ( vector< string >& hyp, vector< string >& ref )
00109 {
00110 hashMap tempHash;
00111 hashMapInfos retour;
00112 for ( int i = 0; i < ( int ) hyp.size(); i++ ) {
00113 tempHash.addHasher ( hyp.at ( i ), "" );
00114 }
00115 bool cor[ref.size() ];
00116 for ( int i = 0; i < ( int ) ref.size(); i++ ) {
00117 if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) {
00118 cor[i] = true;
00119 } else {
00120 cor[i] = false;
00121 }
00122 }
00123 for ( int start = 0; start < ( int ) ref.size(); start++ ) {
00124 if ( cor[start] ) {
00125 for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) ); end++ ) {
00126 vector<string> ajouter = subVector ( ref, start, end + 1 );
00127 string ajouterString = vectorToString ( ajouter );
00128 vector<int> values = retour.getValue ( ajouterString );
00129 values.push_back ( start );
00130 if ( values.size() > 1 ) {
00131 retour.setValue ( ajouterString, values );
00132 } else {
00133 retour.addValue ( ajouterString, values );
00134 }
00135 }
00136 }
00137 }
00138 return retour;
00139 }
00140
00141 bool terCalc::trouverIntersection ( vecInt& refSpan, vecInt& hypSpan )
00142 {
00143 if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) {
00144 return true;
00145 }
00146 return false;
00147 }
00148
00149
00150 terAlignment terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans )
00151 {
00152 double current_best = infinite;
00153 double last_best = infinite;
00154 int first_good = 0;
00155 int current_first_good = 0;
00156 int last_good = -1;
00157 int cur_last_good = 0;
00158 int last_peak = 0;
00159 int cur_last_peak = 0;
00160 int i=0;
00161 int j=0;
00162 int ref_size=0 ;
00163 ref_size=( int ) ref.size();
00164 int hyp_size=0;
00165 hyp_size=( int ) hyp.size();
00166 double cost, icost, dcost;
00167 double score;
00168 delete(S);
00169 delete(P);
00170 S = new vector < vector < double > >(ref_size+1, std::vector<double>(hyp_size+1,-1.0));
00171 P = new vector < vector < char > >(ref_size+1, std::vector<char>(hyp_size+1,'0'));
00172
00173
00174
00175 NBR_BS_APPELS++;
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186 S->at(0).at(0) = 0.0;
00187 for ( j = 0; j <= hyp_size; j++ ) {
00188 last_best = current_best;
00189 current_best = infinite;
00190 first_good = current_first_good;
00191 current_first_good = -1;
00192 last_good = cur_last_good;
00193 cur_last_good = -1;
00194 last_peak = cur_last_peak;
00195 cur_last_peak = 0;
00196 for ( i = first_good; i <= ref_size; i++ ) {
00197 if ( i > last_good ) {
00198 break;
00199 }
00200 if ( S->at(i).at(j) < 0 ) {
00201 continue;
00202 }
00203 score = S->at(i).at(j);
00204 if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) {
00205 continue;
00206 }
00207 if ( current_first_good == -1 ) {
00208 current_first_good = i ;
00209 }
00210 if ( ( i < ref_size ) && ( j < hyp_size ) ) {
00211 if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) {
00212 if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) {
00213 cost = match_cost + score;
00214 if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) {
00215 S->at(i+1).at(j+1) = cost;
00216 P->at(i+1).at(j+1) = 'A';
00217 }
00218 if ( cost < current_best ) {
00219 current_best = cost;
00220 }
00221 if ( current_best == cost ) {
00222 cur_last_peak = i + 1;
00223 }
00224 } else {
00225 cost = substitute_cost + score;
00226 if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) {
00227 S->at(i+1).at(j+1) = cost;
00228 P->at(i+1).at(j+1) = 'S';
00229 if ( cost < current_best ) {
00230 current_best = cost;
00231 }
00232 if ( current_best == cost ) {
00233 cur_last_peak = i + 1 ;
00234 }
00235 }
00236 }
00237 }
00238 }
00239 cur_last_good = i + 1;
00240 if ( j < hyp_size ) {
00241 icost = score + insert_cost;
00242 if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) {
00243 S->at(i).at(j+1) = icost;
00244 P->at(i).at(j+1) = 'I';
00245 if ( ( cur_last_peak < i ) && ( current_best == icost ) ) {
00246 cur_last_peak = i;
00247 }
00248 }
00249 }
00250 if ( i < ref_size ) {
00251 dcost = score + delete_cost;
00252 if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) {
00253 S->at(i+1).at(j) = dcost;
00254 P->at(i+1).at(j) = 'D';
00255 if ( i >= last_good ) {
00256 last_good = i + 1 ;
00257 }
00258 }
00259 }
00260 }
00261 }
00262
00263
00264 int tracelength = 0;
00265 i = ref.size();
00266 j = hyp.size();
00267 while ( ( i > 0 ) || ( j > 0 ) ) {
00268 tracelength++;
00269 if ( P->at(i).at(j) == 'A' ) {
00270 i--;
00271 j--;
00272 } else if ( P->at(i).at(j) == 'S' ) {
00273 i--;
00274 j--;
00275 } else if ( P->at(i).at(j) == 'D' ) {
00276 i--;
00277 } else if ( P->at(i).at(j) == 'I' ) {
00278 j--;
00279 } else {
00280 cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P->at(i).at(j) << endl;
00281 exit ( -1 );
00282 }
00283 }
00284 vector<char> path ( tracelength );
00285 i = ref.size();
00286 j = hyp.size();
00287 while ( ( i > 0 ) || ( j > 0 ) ) {
00288 path[--tracelength] = P->at(i).at(j);
00289 if ( P->at(i).at(j) == 'A' ) {
00290 i--;
00291 j--;
00292 } else if ( P->at(i).at(j) == 'S' ) {
00293 i--;
00294 j--;
00295 } else if ( P->at(i).at(j) == 'D' ) {
00296 i--;
00297 } else if ( P->at(i).at(j) == 'I' ) {
00298 j--;
00299 }
00300 }
00301 terAlignment to_return;
00302 to_return.numWords = ref_size;
00303 to_return.alignment = path;
00304 to_return.numEdits = S->at(ref_size).at(hyp_size);
00305 to_return.hyp = hyp;
00306 to_return.ref = ref;
00307 to_return.averageWords = ref_size;
00308 if ( PRINT_DEBUG ) {
00309 cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl;
00310 }
00311 return to_return;
00312
00313 }
00314 void terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans, terAlignment* to_return )
00315 {
00316 double current_best = infinite;
00317 double last_best = infinite;
00318 int first_good = 0;
00319 int current_first_good = 0;
00320 int last_good = -1;
00321 int cur_last_good = 0;
00322 int last_peak = 0;
00323 int cur_last_peak = 0;
00324 int i=0;
00325 int j=0;
00326 int ref_size=0 ;
00327 ref_size=( int ) ref.size();
00328 int hyp_size=0;
00329 hyp_size=( int ) hyp.size();
00330 double cost, icost, dcost;
00331 double score;
00332 delete(S);
00333 delete(P);
00334 S = new vector < vector < double > >(ref_size+1, std::vector<double>(hyp_size+1,-1.0));
00335 P = new vector < vector < char > >(ref_size+1, std::vector<char>(hyp_size+1,'0'));
00336
00337 NBR_BS_APPELS++;
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348 S->at(0).at(0) = 0.0;
00349 for ( j = 0; j <= hyp_size; j++ ) {
00350 last_best = current_best;
00351 current_best = infinite;
00352 first_good = current_first_good;
00353 current_first_good = -1;
00354 last_good = cur_last_good;
00355 cur_last_good = -1;
00356 last_peak = cur_last_peak;
00357 cur_last_peak = 0;
00358 for ( i = first_good; i <= ref_size; i++ ) {
00359 if ( i > last_good ) {
00360 break;
00361 }
00362 if (S->at(i).at(j) < 0 ) {
00363 continue;
00364 }
00365 score = S->at(i).at(j);
00366 if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) {
00367 continue;
00368 }
00369 if ( current_first_good == -1 ) {
00370 current_first_good = i ;
00371 }
00372 if ( ( i < ref_size ) && ( j < hyp_size ) ) {
00373 if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) {
00374 if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) {
00375 cost = match_cost + score;
00376 if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) {
00377 S->at(i+1).at(j+1) = cost;
00378 P->at(i+1).at(j+1) = 'A';
00379 }
00380 if ( cost < current_best ) {
00381 current_best = cost;
00382 }
00383 if ( current_best == cost ) {
00384 cur_last_peak = i + 1;
00385 }
00386 } else {
00387 cost = substitute_cost + score;
00388 if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) {
00389 S->at(i+1).at(j+1) = cost;
00390 P->at(i+1).at(j+1) = 'S';
00391 if ( cost < current_best ) {
00392 current_best = cost;
00393 }
00394 if ( current_best == cost ) {
00395 cur_last_peak = i + 1 ;
00396 }
00397 }
00398 }
00399 }
00400 }
00401 cur_last_good = i + 1;
00402 if ( j < hyp_size ) {
00403 icost = score + insert_cost;
00404 if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) {
00405 S->at(i).at(j+1) = icost;
00406 P->at(i).at(j+1) = 'I';
00407 if ( ( cur_last_peak < i ) && ( current_best == icost ) ) {
00408 cur_last_peak = i;
00409 }
00410 }
00411 }
00412 if ( i < ref_size ) {
00413 dcost = score + delete_cost;
00414 if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) {
00415 S->at(i+1).at(j) = dcost;
00416 P->at(i+1).at(j) = 'D';
00417 if ( i >= last_good ) {
00418 last_good = i + 1 ;
00419 }
00420 }
00421 }
00422 }
00423 }
00424
00425
00426 int tracelength = 0;
00427 i = ref_size;;
00428 j = hyp_size;
00429 while ( ( i > 0 ) || ( j > 0 ) ) {
00430 tracelength++;
00431 if (P->at(i).at(j) == 'A' ) {
00432 i--;
00433 j--;
00434 } else if (P->at(i).at(j) == 'S' ) {
00435 i--;
00436 j--;
00437 } else if (P->at(i).at(j) == 'D' ) {
00438 i--;
00439 } else if (P->at(i).at(j) == 'I' ) {
00440 j--;
00441 } else {
00442 cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " <<P->at(i).at(j) << endl;
00443 exit ( -1 );
00444 }
00445 }
00446 vector<char> path ( tracelength );
00447 i = ref_size;
00448 j = hyp_size;
00449 while ( ( i > 0 ) || ( j > 0 ) ) {
00450 path[--tracelength] =P->at(i).at(j);
00451 if (P->at(i).at(j) == 'A' ) {
00452 i--;
00453 j--;
00454 } else if (P->at(i).at(j) == 'S' ) {
00455 i--;
00456 j--;
00457 } else if (P->at(i).at(j) == 'D' ) {
00458 i--;
00459 } else if (P->at(i).at(j) == 'I' ) {
00460 j--;
00461 }
00462 }
00463
00464 to_return->numWords = ref_size;
00465 to_return->alignment = path;
00466 to_return->numEdits = S->at(ref_size).at(hyp_size);
00467 to_return->hyp = hyp;
00468 to_return->ref = ref;
00469 to_return->averageWords = ref_size;
00470 if ( PRINT_DEBUG ) {
00471 cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return->toString() << endl << "END DEBUG" << endl;
00472 }
00473
00474
00475 }
00476
00477
00478 terAlignment terCalc::TER ( vector<string>& hyp, vector<string>& ref )
00479 {
00480 hashMapInfos rloc = createConcordMots ( hyp, ref );
00481 terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans );
00482 vector<string> cur = hyp;
00483 cur_align.hyp = hyp;
00484 cur_align.ref = ref;
00485 cur_align.aftershift = hyp;
00486 double edits = 0;
00487
00488
00489 vector<terShift> * allshifts=new vector<terShift>(0);
00490 bestShiftStruct * returns=new bestShiftStruct();
00491
00492
00493 if ( PRINT_DEBUG ) {
00494 cerr << "BEGIN DEBUG : terCalc::TER : cur_align :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
00495 }
00496 while ( true ) {
00497
00498 returns=findBestShift ( cur, hyp, ref, rloc, cur_align );
00499
00500 if ( returns->getEmpty()) {
00501 break;
00502 }
00503 terShift bestShift = (*(returns->m_best_shift));
00504 cur_align = (*(returns->m_best_align));
00505 edits += bestShift.cost;
00506 bestShift.alignment = cur_align.alignment;
00507 bestShift.aftershift = cur_align.aftershift;
00508 allshifts->push_back ( bestShift );
00509 cur = cur_align.aftershift;
00510 delete(returns);
00511 }
00512 if ( PRINT_DEBUG ) {
00513 cerr << "BEGIN DEBUG : terCalc::TER : Final to return :" << endl << cur_align.toString() << endl << "END DEBUG" << endl;
00514 }
00515 terAlignment to_return;
00516 to_return = cur_align;
00517 to_return.allshifts = (*(allshifts));
00518 to_return.numEdits += edits;
00519 NBR_SEGS_EVALUATED++;
00520 return to_return;
00521 }
00522 bestShiftStruct * terCalc::findBestShift ( vector<string>& cur, vector<string>& hyp, vector<string>& ref, hashMapInfos& rloc, terAlignment& med_align )
00523 {
00524 CALL_FIND_BSHIFT++;
00525
00526
00527 bool anygain = false;
00528 vector <bool> * herr = new vector<bool>(( int ) hyp.size() + 1 );
00529 vector <bool> * rerr = new vector<bool>( ( int ) ref.size() + 1 );
00530 vector <int> * ralign = new vector<int>( ( int ) ref.size() + 1 );
00531 int l_i,i,j,s;
00532 for (i = 0 ; i< ( int ) hyp.size() + 1 ; i++) {
00533 herr->at(i)=false;
00534 }
00535 for (i = 0 ; i< ( int ) ref.size() + 1 ; i++) {
00536 rerr->at(i)=false;
00537 ralign->at(i)=-1;
00538 }
00539 calculateTerAlignment ( med_align, herr, rerr, ralign );
00540 vector<vecTerShift> * poss_shifts = new vector< vector<terShift> >(0) ;
00541 terAlignment * cur_best_align = new terAlignment();
00542 terShift * cur_best_shift = new terShift();
00543 double cur_best_shift_cost = 0.0;
00544 vector<string> shiftarr;
00545 vector<vecInt> curHypSpans;
00546 terShift * curshift = new terShift();
00547 alignmentStruct shiftReturns;
00548 terAlignment * curalign = new terAlignment() ;
00549
00550
00551 if ( PRINT_DEBUG ) {
00552 cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl;
00553 cerr << "indices: ";
00554 for (l_i=0; l_i < ( int ) ref.size() ; l_i++) {
00555 cerr << l_i << "\t";
00556 }
00557 cerr << endl;
00558 cerr << "hyp : \t"<<vectorToString(hyp ,"\t") << endl;
00559 cerr << "cur : \t"<<vectorToString(cur ,"\t") << endl;
00560 cerr << "ref : \t"<<vectorToString(ref ,"\t") << endl;
00561 cerr << "herr : "<<vectorToString(herr,"\t",( int ) hyp.size()) << " | " << ( int ) hyp.size() <<endl;
00562 cerr << "rerr : "<<vectorToString(rerr,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() <<endl;
00563 cerr << "ralign : "<< vectorToString(ralign,"\t",( int ) ref.size()) << " | " << ( int ) ref.size() << endl;
00564 cerr << "END DEBUG " << endl;
00565 }
00566 poss_shifts = calculerPermutations ( cur, ref, rloc, med_align, herr, rerr, ralign );
00567 double curerr = med_align.numEdits;
00568 if ( PRINT_DEBUG ) {
00569 cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
00570 cerr << "Possible Shifts:" << endl;
00571 for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) {
00572 for ( j = 0; j < ( int ) ( poss_shifts->at ( i ) ).size(); j++ ) {
00573 cerr << " [" << i << "] " << ( ( poss_shifts->at ( i ) ).at ( j ) ).toString() << endl;
00574 }
00575 }
00576 cerr << endl;
00577 cerr << "END DEBUG " << endl;
00578 }
00579
00580 cur_best_align->set(med_align);
00581 for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) {
00582 if ( PRINT_DEBUG ) {
00583 cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
00584 cerr << "Considering shift of length " << i << " (" << ( poss_shifts->at ( i ) ).size() << ")" << endl;
00585 cerr << "END DEBUG " << endl;
00586 }
00587
00588 double curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits );
00589 double maxfix = ( 2 * ( 1 + i ) );
00590 if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
00591 break;
00592 } else {
00593 for ( s = 0; s < ( int ) ( poss_shifts->at ( i ) ).size(); s++ ) {
00594 curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits );
00595 if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) {
00596 break;
00597 } else {
00598 curshift->set(( poss_shifts->at ( i ) ).at ( s ));
00599 if ( PRINT_DEBUG ) {
00600 cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
00601 cerr << "cur : "<< join(" ",cur) << endl;
00602 cerr << "shift size : "<< i << endl;
00603 cerr << "shift number : "<< s << endl;
00604 cerr << "size of shift size : "<< ( int ) ( poss_shifts->at ( i ) ).size() << endl;
00605 cerr << "curshift : "<< curshift->toString() << endl;
00606
00607 }
00608
00609 shiftReturns.set(permuter ( cur, curshift ));
00610 shiftarr = shiftReturns.nwords;
00611 curHypSpans = shiftReturns.aftershift;
00612
00613 if ( PRINT_DEBUG ) {
00614 cerr << "shiftarr : "<< join(" ",shiftarr) << endl;
00615 cerr << "curHypSpans size : "<< (int)curHypSpans.size() << endl;
00616 cerr << "END DEBUG " << endl;
00617 }
00618
00619 minimizeDistanceEdition ( shiftarr, ref, curHypSpans, curalign );
00620
00621
00622 curalign->hyp = hyp;
00623 curalign->ref = ref;
00624 curalign->aftershift = shiftarr;
00625
00626
00627 double gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost );
00628
00629 if ( PRINT_DEBUG ) {
00630 cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
00631 cerr << "Gain for " << curshift->toString() << " is " << gain << ". (result: [" << curalign->join ( " ", shiftarr ) << "]" << endl;
00632 cerr << "Details of gains : gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost )"<<endl;
00633 cerr << "Details of gains : gain = ("<<cur_best_align->numEdits << "+" << cur_best_shift_cost << ") - (" << curalign->numEdits << "+" << curshift->cost << ")"<<endl;
00634 cerr << "" << curalign->toString() << "\n" << endl;
00635 cerr << "END DEBUG " << endl;
00636 }
00637
00638 if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) {
00639 anygain = true;
00640 cur_best_shift->set(curshift);
00641 cur_best_shift_cost = curshift->cost;
00642 cur_best_align->set(curalign);
00643 if ( PRINT_DEBUG ) {
00644 cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
00645 cerr << "Tmp Choosing shift: " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl;
00646 cerr << "END DEBUG " << endl;
00647 }
00648 }
00649 }
00650 }
00651 }
00652 }
00653 bestShiftStruct * to_return=new bestShiftStruct();
00654 if ( anygain ) {
00655 to_return->setEmpty(false);
00656 if ( PRINT_DEBUG ) {
00657 cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl;
00658 cerr << "Final shift chosen : " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl;
00659 cerr << "END DEBUG " << endl;
00660 }
00661 to_return->m_best_shift->set(cur_best_shift);
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674 to_return->m_best_align->set(cur_best_align);
00675
00676 } else {
00677 to_return->setEmpty(true);
00678 }
00679
00680 delete(poss_shifts);
00681 delete(cur_best_align);
00682 delete(cur_best_shift);
00683 delete(curshift);
00684 delete(curalign) ;
00685 return to_return;
00686 }
00687
00688 void terCalc::calculateTerAlignment ( terAlignment& align, vector<bool>* herr, vector<bool>* rerr, vector<int>* ralign )
00689 {
00690 int hpos = -1;
00691 int rpos = -1;
00692 CALL_TER_ALIGN++;
00693
00694 if ( PRINT_DEBUG ) {
00695
00696 cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl;
00697 cerr << "END DEBUG " << endl;
00698 }
00699
00700
00701
00702
00703
00704
00705
00706
00707
00708 for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) {
00709 char sym = align.alignment.at(i);
00710 if ( sym == 'A' ) {
00711 hpos++;
00712 rpos++;
00713 herr->at(hpos) = false;
00714 rerr->at(rpos) = false;
00715 ralign->at(rpos) = hpos;
00716 } else if ( sym == 'S' ) {
00717 hpos++;
00718 rpos++;
00719 herr->at(hpos) = true;
00720 rerr->at(rpos) = true;
00721 ralign->at(rpos) = hpos;
00722 } else if ( sym == 'I' ) {
00723 hpos++;
00724 herr->at(hpos) = true;
00725 } else if ( sym == 'D' ) {
00726 rpos++;
00727 rerr->at(rpos) = true;
00728 ralign->at(rpos) = hpos+1;
00729 } else {
00730 cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl;
00731 exit ( -1 );
00732 }
00733 }
00734 }
00735
00736 vector<vecTerShift> * terCalc::calculerPermutations ( vector< string >& hyp, vector< string >& ref, hashMapInfos& rloc, TERCPPNS_TERCpp::terAlignment& align, vector<bool>* herr, vector<bool>* rerr, vector<int>* ralign )
00737 {
00738 vector<vecTerShift> * allshifts = new vector<vecTerShift>(0);
00739
00740 CALL_CALC_PERMUT++;
00741
00742 if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) ) {
00743 return allshifts;
00744 }
00745 allshifts = new vector<vecTerShift>( TAILLE_PERMUT_MAX + 1 );
00746 int start=0;
00747 int end=0;
00748 bool ok = false;
00749 vector<int> mtiVec(0);
00750 vector<int>::iterator mti;
00751 int moveto=0;
00752 vector<string> cand(0);
00753 bool any_herr = false;
00754 bool any_rerr = false;
00755 int i=0;
00756 int l_nbr_permuts=0;
00757
00758 vector<int> movetoitVec(0);
00759 string subVectorHypString="";
00760 terShift * topush;
00761 for ( start = 0; start < ( int ) hyp.size(); start++ ) {
00762 subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) );
00763 if ( ! rloc.trouve ( subVectorHypString ) ) {
00764 continue;
00765 }
00766
00767 ok = false;
00768 mtiVec = rloc.getValue ( subVectorHypString );
00769 mti = mtiVec.begin();
00770 while ( mti != mtiVec.end() && ( ! ok ) ) {
00771 moveto = ( *mti );
00772 mti++;
00773 if ( ( start != ralign->at(moveto) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) - 1 ) <= DIST_MAX_PERMUT ) ) {
00774 ok = true;
00775 }
00776 }
00777 if ( ! ok ) {
00778 continue;
00779 }
00780 ok = true;
00781 for ( end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ ) {
00782
00783 cand = subVector ( hyp, start, end + 1 );
00784 ok = false;
00785 if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) {
00786 continue;
00787 }
00788
00789 any_herr = false;
00790
00791 for ( i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) {
00792 if ( herr->at(start+i) ) {
00793 any_herr = true;
00794 }
00795 }
00796 if ( any_herr == false ) {
00797 ok = true;
00798 continue;
00799 }
00800
00801 movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) );
00802
00803 vector<int>::iterator movetoit;
00804 movetoit = movetoitVec.begin();
00805 while ( movetoit != movetoitVec.end() ) {
00806 moveto = ( *movetoit );
00807 movetoit++;
00808 if ( ! ( ( ralign->at(moveto) != start ) && ( ( ralign->at(moveto) < start ) || ( ralign->at(moveto) > end ) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) ) <= DIST_MAX_PERMUT ) ) ) {
00809 continue;
00810 }
00811 ok = true;
00812
00813
00814
00815
00816
00817 any_rerr = false;
00818 for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) {
00819 if ( rerr->at(moveto+i) ) {
00820 any_rerr = true;
00821 }
00822 }
00823 if ( ! any_rerr ) {
00824 continue;
00825 }
00826 for ( int roff = -1; roff <= ( end - start ); roff++ ) {
00827 topush = new terShift();
00828 bool topushNull = true;
00829 if ( ( roff == -1 ) && ( moveto == 0 ) ) {
00830 if ( PRINT_DEBUG ) {
00831
00832 cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl;
00833 }
00834
00835
00836 topush->start=start;
00837 topush->end=end;
00838 topush->moveto=-1;
00839 topush->newloc=-1;
00840 topushNull = false;
00841 } else if ( ( start != ralign->at(moveto+roff) ) && ( ( roff == 0 ) || ( ralign->at(moveto+roff) != ralign->at(moveto) ) ) ) {
00842 int newloc = ralign->at(moveto+roff);
00843 if ( PRINT_DEBUG ) {
00844
00845 cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl;
00846 }
00847
00848
00849 topush->start=start;
00850 topush->end=end;
00851 topush->moveto=moveto + roff;
00852 topush->newloc=newloc;
00853 topushNull = false;
00854 }
00855 if ( !topushNull ) {
00856 topush->shifted = cand;
00857 topush->cost = shift_cost;
00858 l_nbr_permuts++;
00859 if ( PRINT_DEBUG ) {
00860
00861 cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl;
00862 cerr << "start : " << start << endl;
00863 cerr << "end : " << end << endl;
00864 cerr << "end - start : " << end - start << endl;
00865 cerr << "nbr Permutations added: " << l_nbr_permuts << endl;
00866 cerr << "END DEBUG " << endl;
00867 }
00868 if (l_nbr_permuts < NBR_PERMUT_MAX + 1) {
00869 ( allshifts->at ( end - start ) ).push_back ( (*(topush)) );
00870 }
00871
00872
00873
00874
00875 }
00876 delete(topush);
00877 }
00878 }
00879 }
00880 }
00881
00882
00883
00884
00885
00886 return allshifts;
00887 }
00888
00889
00890 alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift& s )
00891 {
00892 return permuter ( words, s.start, s.end, s.newloc );
00893 }
00894 alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift* s )
00895 {
00896 return permuter ( words, s->start, s->end, s->newloc );
00897 }
00898
00899
00900 alignmentStruct terCalc::permuter ( vector< string >& words, int start, int end, int newloc )
00901 {
00902 int c = 0;
00903 vector<string> nwords ( words );
00904 vector<vecInt> spans ( ( int ) hypSpans.size() );
00905 alignmentStruct to_return;
00906 if ( PRINT_DEBUG ) {
00907
00908 if ( ( int ) hypSpans.size() > 0 ) {
00909 cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ;
00910 } else {
00911 cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ;
00912 }
00913 cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl;
00914 }
00915 if (newloc >= ( int ) words.size()) {
00916 if ( PRINT_DEBUG ) {
00917 cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."<<endl;
00918 }
00919 newloc = ( int ) words.size()-1;
00920 }
00921
00922
00923
00924 if ( newloc == -1 ) {
00925 for ( int i = start; i <= end; i++ ) {
00926 nwords.at ( c++ ) = words.at ( i );
00927 if ( ( int ) hypSpans.size() > 0 ) {
00928 spans.at ( c - 1 ) = hypSpans.at ( i );
00929 }
00930 }
00931 for ( int i = 0; i <= start - 1; i++ ) {
00932 nwords.at ( c++ ) = words.at ( i );
00933 if ( ( int ) hypSpans.size() > 0 ) {
00934 spans.at ( c - 1 ) = hypSpans.at ( i );
00935 }
00936 }
00937 for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
00938 nwords.at ( c++ ) = words.at ( i );
00939 if ( ( int ) hypSpans.size() > 0 ) {
00940 spans.at ( c - 1 ) = hypSpans.at ( i );
00941 }
00942 }
00943 } else {
00944 if ( newloc < start ) {
00945
00946 for ( int i = 0; i < newloc; i++ ) {
00947 nwords.at ( c++ ) = words.at ( i );
00948 if ( ( int ) hypSpans.size() > 0 ) {
00949 spans.at ( c - 1 ) = hypSpans.at ( i );
00950 }
00951 }
00952 for ( int i = start; i <= end; i++ ) {
00953 nwords.at ( c++ ) = words.at ( i );
00954 if ( ( int ) hypSpans.size() > 0 ) {
00955 spans.at ( c - 1 ) = hypSpans.at ( i );
00956 }
00957 }
00958 for ( int i = newloc ; i < start ; i++ ) {
00959 nwords.at ( c++ ) = words.at ( i );
00960 if ( ( int ) hypSpans.size() > 0 ) {
00961 spans.at ( c - 1 ) = hypSpans.at ( i );
00962 }
00963 }
00964 for ( int i = end + 1; i < ( int ) words.size(); i++ ) {
00965 nwords.at ( c++ ) = words.at ( i );
00966 if ( ( int ) hypSpans.size() > 0 ) {
00967 spans.at ( c - 1 ) = hypSpans.at ( i );
00968 }
00969 }
00970 } else {
00971 if ( newloc > end ) {
00972 for ( int i = 0; i <= start - 1; i++ ) {
00973 nwords.at ( c++ ) = words.at ( i );
00974 if ( ( int ) hypSpans.size() > 0 ) {
00975 spans.at ( c - 1 ) = hypSpans.at ( i );
00976 }
00977 }
00978 for ( int i = end + 1; i <= newloc; i++ ) {
00979 nwords.at ( c++ ) = words.at ( i );
00980 if ( ( int ) hypSpans.size() > 0 ) {
00981 spans.at ( c - 1 ) = hypSpans.at ( i );
00982 }
00983 }
00984 for ( int i = start; i <= end; i++ ) {
00985 nwords.at ( c++ ) = words.at ( i );
00986 if ( ( int ) hypSpans.size() > 0 ) {
00987 spans.at ( c - 1 ) = hypSpans.at ( i );
00988 }
00989 }
00990 for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) {
00991 nwords.at ( c++ ) = words.at ( i );
00992 if ( ( int ) hypSpans.size() > 0 ) {
00993 spans.at ( c - 1 ) = hypSpans.at ( i );
00994 }
00995 }
00996 } else {
00997
00998 for ( int i = 0; i <= start - 1; i++ ) {
00999 nwords.at ( c++ ) = words.at ( i );
01000 if ( ( int ) hypSpans.size() > 0 ) {
01001 spans.at ( c - 1 ) = hypSpans.at ( i );
01002 }
01003 }
01004 for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) {
01005 nwords.at ( c++ ) = words.at ( i );
01006 if ( ( int ) hypSpans.size() > 0 ) {
01007 spans.at ( c - 1 ) = hypSpans.at ( i );
01008 }
01009 }
01010 for ( int i = start; i <= end; i++ ) {
01011 nwords.at ( c++ ) = words.at ( i );
01012 if ( ( int ) hypSpans.size() > 0 ) {
01013 spans.at ( c - 1 ) = hypSpans.at ( i );
01014 }
01015 }
01016 for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) {
01017 nwords.at ( c++ ) = words.at ( i );
01018 if ( ( int ) hypSpans.size() > 0 ) {
01019 spans.at ( c - 1 ) = hypSpans.at ( i );
01020 }
01021 }
01022 }
01023 }
01024 }
01025 NBR_PERMUTS_CONSID++;
01026
01027 if ( PRINT_DEBUG ) {
01028 cerr << "nwords" << join(" ",nwords) << endl;
01029
01030 }
01031
01032 to_return.nwords = nwords;
01033 to_return.aftershift = spans;
01034 return to_return;
01035 }
01036 void terCalc::setDebugMode ( bool b )
01037 {
01038 PRINT_DEBUG = b;
01039 }
01040
01041 }