00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "SentenceAlignment.h"
00021
00022 #include <map>
00023 #include <set>
00024 #include <string>
00025
00026 #include "tables-core.h"
00027 #include "util/tokenize.hh"
00028
00029 using namespace std;
00030
00031 namespace MosesTraining
00032 {
00033
00034 SentenceAlignment::~SentenceAlignment() {}
00035
// Wraps a token sequence in sentence-boundary markers, in place:
// "<s>" becomes the first element and "</s>" the last one.
// An empty input therefore ends up as { "<s>", "</s>" }.
void addBoundaryWords(std::vector<std::string> &phrase)
{
  const std::string sentenceStart("<s>");
  const std::string sentenceEnd("</s>");

  phrase.insert(phrase.begin(), sentenceStart);
  phrase.push_back(sentenceEnd);
}
00041
00042 bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
00043 {
00044 target = util::tokenize(targetString);
00045 if (boundaryRules)
00046 addBoundaryWords(target);
00047 return true;
00048 }
00049
00050 bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
00051 {
00052 source = util::tokenize(sourceString);
00053 if (boundaryRules)
00054 addBoundaryWords(source);
00055 return true;
00056 }
00057
00058 bool SentenceAlignment::create(const char targetString[],
00059 const char sourceString[],
00060 const char alignmentString[],
00061 const char weightString[],
00062 int sentenceID, bool boundaryRules)
00063 {
00064 using namespace std;
00065 this->sentenceID = sentenceID;
00066 this->weightString = std::string(weightString);
00067
00068
00069 if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
00070 return false;
00071 }
00072 if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
00073 return false;
00074 }
00075
00076
00077 if (target.size() == 0 || source.size() == 0) {
00078 cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
00079 cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
00080 return false;
00081 }
00082
00083
00084 for(size_t i=0; i<source.size(); i++) {
00085 alignedCountS.push_back( 0 );
00086 }
00087 for(size_t i=0; i<target.size(); i++) {
00088 vector< int > dummy;
00089 alignedToT.push_back( dummy );
00090 }
00091
00092
00093 vector<string> alignmentSequence = util::tokenize( alignmentString );
00094 for(size_t i=0; i<alignmentSequence.size(); i++) {
00095 int s,t;
00096
00097 if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
00098 cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
00099 cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
00100 return false;
00101 }
00102
00103 if (boundaryRules) {
00104 ++s;
00105 ++t;
00106 }
00107
00108
00109 if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
00110 cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
00111 cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
00112 return false;
00113 }
00114 alignedToT[t].push_back( s );
00115 alignedCountS[s]++;
00116 }
00117
00118 if (boundaryRules) {
00119 alignedToT[0].push_back(0);
00120 alignedCountS[0]++;
00121
00122 alignedToT.back().push_back(alignedCountS.size() - 1);
00123 alignedCountS.back()++;
00124
00125 }
00126
00127 return true;
00128 }
00129
00130 void SentenceAlignment::invertAlignment()
00131 {
00132 alignedToS.resize(source.size());
00133 for (size_t targetPos = 0; targetPos < alignedToT.size(); ++targetPos) {
00134 const std::vector<int> &vec = alignedToT[targetPos];
00135 for (size_t i = 0; i < vec.size(); ++i) {
00136 int sourcePos = vec[i];
00137 alignedToS[sourcePos].push_back(targetPos);
00138 }
00139
00140 }
00141 }
00142
00143 }
00144