00001
00002
00003
00004
00005
00006
00007
00008 #include <sstream>
00009 #include "moses/Util.h"
00010 #include "AlignedSentence.h"
00011 #include "Parameter.h"
00012
00013 using namespace std;
00014
00015
00017 AlignedSentence::AlignedSentence(int lineNum,
00018 const std::string &source,
00019 const std::string &target,
00020 const std::string &alignment)
00021 :m_lineNum(lineNum)
00022 {
00023 PopulateWordVec(m_source, source);
00024 PopulateWordVec(m_target, target);
00025 PopulateAlignment(alignment);
00026 }
00027
00028 AlignedSentence::~AlignedSentence()
00029 {
00030 Moses::RemoveAllInColl(m_source);
00031 Moses::RemoveAllInColl(m_target);
00032 }
00033
00034 void AlignedSentence::PopulateWordVec(Phrase &vec, const std::string &line)
00035 {
00036 std::vector<string> toks;
00037 Moses::Tokenize(toks, line);
00038
00039 vec.resize(toks.size());
00040 for (size_t i = 0; i < vec.size(); ++i) {
00041 const string &tok = toks[i];
00042 Word *word = new Word(i, tok);
00043 vec[i] = word;
00044 }
00045 }
00046
00047 void AlignedSentence::PopulateAlignment(const std::string &line)
00048 {
00049 vector<string> alignStr;
00050 Moses::Tokenize(alignStr, line);
00051
00052 for (size_t i = 0; i < alignStr.size(); ++i) {
00053 vector<int> alignPair;
00054 Moses::Tokenize(alignPair, alignStr[i], "-");
00055 assert(alignPair.size() == 2);
00056
00057 int sourcePos = alignPair[0];
00058 int targetPos = alignPair[1];
00059
00060 if (sourcePos >= m_source.size()) {
00061 cerr << "ERROR1:AlignedSentence=" << Debug() << endl;
00062 cerr << "m_source=" << m_source.size() << endl;
00063 abort();
00064 }
00065 assert(sourcePos < m_source.size());
00066 assert(targetPos < m_target.size());
00067 Word *sourceWord = m_source[sourcePos];
00068 Word *targetWord = m_target[targetPos];
00069
00070 sourceWord->AddAlignment(targetWord);
00071 targetWord->AddAlignment(sourceWord);
00072 }
00073 }
00074
00075 std::string AlignedSentence::Debug() const
00076 {
00077 stringstream out;
00078 out << "m_lineNum:";
00079 out << m_lineNum;
00080 out << endl;
00081
00082 out << "m_source:";
00083 out << m_source.Debug();
00084 out << endl;
00085
00086 out << "m_target:";
00087 out << m_target.Debug();
00088 out << endl;
00089
00090 out << "consistent phrases:" << endl;
00091 out << m_consistentPhrases.Debug();
00092 out << endl;
00093
00094 return out.str();
00095 }
00096
00097 std::vector<int> AlignedSentence::GetSourceAlignmentCount() const
00098 {
00099 vector<int> ret(m_source.size());
00100
00101 for (size_t i = 0; i < m_source.size(); ++i) {
00102 const Word &word = *m_source[i];
00103 ret[i] = word.GetAlignmentIndex().size();
00104 }
00105 return ret;
00106 }
00107
00108 void AlignedSentence::Create(const Parameter ¶ms)
00109 {
00110 CreateConsistentPhrases(params);
00111 m_consistentPhrases.AddHieroNonTerms(params);
00112 }
00113
00114 void AlignedSentence::CreateConsistentPhrases(const Parameter ¶ms)
00115 {
00116 int countT = m_target.size();
00117 int countS = m_source.size();
00118
00119 m_consistentPhrases.Initialize(countS);
00120
00121
00122 for(int lengthT=1;
00123 lengthT <= params.maxSpan && lengthT <= countT;
00124 lengthT++) {
00125 for(int startT=0; startT < countT-(lengthT-1); startT++) {
00126
00127
00128 int endT = startT + lengthT - 1;
00129
00130
00131
00132 int minS = 9999;
00133 int maxS = -1;
00134 vector< int > usedS = GetSourceAlignmentCount();
00135 for(int ti=startT; ti<=endT; ti++) {
00136 const Word &word = *m_target[ti];
00137 const std::set<int> &alignment = word.GetAlignmentIndex();
00138
00139 std::set<int>::const_iterator iterAlign;
00140 for(iterAlign = alignment.begin(); iterAlign != alignment.end(); ++iterAlign) {
00141 int si = *iterAlign;
00142 if (si<minS) {
00143 minS = si;
00144 }
00145 if (si>maxS) {
00146 maxS = si;
00147 }
00148 usedS[ si ]--;
00149 }
00150 }
00151
00152
00153 if( maxS == -1 )
00154 continue;
00155
00156
00157 size_t width = maxS - minS + 1;
00158
00159 if( width < params.minSpan )
00160 continue;
00161
00162 if( width > params.maxSpan )
00163 continue;
00164
00165
00166 bool out_of_bounds = false;
00167 for(int si=minS; si<=maxS && !out_of_bounds; si++)
00168 if (usedS[si]>0) {
00169 out_of_bounds = true;
00170 }
00171
00172
00173 if (out_of_bounds)
00174 continue;
00175
00176
00177
00178 for(int startS=minS;
00179 (startS>=0 &&
00180 startS>maxS - params.maxSpan &&
00181 (startS==minS || m_source[startS]->GetAlignment().size()==0));
00182 startS--) {
00183
00184 for(int endS=maxS;
00185 (endS<countS && endS<startS + params.maxSpan &&
00186 (endS==maxS || m_source[endS]->GetAlignment().size()==0));
00187 endS++) {
00188
00189
00190 m_consistentPhrases.Add(startS, endS, startT, endT, params);
00191 }
00192 }
00193 }
00194 }
00195 }