00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "util/check.hh"
00024 #include <algorithm>
00025 #include <sstream>
00026 #include <string>
00027 #include "memory.h"
00028 #include "FactorCollection.h"
00029 #include "Phrase.h"
00030 #include "StaticData.h"
00031
00032 #include "util/string_piece.hh"
00033 #include "util/tokenize_piece.hh"
00034
00035 using namespace std;
00036
00037 namespace Moses
00038 {
00039
00040 Phrase::Phrase() {}
00041
00042 Phrase::Phrase(size_t reserveSize)
00043 {
00044 m_words.reserve(reserveSize);
00045 }
00046
00047 Phrase::Phrase(const vector< const Word* > &mergeWords)
00048 {
00049 m_words.reserve(mergeWords.size());
00050 for (size_t currPos = 0 ; currPos < mergeWords.size() ; currPos++) {
00051 AddWord(*mergeWords[currPos]);
00052 }
00053 }
00054
00055 Phrase::~Phrase()
00056 {
00057 }
00058
00059 void Phrase::MergeFactors(const Phrase ©)
00060 {
00061 CHECK(GetSize() == copy.GetSize());
00062 size_t size = GetSize();
00063 const size_t maxNumFactors = MAX_NUM_FACTORS;
00064 for (size_t currPos = 0 ; currPos < size ; currPos++) {
00065 for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
00066 FactorType factorType = static_cast<FactorType>(currFactor);
00067 const Factor *factor = copy.GetFactor(currPos, factorType);
00068 if (factor != NULL)
00069 SetFactor(currPos, factorType, factor);
00070 }
00071 }
00072 }
00073
00074 void Phrase::MergeFactors(const Phrase ©, FactorType factorType)
00075 {
00076 CHECK(GetSize() == copy.GetSize());
00077 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
00078 SetFactor(currPos, factorType, copy.GetFactor(currPos, factorType));
00079 }
00080
00081 void Phrase::MergeFactors(const Phrase ©, const std::vector<FactorType>& factorVec)
00082 {
00083 CHECK(GetSize() == copy.GetSize());
00084 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
00085 for (std::vector<FactorType>::const_iterator i = factorVec.begin();
00086 i != factorVec.end(); ++i) {
00087 SetFactor(currPos, *i, copy.GetFactor(currPos, *i));
00088 }
00089 }
00090
00091
00092 Phrase Phrase::GetSubString(const WordsRange &wordsRange) const
00093 {
00094 Phrase retPhrase(wordsRange.GetNumWordsCovered());
00095
00096 for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) {
00097 Word &word = retPhrase.AddWord();
00098 word = GetWord(currPos);
00099 }
00100
00101 return retPhrase;
00102 }
00103
00104 Phrase Phrase::GetSubString(const WordsRange &wordsRange, FactorType factorType) const
00105 {
00106 Phrase retPhrase(wordsRange.GetNumWordsCovered());
00107
00108 for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) {
00109 const Factor* f = GetFactor(currPos, factorType);
00110 Word &word = retPhrase.AddWord();
00111 word.SetFactor(factorType, f);
00112 }
00113
00114 return retPhrase;
00115 }
00116
00117 std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
00118 {
00119 stringstream strme;
00120 for (size_t pos = 0 ; pos < GetSize() ; pos++) {
00121 strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1));
00122 }
00123
00124 return strme.str();
00125 }
00126
00127 Word &Phrase::AddWord()
00128 {
00129 m_words.push_back(Word());
00130 return m_words.back();
00131 }
00132
00133 void Phrase::Append(const Phrase &endPhrase)
00134 {
00135
00136 for (size_t i = 0; i < endPhrase.GetSize(); i++) {
00137 AddWord(endPhrase.GetWord(i));
00138 }
00139 }
00140
00141 void Phrase::PrependWord(const Word &newWord)
00142 {
00143 AddWord();
00144
00145
00146 for (size_t pos = GetSize() - 1; pos >= 1; --pos) {
00147 const Word &word = m_words[pos - 1];
00148 m_words[pos] = word;
00149 }
00150
00151 m_words[0] = newWord;
00152 }
00153
00154 void Phrase::CreateFromString(FactorDirection direction
00155 ,const std::vector<FactorType> &factorOrder
00156 ,const StringPiece &phraseString
00157 ,const StringPiece &factorDelimiter
00158 ,Word **lhs)
00159 {
00160
00161 vector<StringPiece> annotatedWordVector;
00162 for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
00163 annotatedWordVector.push_back(*it);
00164 }
00165
00166 if (annotatedWordVector.size() == 0) {
00167 if (lhs) {
00168 (*lhs) = NULL;
00169 }
00170 return;
00171 }
00172
00173
00174
00175
00176
00177 size_t numWords;
00178 const StringPiece &annotatedWord = annotatedWordVector.back();
00179 if (annotatedWord.size() >= 2
00180 && *annotatedWord.data() == '['
00181 && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
00182
00183 numWords = annotatedWordVector.size()-1;
00184
00185
00186 assert(lhs);
00187 (*lhs) = new Word(true);
00188 (*lhs)->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true);
00189 assert((*lhs)->IsNonTerminal());
00190 } else {
00191 numWords = annotatedWordVector.size();
00192
00193 if (lhs) {
00194 (*lhs) = NULL;
00195 }
00196 }
00197
00198
00199 m_words.reserve(numWords);
00200
00201 for (size_t phrasePos = 0 ; phrasePos < numWords; phrasePos++) {
00202 StringPiece &annotatedWord = annotatedWordVector[phrasePos];
00203 bool isNonTerminal;
00204 if (annotatedWord.size() >= 2 && *annotatedWord.data() == '[' && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
00205
00206 isNonTerminal = true;
00207
00208 size_t nextPos = annotatedWord.find('[', 1);
00209 CHECK(nextPos != string::npos);
00210
00211 if (direction == Input)
00212 annotatedWord = annotatedWord.substr(1, nextPos - 2);
00213 else
00214 annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2);
00215 } else {
00216 isNonTerminal = false;
00217 }
00218
00219 Word &word = AddWord();
00220 word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal);
00221
00222 }
00223 }
00224
00225 int Phrase::Compare(const Phrase &other) const
00226 {
00227 #ifdef min
00228 #undef min
00229 #endif
00230 size_t thisSize = GetSize()
00231 ,compareSize = other.GetSize();
00232 if (thisSize != compareSize) {
00233 return (thisSize < compareSize) ? -1 : 1;
00234 }
00235
00236 for (size_t pos = 0 ; pos < thisSize ; pos++) {
00237 const Word &thisWord = GetWord(pos)
00238 ,&otherWord = other.GetWord(pos);
00239 int ret = Word::Compare(thisWord, otherWord);
00240
00241 if (ret != 0)
00242 return ret;
00243 }
00244
00245 return 0;
00246 }
00247
00248
00249 bool Phrase::Contains(const vector< vector<string> > &subPhraseVector
00250 , const vector<FactorType> &inputFactor) const
00251 {
00252 const size_t subSize = subPhraseVector.size()
00253 ,thisSize= GetSize();
00254 if (subSize > thisSize)
00255 return false;
00256
00257
00258 for (size_t currStartPos = 0 ; currStartPos < (thisSize - subSize + 1) ; currStartPos++) {
00259 bool match = true;
00260
00261 for (size_t currFactorIndex = 0 ; currFactorIndex < inputFactor.size() ; currFactorIndex++) {
00262 FactorType factorType = inputFactor[currFactorIndex];
00263 for (size_t currSubPos = 0 ; currSubPos < subSize ; currSubPos++) {
00264 size_t currThisPos = currSubPos + currStartPos;
00265 const string &subStr = subPhraseVector[currSubPos][currFactorIndex];
00266 StringPiece thisStr = GetFactor(currThisPos, factorType)->GetString();
00267 if (subStr != thisStr) {
00268 match = false;
00269 break;
00270 }
00271 }
00272 if (!match)
00273 break;
00274 }
00275
00276 if (match)
00277 return true;
00278 }
00279 return false;
00280 }
00281
00282 bool Phrase::IsCompatible(const Phrase &inputPhrase) const
00283 {
00284 if (inputPhrase.GetSize() != GetSize()) {
00285 return false;
00286 }
00287
00288 const size_t size = GetSize();
00289
00290 const size_t maxNumFactors = MAX_NUM_FACTORS;
00291 for (size_t currPos = 0 ; currPos < size ; currPos++) {
00292 for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
00293 FactorType factorType = static_cast<FactorType>(currFactor);
00294 const Factor *thisFactor = GetFactor(currPos, factorType)
00295 ,*inputFactor = inputPhrase.GetFactor(currPos, factorType);
00296 if (thisFactor != NULL && inputFactor != NULL && thisFactor != inputFactor)
00297 return false;
00298 }
00299 }
00300 return true;
00301
00302 }
00303
00304 bool Phrase::IsCompatible(const Phrase &inputPhrase, FactorType factorType) const
00305 {
00306 if (inputPhrase.GetSize() != GetSize()) {
00307 return false;
00308 }
00309 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
00310 if (GetFactor(currPos, factorType) != inputPhrase.GetFactor(currPos, factorType))
00311 return false;
00312 }
00313 return true;
00314 }
00315
00316 bool Phrase::IsCompatible(const Phrase &inputPhrase, const std::vector<FactorType>& factorVec) const
00317 {
00318 if (inputPhrase.GetSize() != GetSize()) {
00319 return false;
00320 }
00321 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
00322 for (std::vector<FactorType>::const_iterator i = factorVec.begin();
00323 i != factorVec.end(); ++i) {
00324 if (GetFactor(currPos, *i) != inputPhrase.GetFactor(currPos, *i))
00325 return false;
00326 }
00327 }
00328 return true;
00329 }
00330
00331 size_t Phrase::GetNumTerminals() const
00332 {
00333 size_t ret = 0;
00334
00335 for (size_t pos = 0; pos < GetSize(); ++pos) {
00336 if (!GetWord(pos).IsNonTerminal())
00337 ret++;
00338 }
00339 return ret;
00340 }
00341
00342 void Phrase::InitializeMemPool()
00343 {
00344 }
00345
00346 void Phrase::FinalizeMemPool()
00347 {
00348 }
00349
00350 void Phrase::OnlyTheseFactors(const FactorMask &factors)
00351 {
00352 for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
00353 if (!factors[currFactor]) {
00354 for (size_t pos = 0; pos < GetSize(); ++pos) {
00355 SetFactor(pos, currFactor, NULL);
00356 }
00357 }
00358 }
00359 }
00360
00361 TO_STRING_BODY(Phrase);
00362
00363
00364 ostream& operator<<(ostream& out, const Phrase& phrase)
00365 {
00366
00367 for (size_t pos = 0 ; pos < phrase.GetSize() ; pos++) {
00368 const Word &word = phrase.GetWord(pos);
00369 out << word;
00370 }
00371 return out;
00372 }
00373
00374 }
00375
00376