00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <sstream>
00024 #include "memory.h"
00025 #include "Word.h"
00026 #include "TypeDef.h"
00027 #include "FactorTypeSet.h"
00028 #include "FactorCollection.h"
00029 #include "StaticData.h"
00030 #include "util/exception.hh"
00031 #include "util/string_stream.hh"
00032 #include "util/tokenize_piece.hh"
00033
00034 using namespace std;
00035
00036 namespace Moses
00037 {
00038
00039
00040 size_t
00041 max_fax()
00042 {
00043 if (StaticData::Instance().GetFactorDelimiter().size())
00044 return MAX_NUM_FACTORS;
00045 return 1;
00046 }
00047
00048
00049 int Word::Compare(const Word &targetWord, const Word &sourceWord)
00050 {
00051 if (targetWord.IsNonTerminal() != sourceWord.IsNonTerminal()) {
00052 return targetWord.IsNonTerminal() ? -1 : 1;
00053 }
00054
00055 for (size_t factorType = 0 ; factorType < MAX_NUM_FACTORS ; factorType++) {
00056 const Factor *targetFactor = targetWord[factorType];
00057 const Factor *sourceFactor = sourceWord[factorType];
00058
00059 if (targetFactor == NULL || sourceFactor == NULL)
00060 continue;
00061 if (targetFactor == sourceFactor)
00062 continue;
00063
00064 return (targetFactor<sourceFactor) ? -1 : +1;
00065 }
00066 return 0;
00067 }
00068
00069 bool Word::operator==(const Word &compare) const
00070 {
00071 if (IsNonTerminal() != compare.IsNonTerminal()) {
00072 return false;
00073 }
00074
00075 for (size_t factorType = 0 ; factorType < MAX_NUM_FACTORS ; factorType++) {
00076 const Factor *thisFactor = GetFactor(factorType);
00077 const Factor *otherFactor = compare.GetFactor(factorType);
00078
00079 if (thisFactor != otherFactor) {
00080 return false;
00081 }
00082 }
00083 return true;
00084 }
00085
00086 void Word::Merge(const Word &sourceWord)
00087 {
00088 for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
00089 const Factor *sourcefactor = sourceWord.m_factorArray[currFactor]
00090 ,*targetFactor = this ->m_factorArray[currFactor];
00091 if (targetFactor == NULL && sourcefactor != NULL) {
00092 m_factorArray[currFactor] = sourcefactor;
00093 }
00094 }
00095 }
00096
00097 std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlank) const
00098 {
00099 util::StringStream strme;
00100 const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
00101 bool firstPass = true;
00102 unsigned int stop = min(max_fax(),factorType.size());
00103 for (unsigned int i = 0 ; i < stop ; i++) {
00104 UTIL_THROW_IF2(factorType[i] >= MAX_NUM_FACTORS,
00105 "Trying to reference factor " << factorType[i]
00106 << ". Max factor is " << MAX_NUM_FACTORS);
00107
00108 const Factor *factor = m_factorArray[factorType[i]];
00109 if (factor != NULL) {
00110 if (firstPass) {
00111 firstPass = false;
00112 } else {
00113 strme << factorDelimiter;
00114 }
00115 strme << factor->GetString();
00116 }
00117 }
00118 if(endWithBlank) strme << " ";
00119 return strme.str();
00120 }
00121
00122 StringPiece Word::GetString(FactorType factorType) const
00123 {
00124 return m_factorArray[factorType]->GetString();
00125 }
00126
00127 class StrayFactorException : public util::Exception {};
00128
00129 void
00130 Word::
00131 CreateFromString(FactorDirection direction
00132 , const std::vector<FactorType> &factorOrder
00133 , const StringPiece &str
00134 , bool isNonTerminal
00135 , bool strict)
00136 {
00137 FactorCollection &factorCollection = FactorCollection::Instance();
00138 vector<StringPiece> bits(MAX_NUM_FACTORS);
00139 string factorDelimiter = StaticData::Instance().GetFactorDelimiter();
00140 if (factorDelimiter.size()) {
00141 util::TokenIter<util::MultiCharacter> fit(str, factorDelimiter);
00142 size_t i = 0;
00143 for (; i < MAX_NUM_FACTORS && fit; ++i,++fit)
00144 bits[i] = *fit;
00145 if (i == MAX_NUM_FACTORS)
00146 UTIL_THROW_IF(fit, StrayFactorException,
00147 "The hard limit for factors is " << MAX_NUM_FACTORS
00148 << ". The word " << str << " contains factor delimiter "
00149 << StaticData::Instance().GetFactorDelimiter()
00150 << " too many times.");
00151 if (strict)
00152 UTIL_THROW_IF(fit, StrayFactorException,
00153 "You have configured " << factorOrder.size()
00154 << " factors but the word " << str
00155 << " contains factor delimiter "
00156 << StaticData::Instance().GetFactorDelimiter()
00157 << " too many times.");
00158 UTIL_THROW_IF(!isNonTerminal && i < factorOrder.size(),util::Exception,
00159 "Too few factors in string '" << str << "'.");
00160 } else {
00161 bits[0] = str;
00162 }
00163 for (size_t k = 0; k < factorOrder.size(); ++k) {
00164 UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
00165 "Factor order out of bounds.");
00166 m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
00167 }
00168
00169 m_isNonTerminal = isNonTerminal;
00170 }
00171
00172 void Word::CreateUnknownWord(const Word &sourceWord)
00173 {
00174 FactorCollection &factorCollection = FactorCollection::Instance();
00175
00176 m_isNonTerminal = sourceWord.IsNonTerminal();
00177
00178
00179 unsigned int stop = max_fax();
00180 for (unsigned int currFactor = 0 ; currFactor < stop; currFactor++) {
00181 FactorType factorType = static_cast<FactorType>(currFactor);
00182
00183 const Factor *sourceFactor = sourceWord[currFactor];
00184 if (sourceFactor == NULL)
00185 SetFactor(factorType, factorCollection.AddFactor(Output, factorType, UNKNOWN_FACTOR, m_isNonTerminal));
00186 else
00187 SetFactor(factorType, factorCollection.AddFactor(Output, factorType, sourceFactor->GetString(), m_isNonTerminal));
00188 }
00189
00190 m_isOOV = true;
00191 }
00192
00193 void Word::OnlyTheseFactors(const FactorMask &factors)
00194 {
00195 for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
00196 if (!factors[currFactor]) {
00197 SetFactor(currFactor, NULL);
00198 }
00199 }
00200 }
00201
00202 bool Word::IsEpsilon() const
00203 {
00204 const Factor *factor = m_factorArray[0];
00205 int compare = factor->GetString().compare(EPSILON);
00206
00207 return compare == 0;
00208 }
00209
// NOTE(review): TO_STRING_BODY is a macro defined outside this file; presumably
// it expands to the definition of Word's to-string member — confirm against
// the header that declares it.
TO_STRING_BODY(Word);
00211
00212
00213 ostream& operator<<(ostream& out, const Word& word)
00214 {
00215 util::StringStream strme;
00216 const std::string& factorDelimiter
00217 = StaticData::Instance().options()->output.factor_delimiter;
00218 bool firstPass = true;
00219 unsigned int stop = max_fax();
00220 for (unsigned int currFactor = 0 ; currFactor < stop; currFactor++) {
00221 FactorType factorType = static_cast<FactorType>(currFactor);
00222 const Factor *factor = word.GetFactor(factorType);
00223 if (factor != NULL) {
00224 if (firstPass) {
00225 firstPass = false;
00226 } else {
00227 strme << factorDelimiter;
00228 }
00229 strme << factor->GetString();
00230 }
00231 }
00232 out << strme.str() << " ";
00233 return out;
00234 }
00235
00236 }
00237