00001
00002
00003
00004 #include "ConfusionNet.h"
00005 #include <sstream>
00006
00007 #include "FactorCollection.h"
00008 #include "Util.h"
00009 #include "TranslationOptionCollectionConfusionNet.h"
00010 #include "StaticData.h"
00011 #include "Sentence.h"
00012 #include "moses/FF/InputFeature.h"
00013 #include "util/exception.hh"
00014 #include "moses/TranslationTask.h"
00015 namespace Moses
00016 {
00017 struct CNStats {
00018 size_t created,destr,read,colls,words;
00019
00020 CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
00021 ~CNStats() {
00022 print(std::cerr);
00023 }
00024
00025 void createOne() {
00026 ++created;
00027 }
00028 void destroyOne() {
00029 ++destr;
00030 }
00031
00032 void collect(const ConfusionNet& cn) {
00033 ++read;
00034 colls+=cn.GetSize();
00035 for(size_t i=0; i<cn.GetSize(); ++i)
00036 words+=cn[i].size();
00037 }
00038 void print(std::ostream& out) const {
00039 if(created>0) {
00040 out<<"confusion net statistics:\n"
00041 " created:\t"<<created<<"\n"
00042 " destroyed:\t"<<destr<<"\n"
00043 " succ. read:\t"<<read<<"\n"
00044 " columns:\t"<<colls<<"\n"
00045 " words:\t"<<words<<"\n"
00046 " avg. word/column:\t"<<words/(1.0*colls)<<"\n"
00047 " avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
00048 "\n\n";
00049 }
00050 }
00051 };
00052
// Statistics accumulator used throughout this file: updated by the
// ConfusionNet constructor, destructor and Read(); its destructor prints
// the summary to std::cerr.
CNStats stats;
00054
00055 size_t
00056 ConfusionNet::
00057 GetColumnIncrement(size_t i, size_t j) const
00058 {
00059 (void) i;
00060 (void) j;
00061 return 1;
00062 }
00063
00064 ConfusionNet::
00065 ConfusionNet(AllOptions::ptr const& opts) : InputType(opts)
00066 {
00067 stats.createOne();
00068
00069 if (is_syntax(opts->search.algo)) {
00070 m_defaultLabelSet.insert(opts->syntax.input_default_non_terminal);
00071 }
00072 UTIL_THROW_IF2(InputFeature::InstancePtr() == NULL, "Input feature must be specified");
00073 }
00074
00075 ConfusionNet::
00076 ~ConfusionNet()
00077 {
00078 stats.destroyOne();
00079 }
00080
00081 ConfusionNet::
00082 ConfusionNet(Sentence const& s) : InputType(s.options())
00083 {
00084 data.resize(s.GetSize());
00085 for(size_t i=0; i<s.GetSize(); ++i) {
00086 ScorePair scorePair;
00087 std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
00088 data[i].push_back(temp);
00089 }
00090 }
00091
00092 bool
00093 ConfusionNet::
00094 ReadF(std::istream& in, int format)
00095 {
00096 VERBOSE(2, "read confusion net with format "<<format<<"\n");
00097 switch(format) {
00098 case 0:
00099 return ReadFormat0(in);
00100 case 1:
00101 return ReadFormat1(in);
00102 default:
00103 std::cerr << "ERROR: unknown format '"<<format
00104 <<"' in ConfusionNet::Read";
00105 }
00106 return false;
00107 }
00108
00109 int
00110 ConfusionNet::
00111 Read(std::istream& in)
00112 {
00113 int rv=ReadF(in,0);
00114 if(rv) stats.collect(*this);
00115 return rv;
00116 }
00117
00118 bool
00119 ConfusionNet::
00120 ReadFormat0(std::istream& in)
00121 {
00122 Clear();
00123 const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
00124
00125 const InputFeature *inputFeature = InputFeature::InstancePtr();
00126 size_t numInputScores = inputFeature->GetNumInputScores();
00127 size_t numRealWordCount = inputFeature->GetNumRealWordsInInput();
00128
00129 size_t totalCount = numInputScores + numRealWordCount;
00130 bool addRealWordCount = (numRealWordCount > 0);
00131
00132 std::string line;
00133 while(getline(in,line)) {
00134 std::istringstream is(line);
00135 std::string word;
00136
00137 Column col;
00138 while(is>>word) {
00139 Word w;
00140 w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
00141 std::vector<float> probs(totalCount, 0.0);
00142 for(size_t i=0; i < numInputScores; i++) {
00143 double prob;
00144 if (!(is>>prob)) {
00145 TRACE_ERR("ERROR: unable to parse CN input - bad link probability, "
00146 << "or wrong number of scores\n");
00147 return false;
00148 }
00149 if(prob<0.0) {
00150 VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
00151 prob=0.0;
00152 } else if (prob>1.0) {
00153 VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
00154 prob=1.0;
00155 }
00156 probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));
00157
00158 }
00159
00160
00161 if (addRealWordCount && word!=EPSILON && word!="")
00162 probs.back() = -1.0;
00163
00164 ScorePair scorePair(probs);
00165
00166 col.push_back(std::make_pair(w,scorePair));
00167 }
00168 if(col.size()) {
00169 data.push_back(col);
00170 ShrinkToFit(data.back());
00171 } else break;
00172 }
00173 return !data.empty();
00174 }
00175
00176 bool
00177 ConfusionNet::
00178 ReadFormat1(std::istream& in)
00179 {
00180 Clear();
00181 const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
00182 std::string line;
00183 if(!getline(in,line)) return 0;
00184 size_t s;
00185 if(getline(in,line)) s=atoi(line.c_str());
00186 else return 0;
00187 data.resize(s);
00188 for(size_t i=0; i<data.size(); ++i) {
00189 if(!getline(in,line)) return 0;
00190 std::istringstream is(line);
00191 if(!(is>>s)) return 0;
00192 std::string word;
00193 double prob;
00194 data[i].resize(s);
00195 for(size_t j=0; j<s; ++j)
00196 if(is>>word>>prob) {
00197
00198 data[i][j].second.denseScores = std::vector<float> (1);
00199 data[i][j].second.denseScores.push_back((float) log(prob));
00200 if(data[i][j].second.denseScores[0]<0) {
00201 VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
00202 data[i][j].second.denseScores[0]=0.0;
00203 }
00204
00205 Word& w = data[i][j].first;
00206 w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
00207 } else return 0;
00208 }
00209 return !data.empty();
00210 }
00211
00212 void ConfusionNet::Print(std::ostream& out) const
00213 {
00214 out<<"conf net: "<<data.size()<<"\n";
00215 for(size_t i=0; i<data.size(); ++i) {
00216 out<<i<<" -- ";
00217 for(size_t j=0; j<data[i].size(); ++j) {
00218 out<<"("<<data[i][j].first.ToString()<<", ";
00219
00220
00221 std::vector<float>::const_iterator iterDense;
00222 for(iterDense = data[i][j].second.denseScores.begin();
00223 iterDense < data[i][j].second.denseScores.end();
00224 ++iterDense) {
00225 out<<", "<<*iterDense;
00226 }
00227
00228
00229 std::map<StringPiece, float>::const_iterator iterSparse;
00230 for(iterSparse = data[i][j].second.sparseScores.begin();
00231 iterSparse != data[i][j].second.sparseScores.end();
00232 ++iterSparse) {
00233 out << ", " << iterSparse->first << "=" << iterSparse->second;
00234 }
00235
00236 out<<") ";
00237 }
00238 out<<"\n";
00239 }
00240 out<<"\n\n";
00241 }
00242
00243 #ifdef _WIN32
00244 #pragma warning(disable:4716)
00245 #endif
00246 Phrase
00247 ConfusionNet::
00248 GetSubString(const Range&) const
00249 {
00250 UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
00251
00252 }
00253
00254 std::string
00255 ConfusionNet::
00256 GetStringRep(const std::vector<FactorType> ) const
00257 {
00258 TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
00259 return "";
00260 }
00261 #ifdef _WIN32
00262 #pragma warning(disable:4716)
00263 #endif
00264 const Word& ConfusionNet::GetWord(size_t) const
00265 {
00266 UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
00267 }
00268 #ifdef _WIN32
00269 #pragma warning(default:4716)
00270 #endif
00271 std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
00272 {
00273 cn.Print(out);
00274 return out;
00275 }
00276
00277 TranslationOptionCollection*
00278 ConfusionNet::
00279 CreateTranslationOptionCollection(ttasksptr const& ttask) const
00280 {
00281
00282
00283
00284
00285 TranslationOptionCollection *rv
00286 = new TranslationOptionCollectionConfusionNet(ttask, *this);
00287
00288 assert(rv);
00289 return rv;
00290 }
00291
00292 }
00293
00294