00001
00003
00004
00005
00007
00008
00009
00010 #include <numeric>
00011 #include "moses/Word.h"
00012 #include "moses/Phrase.h"
00013 #include "moses/ConfusionNet.h"
00014 #include "moses/WordsRange.h"
00015 #include "moses/TranslationModel/PhraseDictionaryTree.h"
00016
00017 using namespace Moses;
00018
00019 #if 0
00020
00021
00022
00023
00024
00025 size_t GenerateTuples(unsigned num_idx,unsigned* ranges,unsigned *&tuples)
00026 {
00027 unsigned* single_tuple= new unsigned[num_idx+1];
00028 unsigned num_tuples=1;
00029
00030 for (unsigned k=0; k<num_idx; ++k) {
00031 num_tuples *= ranges[k];
00032 single_tuple[k]=0;
00033 }
00034
00035 tuples=new unsigned[num_idx * num_tuples];
00036
00037
00038 single_tuple[num_idx]=0;
00039 unsigned j=0;
00040 for (unsigned n=0; n<num_tuples; ++n) {
00041 memcpy((void *)((tuples + n * num_idx)),(void *)single_tuple,num_idx * sizeof(unsigned));
00042 j=0;
00043 while (single_tuple[j]==ranges[j]-1) {
00044 single_tuple[j]=0;
00045 ++j;
00046 }
00047 ++single_tuple[j];
00048 }
00049 delete [] single_tuple;
00050 return num_tuples;
00051 }
00052
00053
00054 typedef PhraseDictionaryTree::PrefixPtr PPtr;
00055 typedef std::vector<PPtr> vPPtr;
00056 typedef std::vector<std::vector<Factor const*> > mPhrase;
00057
00058 std::ostream& operator<<(std::ostream& out,const mPhrase& p)
00059 {
00060 for(size_t i=0; i<p.size(); ++i) {
00061 out<<i<<" - ";
00062 for(size_t j=0; j<p[i].size(); ++j)
00063 out<<p[i][j]->ToString()<<" ";
00064 out<<"|";
00065 }
00066
00067 return out;
00068 }
00069
00070 struct State {
00071 vPPtr ptrs;
00072 WordsRange range;
00073 float score;
00074
00075 State() : range(0,0),score(0.0) {}
00076 State(size_t b,size_t e,const vPPtr& v,float sc=0.0) : ptrs(v),range(b,e),score(sc) {}
00077
00078 size_t begin() const {
00079 return range.GetStartPos();
00080 }
00081 size_t end() const {
00082 return range.GetEndPos();
00083 }
00084 float GetScore() const {
00085 return score;
00086 }
00087
00088 };
00089
00090 std::ostream& operator<<(std::ostream& out,const State& s)
00091 {
00092 out<<"["<<s.ptrs.size()<<" ("<<s.begin()<<","<<s.end()<<") "<<s.GetScore()<<"]";
00093
00094 return out;
00095 }
00096
00097 typedef std::map<mPhrase,float> E2Costs;
00098
00099
00100 struct GCData {
00101 const std::vector<PhraseDictionaryTree const*>& pdicts;
00102 const std::vector<std::vector<float> >& weights;
00103 std::vector<FactorType> inF,outF;
00104 size_t distinctOutputFactors;
00105 vPPtr root;
00106 size_t totalTuples,distinctTuples;
00107
00108
00109 GCData(const std::vector<PhraseDictionaryTree const*>& a,
00110 const std::vector<std::vector<float> >& b)
00111 : pdicts(a),weights(b),totalTuples(0),distinctTuples(0) {
00112
00113 CHECK(pdicts.size()==weights.size());
00114 std::set<FactorType> distinctOutFset;
00115 inF.resize(pdicts.size());
00116 outF.resize(pdicts.size());
00117 root.resize(pdicts.size());
00118 for(size_t i=0; i<pdicts.size(); ++i) {
00119 root[i]=pdicts[i]->GetRoot();
00120 inF[i]=pdicts[i]->GetInputFactorType();
00121 outF[i]=pdicts[i]->GetOutputFactorType();
00122 distinctOutFset.insert(pdicts[i]->GetOutputFactorType());
00123 }
00124 distinctOutputFactors=distinctOutFset.size();
00125 }
00126
00127 FactorType OutFT(size_t i) const {
00128 return outF[i];
00129 }
00130 FactorType InFT(size_t i) const {
00131 return inF[i];
00132 }
00133 size_t DistinctOutFactors() const {
00134 return distinctOutputFactors;
00135 }
00136
00137 const vPPtr& GetRoot() const {
00138 return root;
00139 }
00140
00141 };
00142
00143 typedef std::vector<Factor const*> vFactor;
00144 typedef std::vector<std::pair<float,vFactor> > TgtCandList;
00145
00146 typedef std::vector<TgtCandList> OutputFactor2TgtCandList;
00147 typedef std::vector<OutputFactor2TgtCandList*> Len2Cands;
00148
00149 void GeneratePerFactorTgtList(size_t factorType,PPtr pptr,GCData& data,Len2Cands& len2cands)
00150 {
00151 std::vector<FactorTgtCand> cands;
00152 data.pdicts[factorType]->GetTargetCandidates(pptr,cands);
00153
00154 for(std::vector<FactorTgtCand>::const_iterator cand=cands.begin(); cand!=cands.end(); ++cand) {
00155 CHECK(data.weights[factorType].size()==cand->second.size());
00156 float costs=std::inner_product(data.weights[factorType].begin(),
00157 data.weights[factorType].end(),
00158 cand->second.begin(),
00159 0.0);
00160
00161 size_t len=cand->first.size();
00162 if(len>=len2cands.size()) len2cands.resize(len+1,0);
00163 if(!len2cands[len]) len2cands[len]=new OutputFactor2TgtCandList(data.DistinctOutFactors());
00164 OutputFactor2TgtCandList &outf2tcandlist=*len2cands[len];
00165
00166 outf2tcandlist[data.OutFT(factorType)].push_back(std::make_pair(costs,cand->first));
00167 }
00168 }
00169
00170 void GenerateTupleTgtCands(OutputFactor2TgtCandList& tCand,E2Costs& e2costs,GCData& data)
00171 {
00172
00173 bool gotCands=1;
00174 for(size_t j=0; gotCands && j<tCand.size(); ++j)
00175 gotCands &= !tCand[j].empty();
00176
00177 if(gotCands) {
00178
00179 CHECK(data.DistinctOutFactors()==tCand.size());
00180 std::vector<unsigned> radix(data.DistinctOutFactors());
00181 for(size_t i=0; i<tCand.size(); ++i) radix[i]=tCand[i].size();
00182
00183 unsigned *tuples=0;
00184 size_t numTuples=GenerateTuples(radix.size(),&radix[0],tuples);
00185
00186 data.totalTuples+=numTuples;
00187
00188 for(size_t i=0; i<numTuples; ++i) {
00189 mPhrase e(radix.size());
00190 float costs=0.0;
00191 for(size_t j=0; j<radix.size(); ++j) {
00192 CHECK(tuples[radix.size()*i+j]<tCand[j].size());
00193 std::pair<float,vFactor> const& mycand=tCand[j][tuples[radix.size()*i+j]];
00194 e[j]=mycand.second;
00195 costs+=mycand.first;
00196 }
00197 #ifdef DEBUG
00198 bool mismatch=0;
00199 for(size_t j=1; !mismatch && j<e.size(); ++j)
00200 if(e[j].size()!=e[j-1].size()) mismatch=1;
00201 CHECK(mismatch==0);
00202 #endif
00203 std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(e,costs));
00204 if(p.second) ++data.distinctTuples;
00205 else {
00206
00207 if(costs<p.first->second) p.first->second=costs;
00208 }
00209 }
00210 delete [] tuples;
00211 }
00212 }
00213
00214 void GenerateCandidates_(E2Costs& e2costs,const vPPtr& nextP,GCData& data)
00215 {
00216 Len2Cands len2cands;
00217
00218 for(size_t factorType=0; factorType<nextP.size(); ++factorType)
00219 if(nextP[factorType])
00220 GeneratePerFactorTgtList(factorType,nextP[factorType],data,len2cands);
00221
00222
00223 for(size_t len=0; len<len2cands.size(); ++len) if(len2cands[len])
00224 GenerateTupleTgtCands(*len2cands[len],e2costs,data);
00225 }
00226
00227 void GenerateCandidates(const ConfusionNet& src,
00228 const std::vector<PhraseDictionaryTree const*>& pdicts,
00229 const std::vector<std::vector<float> >& weights,
00230 int verbose)
00231 {
00232 GCData data(pdicts,weights);
00233
00234 std::vector<State> stack;
00235 for(size_t i=0; i<src.GetSize(); ++i) stack.push_back(State(i,i,data.GetRoot()));
00236
00237 std::map<WordsRange,E2Costs> cov2E;
00238
00239
00240
00241 while(!stack.empty()) {
00242 State curr(stack.back());
00243 stack.pop_back();
00244
00245
00246
00247 CHECK(curr.end()<src.GetSize());
00248 const ConfusionNet::Column &currCol=src[curr.end()];
00249 for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
00250 const Word& w=currCol[colidx].first;
00251 vPPtr nextP(curr.ptrs);
00252 for(size_t j=0; j<nextP.size(); ++j)
00253 nextP[j]=pdicts[j]->Extend(nextP[j],
00254 w.GetFactor(data.InFT(j))->GetString());
00255
00256 bool valid=1;
00257 for(size_t j=0; j<nextP.size(); ++j) if(!nextP[j]) {
00258 valid=0;
00259 break;
00260 }
00261
00262 if(valid) {
00263 if(curr.end()+1<src.GetSize())
00264 stack.push_back(State(curr.begin(),curr.end()+1,nextP,
00265 curr.GetScore()+currCol[colidx].second));
00266
00267 E2Costs &e2costs=cov2E[WordsRange(curr.begin(),curr.end()+1)];
00268 GenerateCandidates_(e2costs,nextP,data);
00269 }
00270 }
00271
00272
00273
00274
00275 }
00276
00277 if(verbose) {
00278
00279 std::cerr<<"tuple stats: total: "<<data.totalTuples
00280 <<" distinct: "<<data.distinctTuples<<" ("
00281 <<(data.distinctTuples/(0.01*data.totalTuples))
00282 <<"%)\n";
00283 std::cerr<<"per coverage set:\n";
00284 for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
00285 i!=cov2E.end(); ++i) {
00286 std::cerr<<i->first<<" -- distinct cands: "
00287 <<i->second.size()<<"\n";
00288 }
00289 std::cerr<<"\n\n";
00290 }
00291
00292 if(verbose>10) {
00293 std::cerr<<"full list:\n";
00294 for(std::map<WordsRange,E2Costs>::const_iterator i=cov2E.begin();
00295 i!=cov2E.end(); ++i) {
00296 std::cerr<<i->first<<" -- distinct cands: "
00297 <<i->second.size()<<"\n";
00298 for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j)
00299 std::cerr<<j->first<<" -- "<<j->second<<"\n";
00300 }
00301 }
00302 }
00303
00304 #else
00305
00306 void GenerateCandidates(const ConfusionNet&,
00307 const std::vector<PhraseDictionaryTree const*>&,
00308 const std::vector<std::vector<float> >&,
00309 int)
00310 {
00311 std::cerr<<"ERROR: GenerateCandidates is currently broken\n";
00312 }
00313
00314 #endif