00001 #include "dsgHyp.h"
00002 #include <sstream>
00003 #include <boost/algorithm/string.hpp>
00004 #include <algorithm>
00005 #include <cstdlib>
00006 #include <math.h>
00007 #include <map>
00008
00009
00010 using namespace std;
00011 using namespace lm::ngram;
00012
00013 namespace Moses
00014 {
00015 dsgState::dsgState(const State & val)
00016 {
00017 lmState = val;
00018 }
00019
00020 void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
00021 {
00022 buffer = danglingTok;
00023 span=srcSpans;
00024 delta=deltaValue;
00025 }
00026
00027
00028 size_t dsgState::hash() const
00029 {
00030
00031 size_t ret = 0;
00032 boost::hash_combine(ret, lmState);
00033
00034
00035
00036
00037
00038
00039 }
00040
00041 bool dsgState::operator==(const FFState& otherBase) const
00042 {
00043 const dsgState &other = static_cast<const dsgState&>(otherBase);
00044
00045 if (lmState < other.lmState) return false;
00046 if (lmState == other.lmState) return true;
00047 return false;
00048 }
00049
00050
00051
00052 std::string dsgState :: getName() const
00053 {
00054 return "done";
00055 }
00056
00057 dsgHypothesis :: dsgHypothesis()
00058 {
00059 lmProb = 0;
00060 discontig0 = 0;
00061 discontig1 = 0;
00062 discontig2 = 0;
00063 UnsegWP = 0;
00064 m_buffer.clear();
00065 }
00066
00067 void dsgHypothesis :: setState(const FFState* prev_state)
00068 {
00069 if(prev_state != NULL) {
00070 m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
00071 m_span = static_cast <const dsgState *> (prev_state)->getSpan();
00072 lmState = static_cast <const dsgState *> (prev_state)->getLMState();
00073 delta = static_cast <const dsgState *> (prev_state)->getDelta();
00074 }
00075 }
00076
00077 dsgState * dsgHypothesis :: saveState()
00078 {
00079 dsgState * statePtr = new dsgState(lmState);
00080 statePtr->saveState(m_buffer, m_span, delta);
00081 return statePtr;
00082 }
00083
00084 void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
00085 {
00086 scores.clear();
00087 scores.push_back(lmProb);
00088
00089 if (numFeatures == 1)
00090 return;
00091 scores.push_back(discontig0);
00092 scores.push_back(discontig1);
00093 scores.push_back(discontig2);
00094 scores.push_back(UnsegWP);
00095 }
00096
00097
00098
00099 bool dsgHypothesis::isPrefix(const std::string &tok)
00100 {
00101 if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) {
00102 return true;
00103 } else {
00104 return false;
00105 };
00106 }
00107
00108 bool dsgHypothesis::isSuffix(const std::string &tok)
00109 {
00110 if ((tok.at(0) == '+' )&& (tok != "+")) {
00111 return true;
00112 } else {
00113 return false;
00114 };
00115 }
00116
00117 bool dsgHypothesis::isStem(const std::string &tok)
00118 {
00119 if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')) {
00120 return true;
00121 } else {
00122 return false;
00123 };
00124 }
00125
00126
00127
00133 bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
00134 {
00135 std::string last_tok;
00136 if (chain.size() >= 1) {
00137 last_tok = chain[chain.size() - 1];
00138 } else {
00139 last_tok = "NULL";
00140 }
00141 if(tok=="+") {
00142 return false;
00143 }
00144 if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
00145 return true;
00146 } else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) {
00147 return true;
00148 }
00149
00150 else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
00151 return true;
00152 } else {
00153 return false;
00154 }
00155 }
00156
00160 vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
00161 {
00162
00163 std::vector<std::string> chain;
00164 std::vector<int> chain_ids;
00165 std::vector<std::string> allchains;
00166 chain_ids=m_span;
00167
00168 if (!m_buffer.empty() && !isolation) {
00169 for (int i = 0; i < m_buffer.size(); i++) {
00170 chain.push_back(m_buffer[i]);
00171 }
00172 }
00173
00174 for (int i = 0; i < phr_vec.size(); i++) {
00175 std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
00176
00177 if (isValidChain(phr_vec[i], chain)) {
00178 chain.push_back(phr_vec[i]);
00179 if (sourcePosSet.empty()==false) {
00180 for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
00181 int cur=*it;
00182 chain_ids.push_back(cur+sourceOffset);
00183 }
00184 }
00185 }
00186
00187 else if (chain.size() == 0) {
00188 allchains.push_back(phr_vec[i]);
00189 allchain_ids.push_back(chain_ids);
00190 chain_ids.clear();
00191 }
00192
00193 else {
00194 std::string joined = boost::algorithm::join(chain, " ");
00195 allchains.push_back(joined);
00196 allchain_ids.push_back(chain_ids);
00197
00198 chain.clear();
00199 chain_ids.clear();
00200
00201 chain.push_back(phr_vec[i]);
00202 if (sourcePosSet.empty()==false) {
00203 for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
00204 int cur=*it;
00205 chain_ids.push_back(cur+sourceOffset);
00206 }
00207 }
00208
00209 }
00210
00211 }
00212
00213 if (!chain.empty()) {
00214 std::string joined = boost::algorithm::join(chain, " ");
00215 allchains.push_back(joined);
00216 allchain_ids.push_back(chain_ids);
00217 }
00218 return allchains;
00219 }
00220
00221
00222
00223 void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
00224 {
00225 lmProb = 0;
00226 State currState = lmState;
00227 State temp;
00228 string desegmented="";
00229 vector <string> words;
00230 vector <string> currFVec;
00231
00232 discontig0=0;
00233 discontig1=0;
00234 discontig2=0;
00235 UnsegWP=0;
00236
00237 currFVec = m_buffer;
00238 currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
00239
00240 int vecSize=currFVec.size();
00241
00242
00243 if (currFVec.size()>0 && isPrefix (currFVec.back())) {
00244 UnsegWP-=0.5;
00245 }
00246 if (currFVec.size()>0 && isSuffix (currFVec.front())) {
00247 UnsegWP-=0.5;
00248 }
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259 vector<vector<int> > chain_ids;
00260 words = grouper(currFVec,chain_ids,0,align,1);
00261
00262 for (int i = 0; i<words.size(); i++) {
00263 UnsegWP+=1;
00264 temp = currState;
00265 if (words[i].find(" ")!=std::string::npos) {
00266 desegmented=desegT.Search(words[i])[0];
00267 lmProb += ptrDsgLM.Score(temp,desegmented,currState);
00268 } else {
00269 boost::replace_all(words[i], "-LRB-", "(");
00270 boost::replace_all(words[i], "-RRB-", ")");
00271 lmProb += ptrDsgLM.Score(temp,words[i],currState);
00272 }
00273 }
00274 lmState = currState;
00275 }
00276
00277 void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
00278 {
00279 lmProb = 0;
00280 discontig0=0;
00281 discontig1=0;
00282 discontig2=0;
00283 UnsegWP=0;
00284
00285 State currState = lmState;
00286 State temp;
00287 string desegmented="";
00288 vector <string> words;
00289 vector <string> currFVec;
00290 bool completePhraseSuffixEnd = false;
00291 vector<vector<int> > all_chain_ids;
00292 double pscore;
00293 currFVec=m_curr_phr;
00294
00295
00296 if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) {
00297 completePhraseSuffixEnd=true;
00298 }
00299
00300 words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
00301
00302 for (int i = 0; i < words.size(); i++) {
00303 temp = currState;
00304
00305 if (i==words.size()-1) {
00306 if (completePhraseSuffixEnd) {
00307 m_buffer.clear();
00308 m_span.clear();
00309 } else if (!isCompleted) {
00310 m_buffer.clear();
00311 if (optimistic == 1) {
00312 if ( isPrefix (currFVec.back())) {
00313
00314 lmProb -= delta;
00315 delta = 0.0;
00316 }
00317
00318 else if (words[i].find(" ")!=std::string::npos) {
00319 desegmented=desegT.Search(words[i])[0];
00320 pscore=ptrDsgLM.Score(temp,desegmented,currState);
00321 lmProb = lmProb + pscore - delta;
00322 delta=pscore;
00323 currState=temp;
00324 } else {
00325 boost::replace_all(words[i], "-LRB-", "(");
00326 boost::replace_all(words[i], "-RRB-", ")");
00327 pscore=ptrDsgLM.Score(temp,words[i],currState);
00328 lmProb = lmProb + pscore - delta;
00329 delta=pscore;
00330 currState=temp;
00331 }
00332 }
00333
00334 m_buffer.push_back(words.back());
00335 m_span=all_chain_ids.back();
00336 break;
00337 }
00338 }
00339
00340
00341 if (words[i].find(" ")!=std::string::npos) {
00342 UnsegWP+=1;
00343 desegmented=desegT.Search(words[i])[0];
00344 std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
00345 if (cur_chain_ids.size()>1) {
00346 vector<int> dsc;
00347 for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
00348 int cur=*it;
00349 int mynext=*next;
00350 if (std::abs(cur - mynext)>= 3) {
00351 dsc.push_back(3);
00352 } else if (std::abs(cur - mynext)== 2) {
00353 dsc.push_back(2);
00354 } else if (std::abs(cur - mynext)<= 1) {
00355 dsc.push_back(1);
00356 }
00357 }
00358 int mymax=*std::max_element(dsc.begin(),dsc.end());
00359 if (mymax==3) {
00360 discontig2+=1;
00361 } else if (mymax==2) {
00362 discontig1+=1;
00363 } else {
00364 discontig0+=1;
00365 }
00366 } else {
00367 discontig0 += 1;
00368 }
00369
00370 lmProb += ptrDsgLM.Score(temp,desegmented,currState);
00371 } else {
00372 UnsegWP+=1;
00373 boost::replace_all(words[i], "-LRB-", "(");
00374 boost::replace_all(words[i], "-RRB-", ")");
00375 lmProb += ptrDsgLM.Score(temp,words[i],currState);
00376 }
00377 }
00378
00379 if (isCompleted) {
00380 temp = currState;
00381 lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
00382 }
00383 lmState = currState;
00384 }
00385
00386
00387 void dsgHypothesis :: print()
00388 {}
00389
00390
00391 }