#include "RDLM.h"

#include <cstdio>
#include <vector>

#include "moses/StaticData.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/ChartHypothesis.h"
#include "moses/InputFileStream.h"
#include "moses/Util.h"
#include "util/exception.hh"
#include "neuralTM.h"
00010
00011 namespace Moses
00012 {
00013
00014 namespace rdlm
00015 {
00016 ThreadLocal::ThreadLocal(nplm::neuralTM *lm_head_base_instance_, nplm::neuralTM *lm_label_base_instance_, bool normalizeHeadLM, bool normalizeLabelLM, int cacheSize)
00017 {
00018 lm_head = new nplm::neuralTM(*lm_head_base_instance_);
00019 lm_label = new nplm::neuralTM(*lm_label_base_instance_);
00020 lm_head->set_normalization(normalizeHeadLM);
00021 lm_label->set_normalization(normalizeLabelLM);
00022 lm_head->set_cache(cacheSize);
00023 lm_label->set_cache(cacheSize);
00024 }
00025
00026 ThreadLocal::~ThreadLocal()
00027 {
00028 delete lm_head;
00029 delete lm_label;
00030 }
00031
00032 }
00033
00034 typedef Eigen::Map<Eigen::Matrix<int,Eigen::Dynamic,1> > EigenMap;
00035
00036 RDLM::~RDLM()
00037 {
00038 delete lm_head_base_instance_;
00039 delete lm_label_base_instance_;
00040 }
00041
00042 void RDLM::Load(AllOptions::ptr const& opts)
00043 {
00044
00045 lm_head_base_instance_ = new nplm::neuralTM();
00046 lm_head_base_instance_->read(m_path_head_lm);
00047
00048 m_sharedVocab = lm_head_base_instance_->get_input_vocabulary().words() == lm_head_base_instance_->get_output_vocabulary().words();
00049
00050
00051 lm_label_base_instance_ = new nplm::neuralTM();
00052 lm_label_base_instance_->read(m_path_label_lm);
00053
00054 if (m_premultiply) {
00055 lm_head_base_instance_->premultiply();
00056 lm_label_base_instance_->premultiply();
00057 }
00058
00059 lm_head_base_instance_->set_cache(m_cacheSize);
00060 lm_label_base_instance_->set_cache(m_cacheSize);
00061
00062 StaticData &staticData = StaticData::InstanceNonConst();
00063 if (staticData.GetTreeStructure() == NULL) {
00064 staticData.SetTreeStructure(this);
00065 }
00066
00067 offset_up_head = 2*m_context_left + 2*m_context_right;
00068 offset_up_label = 2*m_context_left + 2*m_context_right + m_context_up;
00069
00070 size_head = 2*m_context_left + 2*m_context_right + 2*m_context_up + 2;
00071 size_label = 2*m_context_left + 2*m_context_right + 2*m_context_up + 1;
00072
00073 UTIL_THROW_IF2(size_head != lm_head_base_instance_->get_order(),
00074 "Error: order of head LM (" << lm_head_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_head);
00075 UTIL_THROW_IF2(size_label != lm_label_base_instance_->get_order(),
00076 "Error: order of label LM (" << lm_label_base_instance_->get_order() << ") does not match context size specified (left_context=" << m_context_left << " , right_context=" << m_context_right << " , up_context=" << m_context_up << " for a total order of " << size_label);
00077
00078
00079 static_head_null.resize(size_head);
00080 for (unsigned int i = 0; i < size_head; i++) {
00081 char numstr[20];
00082 sprintf(numstr, "<null_%d>", i);
00083 static_head_null[i] = lm_head_base_instance_->lookup_input_word(numstr);
00084 }
00085
00086 static_label_null.resize(size_label);
00087 for (unsigned int i = 0; i < size_label; i++) {
00088 char numstr[20];
00089 sprintf(numstr, "<null_%d>", i);
00090 static_label_null[i] = lm_label_base_instance_->lookup_input_word(numstr);
00091 }
00092
00093 static_dummy_head = lm_head_base_instance_->lookup_input_word(dummy_head.GetString(0).as_string());
00094
00095 static_start_head = lm_head_base_instance_->lookup_input_word("<start_head>");
00096 static_start_label = lm_head_base_instance_->lookup_input_word("<start_label>");
00097
00098 static_head_head = lm_head_base_instance_->lookup_input_word("<head_head>");
00099 static_head_label = lm_head_base_instance_->lookup_input_word("<head_label>");
00100 static_head_label_output = lm_label_base_instance_->lookup_output_word("<head_label>");
00101
00102 static_stop_head = lm_head_base_instance_->lookup_input_word("<stop_head>");
00103 static_stop_label = lm_head_base_instance_->lookup_input_word("<stop_label>");
00104 static_stop_label_output = lm_label_base_instance_->lookup_output_word("<stop_label>");
00105 static_start_label_output = lm_label_base_instance_->lookup_output_word("<start_label>");
00106
00107 static_root_head = lm_head_base_instance_->lookup_input_word("<root_head>");
00108 static_root_label = lm_head_base_instance_->lookup_input_word("<root_label>");
00109
00110
00111 if (!m_debugPath.empty()) {
00112 ScoreFile(m_debugPath);
00113 exit(1);
00114 }
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233 }
00234
00235
00236 void RDLM::Score(InternalTree* root, const TreePointerMap & back_pointers, boost::array<float, 4> &score, size_t &boundary_hash, rdlm::ThreadLocal &thread_objects, int num_virtual, int rescoring_levels) const
00237 {
00238
00239
00240 if (root->IsTerminal()) {
00241 return;
00242 }
00243
00244
00245 if (root->GetLabel() == m_glueSymbol) {
00246
00247 for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
00248 Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels);
00249 }
00250 return;
00251 }
00252
00253 std::vector<int> &ancestor_heads = thread_objects.ancestor_heads;
00254 std::vector<int> &ancestor_labels = thread_objects.ancestor_labels;
00255
00256
00257 if (m_binarized && root->GetLabel().GetString(0).as_string()[0] == '^' && !ancestor_heads.empty()) {
00258
00259 if (root->IsLeafNT() && m_context_up > 1 && ancestor_heads.size()) {
00260 root = back_pointers.find(root)->second.get();
00261 rescoring_levels = m_context_up-1;
00262 }
00263 for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
00264 Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels);
00265 }
00266 return;
00267 }
00268
00269
00270 if (root->GetLabel() == m_startSymbol || root->GetLabel() == m_endSymbol) {
00271 return;
00272 }
00273
00274
00275
00276 if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
00277
00278 if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
00279 std::vector<int> & ngram = thread_objects.ngram;
00280 ngram = static_head_null;
00281 ngram.back() = Factor2ID(root->GetChildren()[0]->GetLabel()[m_factorType], HEAD_OUTPUT);
00282 if (m_isPretermBackoff && ngram.back() == 0) {
00283 ngram.back() = Factor2ID(root->GetLabel()[m_factorType], HEAD_OUTPUT);
00284 }
00285 if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head) {
00286 std::vector<int>::iterator it = ngram.begin();
00287 std::fill_n(it, m_context_left, static_start_head);
00288 it += m_context_left;
00289 std::fill_n(it, m_context_left, static_start_label);
00290 it += m_context_left;
00291 std::fill_n(it, m_context_right, static_stop_head);
00292 it += m_context_right;
00293 std::fill_n(it, m_context_right, static_stop_label);
00294 it += m_context_right;
00295 size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
00296 it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
00297 it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
00298 }
00299 if (ancestor_labels.size() >= m_context_up && !num_virtual) {
00300 score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00301 } else {
00302 boost::hash_combine(boundary_hash, ngram.back());
00303 score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00304 }
00305 }
00306 return;
00307
00308 } else if (root->IsLeafNT()) {
00309 if (m_context_up > 1 && ancestor_heads.size()) {
00310 root = back_pointers.find(root)->second.get();
00311
00312 if (root->GetLength() == 1 && root->GetChildren()[0]->IsTerminal()) {
00313 return;
00314 }
00315 rescoring_levels = m_context_up-1;
00316 } else {
00317 return;
00318 }
00319 }
00320
00321
00322 std::pair<int,int> head_ids;
00323 bool found = GetHead(root, back_pointers, head_ids);
00324 if (!found) {
00325 head_ids = std::make_pair(static_dummy_head, static_dummy_head);
00326 }
00327
00328 size_t context_up_nonempty = std::min(m_context_up, ancestor_heads.size());
00329 const StringPiece & head_label = root->GetLabel().GetString(0);
00330 bool virtual_head = false;
00331 int reached_end = 0;
00332 int label_idx, label_idx_out;
00333 if (m_binarized && head_label[0] == '^') {
00334 virtual_head = true;
00335 if (m_binarized == 1 || (m_binarized == 3 && head_label[2] == 'l')) {
00336 reached_end = 1;
00337 } else if (m_binarized == 2 || (m_binarized == 3 && head_label[2] == 'r')) {
00338 reached_end = 2;
00339 }
00340
00341 StringPiece clipped_label = (m_binarized == 3) ? head_label.substr(2,head_label.size()-2) : head_label.substr(1,head_label.size()-1);
00342 label_idx = lm_label_base_instance_->lookup_input_word(clipped_label.as_string());
00343 label_idx_out = lm_label_base_instance_->lookup_output_word(clipped_label.as_string());
00344 } else {
00345 reached_end = 3;
00346 label_idx = Factor2ID(root->GetLabel()[0], LABEL_INPUT);
00347 label_idx_out = Factor2ID(root->GetLabel()[0], LABEL_OUTPUT);
00348 }
00349
00350 int head_idx = (virtual_head && head_ids.first == static_dummy_head) ? static_label_null[offset_up_head+m_context_up-1] : head_ids.first;
00351
00352
00353 if (ancestor_heads.empty() || (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head)) {
00354 std::vector<int> & ngram = thread_objects.ngram;
00355 ngram = static_label_null;
00356 ngram.back() = label_idx_out;
00357 if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
00358 std::vector<int>::iterator it = ngram.begin();
00359 std::fill_n(it, m_context_left, static_start_head);
00360 it += m_context_left;
00361 std::fill_n(it, m_context_left, static_start_label);
00362 it += m_context_left;
00363 std::fill_n(it, m_context_right, static_stop_head);
00364 it += m_context_right;
00365 std::fill_n(it, m_context_right, static_stop_label);
00366 it += m_context_right;
00367 it = std::copy(ancestor_heads.end()-context_up_nonempty, ancestor_heads.end(), it);
00368 it = std::copy(ancestor_labels.end()-context_up_nonempty, ancestor_labels.end(), it);
00369 score[2] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00370 } else {
00371 boost::hash_combine(boundary_hash, ngram.back());
00372 score[3] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00373 }
00374 if (head_idx != static_dummy_head && head_idx != static_head_head) {
00375 ngram.push_back(head_ids.second);
00376 *(ngram.end()-2) = label_idx;
00377 if (ancestor_heads.size() == m_context_up && ancestor_heads.back() == static_root_head && !num_virtual) {
00378 score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00379 } else {
00380 boost::hash_combine(boundary_hash, ngram.back());
00381 score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00382 }
00383 }
00384 }
00385
00386 ancestor_heads.push_back(head_idx);
00387 ancestor_labels.push_back(label_idx);
00388
00389 if (virtual_head) {
00390 num_virtual = m_context_up;
00391 } else if (num_virtual) {
00392 --num_virtual;
00393 }
00394
00395
00396
00397 if (context_up_nonempty < m_context_up) {
00398 ++context_up_nonempty;
00399 }
00400 size_t up_padding = m_context_up - context_up_nonempty;
00401
00402 std::vector<int> & ngram = thread_objects.ngram;
00403 ngram = static_label_null;
00404
00405 std::vector<int>::iterator it = ngram.begin() + offset_up_head;
00406 if (up_padding > 0) {
00407 it += up_padding;
00408 }
00409
00410 it = std::copy(ancestor_heads.end() - context_up_nonempty, ancestor_heads.end(), it);
00411
00412 if (up_padding > 0) {
00413 it += up_padding;
00414 }
00415
00416 it = std::copy(ancestor_labels.end() - context_up_nonempty, ancestor_labels.end(), it);
00417
00418
00419 int num_children = root->GetLength();
00420
00421
00422 if (m_binarized) {
00423 num_children = 0;
00424 UnbinarizedChildren real_children(root, back_pointers, m_binarized, thread_objects.stack);
00425 for (std::vector<TreePointer>::const_iterator it = real_children.begin(); !real_children.ended(); it = ++real_children) {
00426 num_children++;
00427 }
00428 }
00429
00430 if (m_context_right && (reached_end == 1 || reached_end == 3)) num_children++;
00431 if (m_context_left && (reached_end == 2 || reached_end == 3)) num_children++;
00432 std::vector<int> & heads = thread_objects.heads;
00433 std::vector<int> & labels = thread_objects.labels;
00434 std::vector<int> & heads_output = thread_objects.heads_output;
00435 std::vector<int> & labels_output = thread_objects.labels_output;
00436
00437 heads.resize(num_children);
00438 labels.resize(num_children);
00439 heads_output.resize(num_children);
00440 labels_output.resize(num_children);
00441
00442 GetChildHeadsAndLabels(root, back_pointers, reached_end, thread_objects);
00443
00444
00445 if (reached_end == 1 || reached_end == 3) {
00446 std::fill_n(ngram.begin(), m_context_left, static_start_head);
00447 std::fill_n(ngram.begin() + m_context_left, m_context_left, static_start_label);
00448 }
00449 size_t left_padding = m_context_left;
00450 size_t left_offset = 0;
00451 size_t right_offset = std::min(heads.size(), m_context_right + 1);
00452 size_t right_padding = m_context_right + 1 - right_offset;
00453
00454
00455 for (size_t i = 0; i != heads.size(); i++) {
00456
00457 std::vector<int>::iterator it = ngram.begin();
00458
00459 if (left_padding > 0) {
00460 it += left_padding;
00461 }
00462
00463 it = std::copy(heads.begin()+left_offset, heads.begin()+i, it);
00464
00465 if (left_padding > 0) {
00466 it += left_padding;
00467 }
00468
00469 it = std::copy(labels.begin()+left_offset, labels.begin()+i, it);
00470
00471 it = std::copy(heads.begin()+i+1, heads.begin()+right_offset, it);
00472
00473 if (right_padding > 0) {
00474 if (reached_end == 2 || reached_end == 3) {
00475 std::fill_n(it, right_padding, static_stop_head);
00476 it += right_padding;
00477 } else {
00478 std::copy(static_label_null.begin()+offset_up_head-m_context_right-right_padding, static_label_null.begin()-m_context_right+offset_up_head, it);
00479 }
00480 }
00481
00482 it = std::copy(labels.begin()+i+1, labels.begin()+right_offset, it);
00483
00484 if (right_padding > 0) {
00485 if (reached_end == 2 || reached_end == 3) {
00486 std::fill_n(it, right_padding, static_stop_label);
00487 it += right_padding;
00488 } else {
00489 std::copy(static_label_null.begin()+offset_up_head-right_padding, static_label_null.begin()+offset_up_head, it);
00490 }
00491 }
00492
00493 ngram.back() = labels_output[i];
00494
00495 if (ancestor_labels.size() >= m_context_up && !num_virtual) {
00496 score[2] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00497 } else {
00498 boost::hash_combine(boundary_hash, ngram.back());
00499 score[3] += FloorScore(thread_objects.lm_label->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00500 }
00501
00502
00503 if (heads[i] != static_start_head && heads[i] != static_stop_head && heads[i] != static_dummy_head && heads[i] != static_head_head) {
00504
00505 ngram.back() = labels[i];
00506 ngram.push_back(heads_output[i]);
00507
00508 if (ancestor_labels.size() >= m_context_up && !num_virtual) {
00509 score[0] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00510 } else {
00511 boost::hash_combine(boundary_hash, ngram.back());
00512 score[1] += FloorScore(thread_objects.lm_head->lookup_ngram(EigenMap(ngram.data(), ngram.size())));
00513 }
00514 ngram.pop_back();
00515 }
00516
00517
00518 if (left_padding)
00519 left_padding--;
00520 else
00521 left_offset++;
00522
00523 if (right_offset < heads.size())
00524 right_offset++;
00525 else
00526 right_padding++;
00527 }
00528
00529
00530 if (rescoring_levels == 1) {
00531 ancestor_heads.pop_back();
00532 ancestor_labels.pop_back();
00533 return;
00534 }
00535
00536 for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
00537 Score(it->get(), back_pointers, score, boundary_hash, thread_objects, num_virtual, rescoring_levels - 1);
00538 }
00539 ancestor_heads.pop_back();
00540 ancestor_labels.pop_back();
00541 }
00542
00543 bool RDLM::GetHead(InternalTree* root, const TreePointerMap & back_pointers, std::pair<int,int> & IDs) const
00544 {
00545 InternalTree *tree;
00546
00547 for (std::vector<TreePointer>::const_iterator it = root->GetChildren().begin(); it != root->GetChildren().end(); ++it) {
00548 if ((*it)->IsLeafNT()) {
00549 tree = back_pointers.find(it->get())->second.get();
00550 } else {
00551 tree = it->get();
00552 }
00553
00554 if (m_binarized && tree->GetLabel().GetString(0).as_string()[0] == '^') {
00555 bool found = GetHead(tree, back_pointers, IDs);
00556 if (found) {
00557 return true;
00558 }
00559 }
00560
00561
00562
00563 else if (tree->GetLength() == 1 && tree->GetChildren()[0]->IsTerminal()) {
00564 GetIDs(tree->GetChildren()[0]->GetLabel(), tree->GetLabel(), IDs);
00565 return true;
00566 }
00567 }
00568
00569 return false;
00570 }
00571
00572
// Fill the pre-sized thread-local vectors heads/labels/heads_output/
// labels_output with the vocabulary IDs of each (unbinarized) child of root.
// Depending on reached_end, pseudo-children marking the start/stop of the
// constituent are written at the first/last positions (matching the
// num_children computation in Score()).
void RDLM::GetChildHeadsAndLabels(InternalTree *root, const TreePointerMap & back_pointers, int reached_end, rdlm::ThreadLocal &thread_objects) const
{
  std::pair<int,int> child_ids;
  size_t j = 0; // next write position in the output vectors

  std::vector<int> & heads = thread_objects.heads;
  std::vector<int> & labels = thread_objects.labels;
  std::vector<int> & heads_output = thread_objects.heads_output;
  std::vector<int> & labels_output = thread_objects.labels_output;

  // Pseudo-child for the start of the constituent (left edge visible).
  if (m_context_right && (reached_end == 1 || reached_end == 3)) {
    heads[j] = static_start_head;
    labels[j] = static_start_label;
    labels_output[j] = static_start_label_output;
    j++;
  }

  // Iterates over the real children, flattening virtual binarization nodes.
  UnbinarizedChildren real_children(root, back_pointers, m_binarized, thread_objects.stack);

  for (std::vector<TreePointer>::const_iterator itx = real_children.begin(); !real_children.ended(); itx = ++real_children) {
    if ((*itx)->IsTerminal()) {
      // Unexpected terminal among nonterminal children: report and shrink the
      // output vectors by one so their sizes stay consistent with the number
      // of usable children counted in Score().
      std::cerr << "non-terminal node " << root->GetLabel() << " has a mix of terminal and non-terminal children. This shouldn't happen..." << std::endl;
      std::cerr << "children: ";
      for (std::vector<TreePointer>::const_iterator itx2 = root->GetChildren().begin(); itx2 != root->GetChildren().end(); ++itx2) {
        std::cerr << (*itx2)->GetLabel() << " ";
      }
      std::cerr << std::endl;

      heads.pop_back();
      labels.pop_back();
      heads_output.pop_back();
      labels_output.pop_back();
      continue;
    }
    InternalTree* child = itx->get();

    // Resolve leaf NTs (rule boundaries) to the previous hypothesis' subtree.
    if ((*itx)->IsLeafNT()) {
      child = back_pointers.find(itx->get())->second.get();
    }

    // Preterminal child: its head is the parent's own head; use the
    // <head_head>/<head_label> placeholders instead of the lexical IDs.
    if (child->GetLength() == 1 && child->GetChildren()[0]->IsTerminal()) {
      heads[j] = static_head_head;
      labels[j] = static_head_label;
      labels_output[j] = static_head_label_output;
      j++;
      continue;
    }

    bool found = GetHead(child, back_pointers, child_ids);
    if (!found) {
      child_ids = std::make_pair(static_dummy_head, static_dummy_head);
    }

    labels[j] = Factor2ID(child->GetLabel()[0], LABEL_INPUT);
    labels_output[j] = Factor2ID(child->GetLabel()[0], LABEL_OUTPUT);
    heads[j] = child_ids.first;
    heads_output[j] = child_ids.second;
    j++;
  }

  // Pseudo-child for the end of the constituent (right edge visible);
  // heads_output is intentionally left unset — stop heads are never
  // predicted as output (see the skip condition in Score()).
  if (m_context_left && (reached_end == 2 || reached_end == 3)) {
    heads[j] = static_stop_head;
    labels[j] = static_stop_label;
    labels_output[j] = static_stop_label_output;
  }
}
00643
00644
00645 void RDLM::GetIDs(const Word & head, const Word & preterminal, std::pair<int,int> & IDs) const
00646 {
00647 IDs.first = Factor2ID(head[m_factorType], HEAD_INPUT);
00648 if (m_isPretermBackoff && IDs.first == 0) {
00649 IDs.first = Factor2ID(preterminal[0], HEAD_INPUT);
00650 }
00651 if (m_sharedVocab) {
00652 IDs.second = IDs.first;
00653 } else {
00654 IDs.second = Factor2ID(head[m_factorType], HEAD_OUTPUT);
00655 if (m_isPretermBackoff && IDs.second == 0) {
00656 IDs.second = Factor2ID(preterminal[0], HEAD_OUTPUT);
00657 }
00658 }
00659 }
00660
00661
00662 int RDLM::Factor2ID(const Factor * const factor, int model_type) const
00663 {
00664 size_t ID = factor->GetId();
00665 int ret;
00666
00667 std::vector<int>* cache = NULL;
00668 switch(model_type) {
00669 case LABEL_INPUT:
00670 cache = &factor2id_label_input;
00671 break;
00672 case LABEL_OUTPUT:
00673 cache = &factor2id_label_output;
00674 break;
00675 case HEAD_INPUT:
00676 cache = &factor2id_head_input;
00677 break;
00678 case HEAD_OUTPUT:
00679 cache = &factor2id_head_output;
00680 break;
00681 }
00682
00683 try {
00684 ret = cache->at(ID);
00685 } catch (const std::out_of_range& oor) {
00686 #ifdef WITH_THREADS //need to resize cache; write lock
00687 m_accessLock.unlock_shared();
00688 m_accessLock.lock();
00689 #endif
00690 cache->resize(ID*2, -1);
00691 #ifdef WITH_THREADS //go back to read lock
00692 m_accessLock.unlock();
00693 m_accessLock.lock_shared();
00694 #endif
00695 ret = -1;
00696 }
00697 if (ret == -1) {
00698 switch(model_type) {
00699 case LABEL_INPUT:
00700 ret = lm_label_base_instance_->lookup_input_word(factor->GetString().as_string());
00701 break;
00702 case LABEL_OUTPUT:
00703 ret = lm_label_base_instance_->lookup_output_word(factor->GetString().as_string());
00704 break;
00705 case HEAD_INPUT:
00706 ret = lm_head_base_instance_->lookup_input_word(factor->GetString().as_string());
00707 break;
00708 case HEAD_OUTPUT:
00709 ret = lm_head_base_instance_->lookup_output_word(factor->GetString().as_string());
00710 break;
00711 }
00712 (*cache)[ID] = ret;
00713 }
00714
00715 return ret;
00716 }
00717
00718 void RDLM::PrintInfo(std::vector<int> &ngram, nplm::neuralTM* lm) const
00719 {
00720 for (size_t i = 0; i < ngram.size()-1; i++) {
00721 std::cerr << lm->get_input_vocabulary().words()[ngram[i]] << " ";
00722 }
00723 std::cerr << lm->get_output_vocabulary().words()[ngram.back()] << " ";
00724
00725 for (size_t i = 0; i < ngram.size(); i++) {
00726 std::cerr << ngram[i] << " ";
00727 }
00728 std::cerr << "score: " << lm->lookup_ngram(ngram) << std::endl;
00729 }
00730
00731
// Map each leaf nonterminal of root to the tree of the corresponding previous
// hypothesis, pairing them in order: the i-th leaf NT yielded by the leafNT
// enumerator is matched with previous[i]. This lets Score() follow rule
// boundaries without physically combining the trees.
RDLM::TreePointerMap RDLM::AssociateLeafNTs(InternalTree* root, const std::vector<TreePointer> &previous) const
{

  TreePointerMap ret;
  std::vector<TreePointer>::iterator it;  // set by next_leafNT on success
  bool found = false;
  // Functor that yields the next leaf NT of root on each call (see InternalTree).
  InternalTree::leafNT next_leafNT(root);
  for (std::vector<TreePointer>::const_iterator it_prev = previous.begin(); it_prev != previous.end(); ++it_prev) {
    found = next_leafNT(it);
    if (found) {
      ret[it->get()] = *it_prev;
    } else {
      // More previous trees than leaf NTs — indicates an inconsistent rule.
      std::cerr << "Warning: leaf nonterminal not found in rule; why did this happen?\n";
    }
  }
  return ret;
}
00749
00750 void RDLM::ScoreFile(std::string &path)
00751 {
00752 InputFileStream inStream(path);
00753 rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
00754 if (!thread_objects) {
00755 thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
00756 thread_objects_backend_.reset(thread_objects);
00757 }
00758 std::string line, null;
00759 thread_objects->ancestor_heads.resize(0);
00760 thread_objects->ancestor_labels.resize(0);
00761 thread_objects->ancestor_heads.resize(m_context_up, static_root_head);
00762 thread_objects->ancestor_labels.resize(m_context_up, static_root_label);
00763 #ifdef WITH_THREADS
00764
00765 m_accessLock.lock_shared();
00766 #endif
00767 while(getline(inStream, line)) {
00768 TreePointerMap back_pointers;
00769 boost::array<float, 4> score;
00770 score.fill(0);
00771 InternalTree* mytree (new InternalTree(line));
00772 size_t boundary_hash = 0;
00773 Score(mytree, back_pointers, score, boundary_hash, *thread_objects);
00774 std::cerr << "head LM: " << score[0] << "label LM: " << score[2] << std::endl;
00775 }
00776 #ifdef WITH_THREADS
00777 m_accessLock.unlock_shared();
00778 #endif
00779 }
00780
00781
00782 void RDLM::SetParameter(const std::string& key, const std::string& value)
00783 {
00784 std::cerr << "setting: " << this->GetScoreProducerDescription() << " - " << key << "\n";
00785 if (key == "tuneable") {
00786 m_tuneable = Scan<bool>(value);
00787 } else if (key == "filterable") {
00788 } else if (key == "path_head_lm") {
00789 m_path_head_lm = value;
00790 } else if (key == "path_label_lm") {
00791 m_path_label_lm = value;
00792 } else if (key == "backoff") {
00793 m_isPretermBackoff = Scan<bool>(value);
00794 } else if (key == "context_up") {
00795 m_context_up = Scan<size_t>(value);
00796 } else if (key == "context_left") {
00797 m_context_left = Scan<size_t>(value);
00798 } else if (key == "context_right") {
00799 m_context_right = Scan<size_t>(value);
00800 } else if (key == "debug_path") {
00801 m_debugPath = value;
00802 } else if (key == "premultiply") {
00803 m_premultiply = Scan<bool>(value);
00804 } else if (key == "rerank") {
00805 m_rerank = Scan<bool>(value);
00806 } else if (key == "normalize_head_lm") {
00807 m_normalizeHeadLM = Scan<bool>(value);
00808 } else if (key == "normalize_label_lm") {
00809 m_normalizeLabelLM = Scan<bool>(value);
00810 } else if (key == "binarized") {
00811 if (value == "left")
00812 m_binarized = 1;
00813 else if (value == "right")
00814 m_binarized = 2;
00815 else if (value == "full")
00816 m_binarized = 3;
00817 else
00818 UTIL_THROW(util::Exception, "Unknown value for argument " << key << "=" << value);
00819 } else if (key == "glue_symbol") {
00820 m_glueSymbolString = value;
00821 } else if (key == "factor") {
00822 m_factorType = Scan<FactorType>(value);
00823 } else if (key == "cache_size") {
00824 m_cacheSize = Scan<int>(value);
00825 } else {
00826 UTIL_THROW(util::Exception, "Unknown argument " << key << "=" << value);
00827 }
00828 }
00829
00830
// Score the rule application at cur_hypo. The feature exposes two score
// components: m_index = head LM, m_index+1 = label LM. score[0]/score[2] are
// exact scores; score[1]/score[3] are boundary (approximate) scores that an
// ancestor hypothesis will recompute with fuller context — hence the
// subtraction of the children's previously-accumulated approximations below.
FFState* RDLM::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
                                   , int featureID
                                   , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    // Collect the partial trees of all child hypotheses (in nonterminal
    // order) and take back the approximate scores they contributed, since
    // their boundary predictions get re-scored here with more context.
    std::vector<TreePointer> previous_trees;
    float prev_approx_head = 0, prev_approx_label = 0;
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const RDLMState* prev = static_cast<const RDLMState*>(cur_hypo.GetPrevHypo(nonTermInd)->GetFFState(featureID));
        previous_trees.push_back(prev->GetTree());
        prev_approx_head -= prev->GetApproximateScoreHead();
        prev_approx_label -= prev->GetApproximateScoreLabel();
      }
    }
    size_t ff_idx = m_index;

    accumulator->PlusEquals(ff_idx, prev_approx_head);
    accumulator->PlusEquals(ff_idx+1, prev_approx_label);

    // A hypothesis covers the full sentence if its tree ends in the
    // sentence-end tag, possibly nested one level under the end symbol.
    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_endTag || (mytree->GetChildren().back()->GetLabel() == m_endSymbol && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_endTag));

    // Link this rule's leaf NTs to the children's trees so Score() can
    // descend across rule boundaries.
    TreePointerMap back_pointers = AssociateLeafNTs(mytree.get(), previous_trees);
    boost::array<float, 4> score;
    score.fill(0);

    size_t boundary_hash = 0;
    if (!m_rerank) {
#ifdef WITH_THREADS
      // Factor2ID expects the shared (read) lock to be held while scoring.
      m_accessLock.lock_shared();
#endif
      // Lazily create this thread's private copies of the LMs.
      rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
      if (!thread_objects) {
        thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
        thread_objects_backend_.reset(thread_objects);
      }
      // Root-ancestor context only for full sentences; otherwise start empty.
      thread_objects->ancestor_heads.resize(0);
      thread_objects->ancestor_labels.resize(0);
      thread_objects->ancestor_heads.resize((full_sentence ? m_context_up : 0), static_root_head);
      thread_objects->ancestor_labels.resize((full_sentence ? m_context_up : 0), static_root_label);
      Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
#ifdef WITH_THREADS
      m_accessLock.unlock_shared();
#endif
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    mytree->Combine(previous_trees);
    // Rerank mode: score only once, when the combined tree spans the
    // whole sentence.
    if (m_rerank && full_sentence) {
#ifdef WITH_THREADS
      // Same locking and per-thread setup as above.
      m_accessLock.lock_shared();
#endif
      rdlm::ThreadLocal *thread_objects = thread_objects_backend_.get();
      if (!thread_objects) {
        thread_objects = new rdlm::ThreadLocal(lm_head_base_instance_, lm_label_base_instance_, m_normalizeHeadLM, m_normalizeLabelLM, m_cacheSize);
        thread_objects_backend_.reset(thread_objects);
      }
      thread_objects->ancestor_heads.resize(0);
      thread_objects->ancestor_labels.resize(0);
      thread_objects->ancestor_heads.resize((full_sentence ? m_context_up : 0), static_root_head);
      thread_objects->ancestor_labels.resize((full_sentence ? m_context_up : 0), static_root_label);
      Score(mytree.get(), back_pointers, score, boundary_hash, *thread_objects);
#ifdef WITH_THREADS
      m_accessLock.unlock_shared();
#endif
      accumulator->PlusEquals(ff_idx, score[0] + score[1]);
      accumulator->PlusEquals(ff_idx+1, score[2] + score[3]);
    }
    // NOTE(review): binarization is undone only for complete sentences —
    // presumably so partial trees keep virtual nodes for later combination;
    // confirm against InternalTree::Unbinarize.
    if (m_binarized && full_sentence) {
      mytree->Unbinarize();
    }

    // Boundary scores and their hash travel in the state for recombination.
    return new RDLMState(mytree, score[1], score[3], boundary_hash);
  } else {
    UTIL_THROW2("Error: RDLM active, but no internal tree structure found");
  }

}
00917
00918 }