00001 #include "HyperTreeLoader.h"
00002
00003 #include <sys/stat.h>
00004
00005 #include <cmath>
00006 #include <cstdlib>
00007 #include <fstream>
00008 #include <string>
00009 #include <iterator>
00010 #include <algorithm>
00011 #include <iostream>
00012
00013 #include "moses/FactorCollection.h"
00014 #include "moses/Word.h"
00015 #include "moses/Util.h"
00016 #include "moses/Timer.h"
00017 #include "moses/InputFileStream.h"
00018 #include "moses/StaticData.h"
00019 #include "moses/Range.h"
00020 #include "moses/ChartTranslationOptionList.h"
00021 #include "moses/FactorCollection.h"
00022 #include "moses/Syntax/RuleTableFF.h"
00023 #include "moses/parameters/AllOptions.h"
00024 #include "util/file_piece.hh"
00025 #include "util/string_piece.hh"
00026 #include "util/tokenize_piece.hh"
00027 #include "util/double-conversion/double-conversion.h"
00028 #include "util/exception.hh"
00029
00030 #include "HyperPath.h"
00031 #include "HyperPathLoader.h"
00032 #include "HyperTree.h"
00033
00034 namespace Moses
00035 {
00036
00037 namespace Syntax
00038 {
00039 namespace F2S
00040 {
00041
00042 bool HyperTreeLoader::Load(AllOptions const& opts,
00043 const std::vector<FactorType> &input,
00044 const std::vector<FactorType> &output,
00045 const std::string &inFile,
00046 const RuleTableFF &ff,
00047 HyperTree &trie,
00048 boost::unordered_set<std::size_t> &sourceTermSet)
00049 {
00050 PrintUserTime(std::string("Start loading HyperTree"));
00051
00052 sourceTermSet.clear();
00053
00054 std::size_t count = 0;
00055
00056 std::ostream *progress = NULL;
00057 IFVERBOSE(1) progress = &std::cerr;
00058 util::FilePiece in(inFile.c_str(), progress);
00059
00060
00061 std::vector<float> scoreVector;
00062 StringPiece line;
00063
00064 double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
00065
00066 HyperPathLoader hyperPathLoader;
00067
00068 Phrase dummySourcePhrase;
00069 {
00070 Word *lhs = NULL;
00071 dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs);
00072 delete lhs;
00073 }
00074
00075 while(true) {
00076 try {
00077 line = in.ReadLine();
00078 } catch (const util::EndOfFileException &e) {
00079 break;
00080 }
00081
00082 util::TokenIter<util::MultiCharacter> pipes(line, "|||");
00083 StringPiece sourceString(*pipes);
00084 StringPiece targetString(*++pipes);
00085 StringPiece scoreString(*++pipes);
00086
00087 StringPiece alignString;
00088 if (++pipes) {
00089 StringPiece temp(*pipes);
00090 alignString = temp;
00091 }
00092
00093 ++pipes;
00094
00095 scoreVector.clear();
00096 for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
00097 int processed;
00098 float score = converter.StringToFloat(s->data(), s->length(), &processed);
00099 UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
00100 scoreVector.push_back(FloorScore(TransformScore(score)));
00101 }
00102 const std::size_t numScoreComponents = ff.GetNumScoreComponents();
00103 if (scoreVector.size() != numScoreComponents) {
00104 UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
00105 << numScoreComponents << ") of score components on line " << count);
00106 }
00107
00108
00109 HyperPath sourceFragment;
00110 hyperPathLoader.Load(sourceString, sourceFragment);
00111 ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);
00112
00113
00114 TargetPhrase *targetPhrase = new TargetPhrase(&ff);
00115 Word *targetLHS = NULL;
00116 targetPhrase->CreateFromString(Output, output, targetString, &targetLHS);
00117 targetPhrase->SetTargetLHS(targetLHS);
00118 targetPhrase->SetAlignmentInfo(alignString);
00119
00120 if (++pipes) {
00121 StringPiece sparseString(*pipes);
00122 targetPhrase->SetSparseScore(&ff, sparseString);
00123 }
00124
00125 if (++pipes) {
00126 StringPiece propertiesString(*pipes);
00127 targetPhrase->SetProperties(propertiesString);
00128 }
00129
00130 targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
00131 targetPhrase->EvaluateInIsolation(dummySourcePhrase,
00132 ff.GetFeaturesToApply());
00133
00134
00135 TargetPhraseCollection::shared_ptr phraseColl
00136 = GetOrCreateTargetPhraseCollection(trie, sourceFragment);
00137 phraseColl->Add(targetPhrase);
00138
00139 count++;
00140 }
00141
00142
00143 if (ff.GetTableLimit()) {
00144 SortAndPrune(trie, ff.GetTableLimit());
00145 }
00146
00147 return true;
00148 }
00149
00150 void HyperTreeLoader::ExtractSourceTerminalSetFromHyperPath(
00151 const HyperPath &hp, boost::unordered_set<std::size_t> &sourceTerminalSet)
00152 {
00153 for (std::vector<HyperPath::NodeSeq>::const_iterator p = hp.nodeSeqs.begin();
00154 p != hp.nodeSeqs.end(); ++p) {
00155 for (std::vector<std::size_t>::const_iterator q = p->begin();
00156 q != p->end(); ++q) {
00157 const std::size_t factorId = *q;
00158 if (factorId >= moses_MaxNumNonterminals &&
00159 factorId != HyperPath::kComma &&
00160 factorId != HyperPath::kEpsilon) {
00161 sourceTerminalSet.insert(factorId);
00162 }
00163 }
00164 }
00165 }
00166
00167 }
00168 }
00169 }