#include "ForestInput.h"

#include <algorithm>
#include <stdexcept>

#include <boost/make_shared.hpp>

#include "util/tokenize_piece.hh"

#include "moses/Syntax/F2S/Forest.h"
#include "moses/TranslationModel/PhraseDictionary.h"

#include "FactorCollection.h"
#include "StaticData.h"
#include "Util.h"
00015
00016 namespace Moses
00017 {
00018
00020 int ForestInput::
00021 Read(std::istream &in)
00022 {
00023 using Syntax::F2S::Forest;
00024
00025 m_forest = boost::make_shared<Forest>();
00026 m_rootVertex = NULL;
00027 m_vertexSet.clear();
00028
00029 std::string line;
00030 if (std::getline(in, line, '\n').eof()) {
00031 return 0;
00032 }
00033
00034
00035
00036 std::string sentence;
00037 std::getline(in, sentence);
00038
00039
00040
00041 std::getline(in, line);
00042 if (line == "") {
00043
00044 sentence = "";
00045
00046 std::getline(in, line);
00047 } else {
00048 do {
00049 ParseHyperedgeLine(line);
00050 std::getline(in, line);
00051 } while (line != "");
00052 }
00053
00054
00055
00056
00057 std::stringstream strme;
00058 strme << "<s> " << sentence << " </s>" << std::endl;
00059 Sentence::Read(strme);
00060
00061
00062 std::size_t maxEnd = FindMaxEnd(*m_forest);
00063
00064
00065 std::vector<Forest::Vertex *> topVertices;
00066 if (!m_forest->vertices.empty()) {
00067 FindTopVertices(*m_forest, topVertices);
00068 assert(topVertices.size() >= 1);
00069 }
00070
00071
00072 const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
00073
00074
00075 Forest::Vertex *startSymbol = NULL;
00076 {
00077 Word symbol;
00078 symbol.CreateFromString(Input, factorOrder, "<s>", false);
00079 Syntax::PVertex pvertex(Range(0, 0), symbol);
00080 startSymbol = new Forest::Vertex(pvertex);
00081 m_forest->vertices.push_back(startSymbol);
00082 }
00083
00084
00085 Forest::Vertex *endSymbol = NULL;
00086 {
00087 Word symbol;
00088 symbol.CreateFromString(Input, factorOrder, "</s>", false);
00089 Syntax::PVertex pvertex(Range(maxEnd+1, maxEnd+1), symbol);
00090 endSymbol = new Forest::Vertex(pvertex);
00091 m_forest->vertices.push_back(endSymbol);
00092 }
00093
00094
00095 {
00096 Word symbol;
00097 symbol.CreateFromString(Input, factorOrder, "Q", true);
00098 Syntax::PVertex pvertex(Range(0, maxEnd+1), symbol);
00099 m_rootVertex = new Forest::Vertex(pvertex);
00100 m_forest->vertices.push_back(m_rootVertex);
00101 }
00102
00103
00104 if (topVertices.empty()) {
00105 Forest::Hyperedge *e = new Forest::Hyperedge();
00106 e->head = m_rootVertex;
00107 e->tail.push_back(startSymbol);
00108 e->tail.push_back(endSymbol);
00109 m_rootVertex->incoming.push_back(e);
00110 } else {
00111
00112 for (std::vector<Forest::Vertex *>::const_iterator
00113 p = topVertices.begin(); p != topVertices.end(); ++p) {
00114 Forest::Hyperedge *e = new Forest::Hyperedge();
00115 e->head = m_rootVertex;
00116 e->tail.push_back(startSymbol);
00117 e->tail.push_back(*p);
00118 e->tail.push_back(endSymbol);
00119 m_rootVertex->incoming.push_back(e);
00120 }
00121 }
00122
00123 return 1;
00124 }
00125
00126 Syntax::F2S::Forest::Vertex*
00127 ForestInput::
00128 AddOrDeleteVertex(Forest::Vertex *v)
00129 {
00130 std::pair<VertexSet::iterator, bool> ret = m_vertexSet.insert(v);
00131 if (ret.second) {
00132 m_forest->vertices.push_back(*ret.first);
00133 } else {
00134 delete v;
00135 }
00136 return *ret.first;
00137 }
00138
00139 std::size_t ForestInput::FindMaxEnd(const Forest &forest)
00140 {
00141 std::size_t maxEnd = 0;
00142 for (std::vector<Forest::Vertex *>::const_iterator
00143 p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
00144 maxEnd = std::max(maxEnd, (*p)->pvertex.span.GetEndPos());
00145 }
00146 return maxEnd;
00147 }
00148
00149 void ForestInput::FindTopVertices(Forest &forest,
00150 std::vector<Forest::Vertex *> &topVertices)
00151 {
00152 topVertices.clear();
00153
00154
00155 std::set<Forest::Vertex *> all;
00156
00157
00158 std::set<Forest::Vertex *> preds;
00159
00160
00161 for (std::vector<Forest::Vertex *>::const_iterator
00162 p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
00163 all.insert(*p);
00164 for (std::vector<Forest::Hyperedge *>::const_iterator
00165 q = (*p)->incoming.begin(); q != (*p)->incoming.end(); ++q) {
00166 for (std::vector<Forest::Vertex*>::const_iterator
00167 r = (*q)->tail.begin(); r != (*q)->tail.end(); ++r) {
00168 preds.insert(*r);
00169 }
00170 }
00171 }
00172
00173
00174 std::set_difference(all.begin(), all.end(), preds.begin(), preds.end(),
00175 std::back_inserter(topVertices));
00176 }
00177
00178 void
00179 ForestInput::
00180 ParseHyperedgeLine(const std::string &line)
00181 {
00182 const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
00183 using Syntax::F2S::Forest;
00184
00185 const util::AnyCharacter delimiter(" \t");
00186 util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
00187 Forest::Vertex *v = AddOrDeleteVertex(ParseVertex(*p));
00188 Forest::Hyperedge *e = new Forest::Hyperedge();
00189 e->head = v;
00190 ++p;
00191 if (*p != "=>") {
00192
00193
00194 }
00195 for (++p; *p != "|||"; ++p) {
00196 v = ParseVertex(*p);
00197 if (!v->pvertex.symbol.IsNonTerminal()) {
00198
00199 v->pvertex.span = Range(e->head->pvertex.span.GetStartPos(),
00200 e->head->pvertex.span.GetStartPos());
00201 }
00202 e->tail.push_back(AddOrDeleteVertex(v));
00203 }
00204 ++p;
00205 std::string tmp;
00206 p->CopyToString(&tmp);
00207 e->weight = std::atof(tmp.c_str());
00208 e->head->incoming.push_back(e);
00209 }
00210
00211 Syntax::F2S::Forest::Vertex*
00212 ForestInput::ParseVertex(const StringPiece &s)
00213 {
00214 using Syntax::F2S::Forest;
00215 const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
00216 Word symbol;
00217 std::size_t pos = s.rfind('[');
00218 if (pos == std::string::npos) {
00219 symbol.CreateFromString(Input, factorOrder, s, false);
00220
00221 Range span(0, 0);
00222 return new Forest::Vertex(Syntax::PVertex(span, symbol));
00223 }
00224 symbol.CreateFromString(Input, factorOrder, s.substr(0, pos), true);
00225 std::size_t begin = pos + 1;
00226 pos = s.find(',', begin+1);
00227 std::string tmp;
00228 s.substr(begin, pos-begin).CopyToString(&tmp);
00229 std::size_t start = std::atoi(tmp.c_str());
00230 s.substr(pos+1, s.size()-pos-2).CopyToString(&tmp);
00231 std::size_t end = std::atoi(tmp.c_str());
00232
00233 Range span(start+1, end+1);
00234 return new Forest::Vertex(Syntax::PVertex(span, symbol));
00235 }
00236
00238 void ForestInput::Print(std::ostream &out) const
00239 {
00240 out << *this << "\n";
00241 }
00242
00244 TranslationOptionCollection* ForestInput::
00245 CreateTranslationOptionCollection() const
00246 {
00247
00248 return NULL;
00249 }
00250
00251
00252 std::ostream& operator<<(std::ostream &out, const ForestInput &)
00253 {
00254 return out;
00255 }
00256
00257 }