00001 #include "StringForestParser.h"
00002
00003 #include <istream>
00004 #include <string>
00005
00006 #include <boost/make_shared.hpp>
00007
00008 #include "util/tokenize_piece.hh"
00009
00010 #include "syntax-common/exception.h"
00011
00012 namespace MosesTraining
00013 {
00014 namespace Syntax
00015 {
00016 namespace FilterRuleTable
00017 {
00018
00019 StringForestParser::StringForestParser()
00020 : m_input(0)
00021 {
00022 }
00023
00024 StringForestParser::StringForestParser(std::istream &input)
00025 : m_input(&input)
00026 {
00027 ++(*this);
00028 }
00029
00030 StringForestParser &StringForestParser::operator++()
00031 {
00032 if (!m_input) {
00033 return *this;
00034 }
00035 m_vertexSet.clear();
00036 m_entry.forest.reset(new StringForest());
00037 if (!std::getline(*m_input, m_tmpLine)) {
00038 m_input = 0;
00039 return *this;
00040 }
00041
00042 ParseSentenceNumLine(m_tmpLine, m_entry.sentNum);
00043
00044 std::getline(*m_input, m_entry.sentence);
00045
00046
00047 std::getline(*m_input, m_tmpLine);
00048 if (m_tmpLine == "") {
00049 std::getline(*m_input, m_tmpLine);
00050 assert(m_tmpLine == "");
00051 return *this;
00052 }
00053 while (m_tmpLine != "") {
00054 ParseHyperedgeLine(m_tmpLine, *m_entry.forest);
00055 std::getline(*m_input, m_tmpLine);
00056 }
00057 return *this;
00058 }
00059
00060 StringForest::Vertex *StringForestParser::AddOrDeleteVertex(
00061 StringForest::Vertex *v)
00062 {
00063 std::pair<VertexSet::iterator, bool> ret = m_vertexSet.insert(v);
00064 if (ret.second) {
00065 m_entry.forest->vertices.push_back(*ret.first);
00066 } else {
00067 delete v;
00068 }
00069 return *ret.first;
00070 }
00071
00072 void StringForestParser::ParseSentenceNumLine(const std::string &line,
00073 std::size_t &sentNum)
00074 {
00075 const util::AnyCharacter delimiter(" \t");
00076 util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
00077 if (*p != "sentence") {
00078
00079 throw Exception("");
00080 }
00081 ++p;
00082 std::string tmp;
00083 p->CopyToString(&tmp);
00084 sentNum = std::atoi(tmp.c_str());
00085 }
00086
00087 void StringForestParser::ParseHyperedgeLine(const std::string &line,
00088 StringForest &forest)
00089 {
00090 const util::AnyCharacter delimiter(" \t");
00091 util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
00092 StringForest::Vertex *v = AddOrDeleteVertex(ParseVertex(*p));
00093 StringForest::Hyperedge *e = new StringForest::Hyperedge();
00094 e->head = v;
00095 ++p;
00096 if (*p != "=>") {
00097
00098 throw Exception("");
00099 }
00100 for (++p; *p != "|||"; ++p) {
00101 v = ParseVertex(*p);
00102 if (v->value.start == -1) {
00103
00104 v->value.start = v->value.end = e->head->value.start;
00105 }
00106 e->tail.push_back(AddOrDeleteVertex(v));
00107 }
00108
00109 e->head->incoming.push_back(e);
00110 }
00111
00112 StringForest::Vertex *StringForestParser::ParseVertex(const StringPiece &s)
00113 {
00114 StringForest::Vertex *v = new StringForest::Vertex();
00115 std::size_t pos = s.rfind('[');
00116 if (pos == std::string::npos) {
00117 s.CopyToString(&v->value.symbol);
00118
00119 v->value.start = v->value.end = -1;
00120 return v;
00121 }
00122 if (pos > 2 && s[pos-2] == '^' && s[pos-1] == 'g') {
00123 s.substr(0, pos-2).CopyToString(&v->value.symbol);
00124 } else {
00125 s.substr(0, pos).CopyToString(&v->value.symbol);
00126 }
00127
00128 std::size_t begin = pos + 1;
00129 pos = s.find(',', begin+1);
00130 std::string tmp;
00131 s.substr(begin, pos-begin).CopyToString(&tmp);
00132 v->value.start = std::atoi(tmp.c_str());
00133 s.substr(pos+1, s.size()-pos-2).CopyToString(&tmp);
00134 v->value.end = std::atoi(tmp.c_str());
00135 return v;
00136 }
00137
00138 bool operator==(const StringForestParser &lhs, const StringForestParser &rhs)
00139 {
00140
00141 return lhs.m_input == rhs.m_input;
00142 }
00143
00144 bool operator!=(const StringForestParser &lhs, const StringForestParser &rhs)
00145 {
00146 return !(lhs == rhs);
00147 }
00148
00149 }
00150 }
00151 }