00001
00002
00003
00004
00005
00006
00007
00008 #include "AlignedSentenceSyntax.h"
00009 #include "Parameter.h"
00010 #include "pugixml.hpp"
00011 #include "moses/Util.h"
00012
00013 using namespace std;
00014
00015 AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum,
00016 const std::string &source,
00017 const std::string &target,
00018 const std::string &alignment)
00019 :AlignedSentence(lineNum)
00020 ,m_sourceStr(source)
00021 ,m_targetStr(target)
00022 ,m_alignmentStr(alignment)
00023 {
00024 }
00025
00026 AlignedSentenceSyntax::~AlignedSentenceSyntax()
00027 {
00028
00029 }
00030
00031 void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter ¶ms,
00032 string line, Phrase &phrase, SyntaxTree &tree)
00033 {
00034
00035 if (isSyntax) {
00036 line = "<xml><tree label=\"X\">" + line + "</tree></xml>";
00037 XMLParse(phrase, tree, line, params);
00038
00039 if (mixedSyntaxType != 0) {
00040
00041 tree.SetHieroLabel(params.hieroNonTerm);
00042 if (mixedSyntaxType == 2) {
00043 tree.AddToAll(params.hieroNonTerm);
00044 }
00045 }
00046 } else {
00047 PopulateWordVec(phrase, line);
00048 tree.SetHieroLabel(params.hieroNonTerm);
00049 }
00050
00051 }
00052
00053 void AlignedSentenceSyntax::Create(const Parameter ¶ms)
00054 {
00055 Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr,
00056 m_source, m_sourceTree);
00057 Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr,
00058 m_target, m_targetTree);
00059
00060 PopulateAlignment(m_alignmentStr);
00061 CreateConsistentPhrases(params);
00062
00063
00064 CreateNonTerms();
00065 }
00066
/**
 * Escapes, in place, the characters that are reserved in XML markup or in
 * the Moses text formats.
 *
 * Mapping:
 *   &  ->  &amp;      |  ->  &#124;
 *   <  ->  &lt;       >  ->  &gt;
 *   '  ->  &apos;     "  ->  &quot;
 *   [  ->  &#91;      ]  ->  &#93;
 *
 * Every other character is copied through unchanged. The text is processed
 * in a single pass, so the '&' that starts an inserted entity is never
 * escaped a second time.
 *
 * @param text  string to escape; overwritten with the escaped result
 */
void Escape(std::string &text)
{
  std::string escaped;
  escaped.reserve(text.size());

  for (std::string::size_type i = 0; i < text.size(); ++i) {
    const char ch = text[i];
    switch (ch) {
    case '&':
      escaped += "&amp;";
      break;
    case '|':
      escaped += "&#124;";
      break;
    case '<':
      escaped += "&lt;";
      break;
    case '>':
      escaped += "&gt;";
      break;
    case '\'':
      escaped += "&apos;";
      break;
    case '"':
      escaped += "&quot;";
      break;
    case '[':
      escaped += "&#91;";
      break;
    case ']':
      escaped += "&#93;";
      break;
    default:
      escaped += ch;
      break;
    }
  }

  text.swap(escaped);
}
00079
00080 void AlignedSentenceSyntax::XMLParse(Phrase &output,
00081 SyntaxTree &tree,
00082 const pugi::xml_node &parentNode,
00083 const Parameter ¶ms)
00084 {
00085 int childNum = 0;
00086 for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
00087 string nodeName = childNode.name();
00088
00089
00090 string label;
00091 int startPos = output.size();
00092
00093 if (!nodeName.empty()) {
00094 pugi::xml_attribute attribute = childNode.attribute("label");
00095 label = attribute.as_string();
00096
00097
00098 XMLParse(output, tree, childNode, params);
00099 }
00100
00101
00102
00103
00104 string text = childNode.value();
00105 Escape(text);
00106
00107
00108 std::vector<string> toks;
00109 Moses::Tokenize(toks, text);
00110
00111 for (size_t i = 0; i < toks.size(); ++i) {
00112 const string &tok = toks[i];
00113 Word *word = new Word(output.size(), tok);
00114 output.push_back(word);
00115 }
00116
00117
00118 int endPos = output.size() - 1;
00119
00120
00121 if (!label.empty()) {
00122 label = "[" + label + "]";
00123 tree.Add(startPos, endPos, label, params);
00124 }
00125
00126 ++childNum;
00127 }
00128
00129 }
00130
00131 void AlignedSentenceSyntax::XMLParse(Phrase &output,
00132 SyntaxTree &tree,
00133 const std::string input,
00134 const Parameter ¶ms)
00135 {
00136 pugi::xml_document doc;
00137 pugi::xml_parse_result result = doc.load(input.c_str(),
00138 pugi::parse_default | pugi::parse_comments);
00139
00140 pugi::xml_node topNode = doc.child("xml");
00141 XMLParse(output, tree, topNode, params);
00142 }
00143
00144 void AlignedSentenceSyntax::CreateNonTerms()
00145 {
00146 for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) {
00147 for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) {
00148 ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd);
00149 const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd);
00150
00151 ConsistentPhrases::Coll::iterator iter;
00152 for (iter = coll.begin(); iter != coll.end(); ++iter) {
00153 ConsistentPhrase &cp = **iter;
00154
00155 int targetStart = cp.corners[2];
00156 int targetEnd = cp.corners[3];
00157 const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd);
00158
00159 CreateNonTerms(cp, sourceLabels, targetLabels);
00160 }
00161 }
00162 }
00163
00164 }
00165
00166 void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp,
00167 const SyntaxTree::Labels &sourceLabels,
00168 const SyntaxTree::Labels &targetLabels)
00169 {
00170 SyntaxTree::Labels::const_iterator iterSource;
00171 for (iterSource = sourceLabels.begin(); iterSource != sourceLabels.end(); ++iterSource) {
00172 const string &sourceLabel = *iterSource;
00173
00174 SyntaxTree::Labels::const_iterator iterTarget;
00175 for (iterTarget = targetLabels.begin(); iterTarget != targetLabels.end(); ++iterTarget) {
00176 const string &targetLabel = *iterTarget;
00177 cp.AddNonTerms(sourceLabel, targetLabel);
00178 }
00179 }
00180 }
00181
00182