00001 #include "SplitPoint.h"
00002
#include <map>
#include <set>
#include <sstream>
#include <stdexcept>
00006
00007 #include "util/string_piece.hh"
00008 #include "util/tokenize_piece.hh"
00009
00010 #include "syntax-common/exception.h"
00011
00012 namespace MosesTraining
00013 {
00014 namespace Syntax
00015 {
00016 namespace PostprocessEgretForests
00017 {
00018
00019 void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints,
00020 std::string &sentence)
00021 {
00022 if (splitPoints.empty()) {
00023 return;
00024 }
00025
00026
00027 std::string connector;
00028 std::map<int, std::set<int> > points;
00029 for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
00030 p != splitPoints.end(); ++p) {
00031 points[p->tokenPos].insert(p->charPos);
00032 connector = p->connector;
00033 }
00034
00035
00036 std::vector<std::string> terminals;
00037 const util::AnyCharacter delim(" \t");
00038 for (util::TokenIter<util::AnyCharacter, true> p(sentence, delim); p; ++p) {
00039 terminals.resize(terminals.size()+1);
00040 p->CopyToString(&terminals.back());
00041 }
00042
00043
00044 for (std::map<int, std::set<int> >::const_iterator p = points.begin();
00045 p != points.end(); ++p) {
00046 std::string &word = terminals[p->first];
00047 int offset = 0;
00048 for (std::set<int>::const_iterator q = p->second.begin();
00049 q != p->second.end(); ++q) {
00050 std::string str = std::string("@") + connector + std::string("@");
00051 word.replace(*q+offset, connector.size(), str);
00052 offset += 2;
00053 }
00054 }
00055
00056 sentence.clear();
00057 for (std::size_t i = 0; i < terminals.size(); ++i) {
00058 if (i > 0) {
00059 sentence += " ";
00060 }
00061 sentence += terminals[i];
00062 }
00063 }
00064
00065 void MarkSplitPoints(const std::vector<SplitPoint> &splitPoints, Forest &forest)
00066 {
00067 if (splitPoints.empty()) {
00068 return;
00069 }
00070
00071
00072 std::string connector;
00073 std::map<int, std::set<int> > points;
00074 for (std::vector<SplitPoint>::const_iterator p = splitPoints.begin();
00075 p != splitPoints.end(); ++p) {
00076 points[p->tokenPos].insert(p->charPos);
00077 connector = p->connector;
00078 }
00079
00080
00081 std::vector<Forest::Vertex *> terminals;
00082 for (std::vector<boost::shared_ptr<Forest::Vertex> >::const_iterator
00083 p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
00084 if (!(*p)->incoming.empty()) {
00085 continue;
00086 }
00087 int pos = (*p)->start;
00088 if (pos >= terminals.size()) {
00089 terminals.resize(pos+1);
00090 }
00091 terminals[pos] = p->get();
00092 }
00093
00094
00095 for (std::map<int, std::set<int> >::const_iterator p = points.begin();
00096 p != points.end(); ++p) {
00097 std::string &word = terminals[p->first]->symbol.value;
00098 int offset = 0;
00099 for (std::set<int>::const_iterator q = p->second.begin();
00100 q != p->second.end(); ++q) {
00101 std::string str = std::string("@") + connector + std::string("@");
00102 word.replace(*q+offset, connector.size(), str);
00103 offset += 2;
00104 }
00105 }
00106
00107 }
00108
00109 }
00110 }
00111 }