00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "relax-parse.h"
00023 #include "tables-core.h"
00024 #include "util/tokenize.hh"
00025
00026 using namespace std;
00027 using namespace MosesTraining;
00028
00029 int main(int argc, char* argv[])
00030 {
00031 init( argc, argv );
00032
00033
00034 int i=0;
00035 string inBufferString;
00036 while(cin.peek() != EOF) {
00037 getline(cin,inBufferString);
00038 i++;
00039 if (i%1000 == 0) cerr << "." << flush;
00040 if (i%10000 == 0) cerr << ":" << flush;
00041 if (i%100000 == 0) cerr << "!" << flush;
00042
00043
00044 set< string > labelCollection;
00045 map< string, int > topLabelCollection;
00046 SyntaxNodeCollection tree;
00047 ProcessAndStripXMLTags( inBufferString, tree, labelCollection, topLabelCollection, false );
00048 const vector< string > inWords = util::tokenize( inBufferString );
00049
00050
00051
00052
00053 ParentNodes parents = determineSplitPoints(tree);
00054
00055
00056 if (leftBinarizeFlag)
00057 LeftBinarize( tree, parents );
00058
00059 if (rightBinarizeFlag)
00060 RightBinarize( tree, parents );
00061
00062 if (SAMTLevel>0)
00063 SAMT( tree, parents );
00064
00065
00066
00067
00068 store( tree, inWords );
00069 }
00070 }
00071
00072
00073
00074 void init(int argc, char* argv[])
00075 {
00076 cerr << "Parse Relaxer v1.0, written by Philipp Koehn\n";
00077 cerr << "adds additional constituents to a parse tree\n";
00078
00079 if (argc < 2) {
00080 cerr << "syntax: relax-parse < in-parse > out-parse ["
00081 << " --LeftBinarize | --RightBinarize |"
00082 << " --SAMT 1-4 ]" << endl;
00083 exit(1);
00084 }
00085
00086 for(int i=1; i<argc; i++) {
00087
00088 if (strcmp(argv[i],"--LeftBinarize") == 0) {
00089 leftBinarizeFlag = true;
00090 } else if (strcmp(argv[i],"--RightBinarize") == 0) {
00091 rightBinarizeFlag = true;
00092 }
00093
00094
00095 else if (strcmp(argv[i],"--SAMT") == 0) {
00096 SAMTLevel = atoi( argv[++i] );
00097 cerr << "using SAMT grammar, level " << SAMTLevel << endl;
00098 }
00099
00100
00101 else {
00102 cerr << "relax-grammar: syntax error, unknown option '" << string(argv[i]) << "'\n";
00103 exit(1);
00104 }
00105 }
00106 }
00107
00108 void store( SyntaxNodeCollection &tree, const vector< string > &words )
00109 {
00110
00111 for( size_t i=0; i<words.size(); i++ ) {
00112 if (i>0) {
00113 cout << " ";
00114 }
00115 cout << words[i];
00116 }
00117
00118
00119 vector< SyntaxNode* > nodes = tree.GetAllNodes();
00120 for( size_t i=0; i<nodes.size(); i++ ) {
00121 cout << " <tree span=\"" << nodes[i]->start
00122 << "-" << nodes[i]->end
00123 << "\" label=\"" << nodes[i]->label << "\"";
00124 for (SyntaxNode::AttributeMap::const_iterator
00125 p = nodes[i]->attributes.begin();
00126 p != nodes[i]->attributes.end(); ++p) {
00127 cout << " " << p->first << "=\"" << p->second << "\"";
00128 }
00129 cout << "/>";
00130 }
00131 cout << endl;
00132 }
00133
00134 void LeftBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
00135 {
00136 for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
00137 const SplitPoints &point = *p;
00138 if (point.size() > 3) {
00139 const vector< SyntaxNode* >& topNodes
00140 = tree.GetNodes( point[0], point[point.size()-1]-1);
00141 string topLabel = topNodes[0]->label;
00142
00143 for(size_t i=2; i<point.size()-1; i++) {
00144
00145 tree.AddNode( point[0], point[i]-1, "^" + topLabel );
00146 }
00147 }
00148 }
00149 }
00150
00151 void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
00152 {
00153 for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
00154 const SplitPoints &point = *p;
00155 if (point.size() > 3) {
00156 int endPoint = point[point.size()-1]-1;
00157 const vector< SyntaxNode* >& topNodes
00158 = tree.GetNodes( point[0], endPoint);
00159 string topLabel = topNodes[0]->label;
00160
00161 for(size_t i=1; i<point.size()-2; i++) {
00162
00163 tree.AddNode( point[i], endPoint, "^" + topLabel );
00164 }
00165 }
00166 }
00167 }
00168
00169 void SAMT( SyntaxNodeCollection &tree, ParentNodes &parents )
00170 {
00171 int numWords = tree.GetNumWords();
00172
00173 SyntaxNodeCollection newTree;
00174
00175
00176 for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
00177 const SplitPoints &point = *p;
00178
00179
00180 if (point.size() >= 3) {
00181
00182
00183
00184
00185 for(size_t i = 0; i+2 < point.size(); i++) {
00186
00187
00188 newTree.AddNode( point[i],point[i+2]-1,
00189 tree.GetNodes(point[i ],point[i+1]-1)[0]->label
00190 + "+" +
00191 tree.GetNodes(point[i+1],point[i+2]-1)[0]->label);
00192 }
00193 }
00194 if (point.size() >= 4) {
00195 int ps = point.size();
00196 string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->label;
00197
00198
00199 newTree.AddNode( point[1],point[ps-1]-1,
00200 topLabel
00201 + "\\" +
00202 tree.GetNodes(point[0],point[1]-1)[0]->label );
00203
00204
00205 newTree.AddNode( point[0],point[ps-2]-1,
00206 topLabel
00207 + "/" +
00208 tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->label );
00209 }
00210 }
00211
00212
00213 for(int size = 2; size < numWords; size++) {
00214 for(int start = 0; start < numWords-size+1; start++) {
00215 int end = start+size-1;
00216 bool done = false;
00217
00218 if (tree.HasNode( start,end ) || newTree.HasNode( start,end )
00219 || SAMTLevel <= 1) {
00220 continue;
00221 }
00222
00223
00224
00225 for(int mid=start+1; mid<=end && !done; mid++) {
00226 if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
00227
00228
00229 newTree.AddNode( start, end,
00230 tree.GetNodes(start,mid-1)[0]->label
00231 + "++" +
00232 tree.GetNodes(mid, end )[0]->label );
00233 done = true;
00234 }
00235 }
00236 if (done) continue;
00237
00238
00239 for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
00240 if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
00241 newTree.AddNode( start, end,
00242 tree.GetNodes(start,postEnd)[0]->label
00243 + "//" +
00244 tree.GetNodes(end+1,postEnd)[0]->label );
00245 done = true;
00246 }
00247 }
00248 if (done) continue;
00249
00250
00251 for(int preStart=start-1; preStart>=0; preStart--) {
00252 if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
00253
00254 newTree.AddNode( start, end,
00255 tree.GetNodes(preStart,end )[0]->label
00256 + "\\\\" +
00257 tree.GetNodes(preStart,start-1)[0]->label );
00258 done = true;
00259 }
00260 }
00261 if (done) continue;
00262
00263
00264
00265
00266
00267 if (SAMTLevel>=4) {
00268 newTree.AddNode( start, end, "_FAIL" );
00269 }
00270 }
00271 }
00272
00273
00274 vector< SyntaxNode* > nodes = newTree.GetAllNodes();
00275 for( size_t i=0; i<nodes.size(); i++ ) {
00276 tree.AddNode( nodes[i]->start, nodes[i]->end, nodes[i]->label);
00277 }
00278 }
00279
00280 ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
00281 {
00282 ParentNodes parents;
00283
00284 const std::size_t numWords = nodeColl.GetNumWords();
00285
00286
00287 for( int length=2; length<=numWords; length++ ) {
00288 for( int startPos = 0; startPos <= numWords-length; startPos++ ) {
00289 if (nodeColl.HasNode( startPos, startPos+length-1 )) {
00290
00291
00292
00293 SplitPoints splitPoints;
00294 splitPoints.push_back( startPos );
00295
00296
00297 int first = 1;
00298 int covered = 0;
00299 int found_somehing = 1;
00300 while( covered < length && found_somehing ) {
00301
00302
00303 found_somehing = 0;
00304 for( int midPos=length-first; midPos>covered; midPos-- ) {
00305 if( nodeColl.HasNode( startPos+covered, startPos+midPos-1 ) ) {
00306 covered = midPos;
00307 splitPoints.push_back( startPos+covered );
00308
00309 first = 0;
00310 found_somehing = 1;
00311 }
00312 }
00313 }
00314
00315 parents.push_back( splitPoints );
00316 }
00317 }
00318 }
00319 return parents;
00320 }