00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <cstdio>
00021 #include <iostream>
00022 #include <fstream>
00023 #include <vector>
00024 #include <string>
00025 #include <sstream>
00026 #include <cstdlib>
00027 #include <cstring>
00028
00029 #include "tables-core.h"
00030 #include "InputFileStream.h"
00031 #include "util/tokenize.hh"
00032
00033 using namespace std;
00034
// Command-line switches; main() updates them while parsing argv.

// Set by --Hierarchical.
bool hierarchicalFlag(false);
// Set by --OnlyDirect; processFiles() then leaves out the score field
// taken from the direct table.
bool onlyDirectFlag(false);
// Cleared by --NoPhraseCount; while set, processFiles() appends a constant
// phrase-count value to the scores of every output line.
bool phraseCountFlag(true);
// Set by --LogProb; selects 1 instead of 2.718 as that constant.
bool logProbFlag(false);
00039
// Prototypes of the functions defined after main().
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated );
bool getLine( std::istream &fileP, std::vector< std::string > &item );
std::string reverseAlignment(const std::string &alignments);
std::vector< std::string > splitLine(const char *line);
00044
/**
 * Split a string on any of the given delimiter characters and append the
 * resulting non-empty tokens to a vector.
 *
 * Runs of delimiters count as one separator, so no empty tokens are produced.
 * Existing contents of the output vector are kept.
 *
 * \param output     vector the tokens are appended to
 * \param str        text to split
 * \param delimiters set of separator characters (default: space and tab)
 */
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  typedef std::string::size_type Position;

  // Start of the current token; npos once only delimiters (or nothing) remain.
  Position tokenBegin = str.find_first_not_of(delimiters, 0);

  while (tokenBegin != std::string::npos) {
    // First delimiter after the token, or npos when the token runs to the end.
    const Position tokenEnd = str.find_first_of(delimiters, tokenBegin);

    // For tokenEnd == npos the length wraps to a huge value and substr()
    // clamps it to the end of the string.
    output.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));

    // Skip the delimiter run; searching from npos simply yields npos.
    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
  }
}
00063
00064 int main(int argc, char* argv[])
00065 {
00066 cerr << "Consolidate v2.0 written by Philipp Koehn\n"
00067 << "consolidating direct and indirect rule tables\n";
00068
00069 if (argc < 4) {
00070 cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect]\n";
00071 exit(1);
00072 }
00073 char* &fileNameDirect = argv[1];
00074 char* &fileNameIndirect = argv[2];
00075 char* &fileNameConsolidated = argv[3];
00076
00077 for(int i=4; i<argc; i++) {
00078 if (strcmp(argv[i],"--Hierarchical") == 0) {
00079 hierarchicalFlag = true;
00080 cerr << "processing hierarchical rules\n";
00081 } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
00082 onlyDirectFlag = true;
00083 cerr << "only including direct translation scores p(e|f)\n";
00084 } else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
00085 phraseCountFlag = false;
00086 cerr << "not including the phrase count feature\n";
00087 } else if (strcmp(argv[i],"--LogProb") == 0) {
00088 logProbFlag = true;
00089 cerr << "using log-probabilities\n";
00090 } else {
00091 cerr << "ERROR: unknown option " << argv[i] << endl;
00092 exit(1);
00093 }
00094 }
00095
00096 processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
00097 }
00098
00099 void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
00100 {
00101
00102 Moses::InputFileStream fileDirect(fileNameDirect);
00103 Moses::InputFileStream fileIndirect(fileNameIndirect);
00104
00105 if (fileDirect.fail()) {
00106 cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
00107 exit(1);
00108 }
00109 istream &fileDirectP = fileDirect;
00110
00111 if (fileIndirect.fail()) {
00112 cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
00113 exit(1);
00114 }
00115 istream &fileIndirectP = fileIndirect;
00116
00117
00118 ofstream fileConsolidated;
00119 fileConsolidated.open(fileNameConsolidated);
00120 if (fileConsolidated.fail()) {
00121 cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
00122 exit(1);
00123 }
00124
00125
00126 int i=0;
00127 while(true) {
00128 i++;
00129 if (i%100000 == 0) cerr << "." << flush;
00130
00131 vector< string > itemDirect, itemIndirect;
00132 if (! getLine(fileIndirectP,itemIndirect) ||
00133 ! getLine(fileDirectP, itemDirect ))
00134 break;
00135
00136
00137
00138
00139
00140 if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
00141 cerr << "ERROR: target phrase does not match in line " << i << ": '"
00142 << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
00143 exit(1);
00144 }
00145
00146 if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
00147 cerr << "ERROR: source phrase does not match in line " << i << ": '"
00148 << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
00149 exit(1);
00150 }
00151
00152
00153 fileConsolidated << itemDirect[1] << " ||| " << itemDirect[0];
00154
00155
00156 fileConsolidated << " ||| ";
00157 if (!onlyDirectFlag) {
00158 fileConsolidated << itemDirect[2];
00159 }
00160 fileConsolidated << " " << itemIndirect[2];
00161 if (phraseCountFlag) {
00162 fileConsolidated << " " << (logProbFlag ? 1 : 2.718);
00163 }
00164
00165
00166 fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
00167
00168
00169 const vector<string> directCounts = util::tokenize(itemDirect[4]);
00170 const vector<string> indirectCounts = util::tokenize(itemIndirect[4]);
00171 fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
00172
00173 if (indirectCounts.size() > 1) {
00174 fileConsolidated << " " << indirectCounts[1];
00175 } else if (directCounts.size() > 1) {
00176 fileConsolidated << " " << directCounts[1];
00177 }
00178
00179 fileConsolidated << endl;
00180 }
00181 fileDirect.Close();
00182 fileIndirect.Close();
00183 fileConsolidated.close();
00184 }
00185
00186 bool getLine( istream &fileP, vector< string > &item )
00187 {
00188 if (fileP.eof())
00189 return false;
00190
00191 string line;
00192 if (getline(fileP, line)) {
00193 item = splitLine(line.c_str());
00194 return false;
00195 } else {
00196 return false;
00197 }
00198 }
00199
/**
 * Split a table line at every " ||| " separator.
 *
 * Two separators may share a single space (" ||| ||| "); the field between
 * them is then reported as empty. The result always holds at least one
 * field, namely the text after the last separator (the whole line if there
 * is none).
 *
 * \param line NUL-terminated line without the trailing newline
 * \return the fields in order of appearance
 */
vector< string > splitLine(const char *line)
{
  static const char separator[] = " ||| ";

  vector< string > fields;
  const char *fieldBegin = line;  // first character of the pending field
  const char *searchFrom = line;  // where the next separator search starts

  while (const char *hit = strstr(searchFrom, separator)) {
    // After a shared-space separator the pending field would start one
    // character behind the hit; clamp it so the field comes out empty.
    if (fieldBegin > hit) fieldBegin = hit;
    fields.push_back( string( fieldBegin, hit - fieldBegin ) );

    fieldBegin = hit + 5;
    // Resume on the separator's trailing space so that it can also serve as
    // the leading space of the next separator.
    searchFrom = hit + 4;
  }

  // Whatever follows the last separator forms the final field.
  fields.push_back( string( fieldBegin ) );

  return fields;
}
00221
00222 string reverseAlignment(const string &alignments)
00223 {
00224 stringstream ret("");
00225
00226 const vector<string> alignToks = util::tokenize(alignments);
00227
00228 for (size_t i = 0; i < alignToks.size(); ++i) {
00229 const string &alignPair = alignToks[i];
00230 vector<string> alignPoints;
00231 Tokenize(alignPoints, alignPair, "-");
00232 assert(alignPoints.size() == 2);
00233
00234 ret << alignPoints[1] << "-" << alignPoints[0] << " ";
00235 }
00236
00237 return ret.str();
00238 }
00239
00240