00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifdef WIN32
00023 #include <windows.h>
00024 #else
00025 #include <sys/times.h>
00026 #include <sys/resource.h>
00027 #endif
00028
00029 #include <cstring>
00030 #include <cctype>
00031 #include <algorithm>
00032 #include <cstdio>
00033 #include <iostream>
00034 #include <iomanip>
00035 #include <boost/algorithm/string/predicate.hpp>
00036 #include "TypeDef.h"
00037 #include "Util.h"
00038
00039 #include "util/exception.hh"
00040 #include "util/file.hh"
00041 #include "moses/FF/StatelessFeatureFunction.h"
00042 #include "moses/FF/StatefulFeatureFunction.h"
00043 #include "moses/StaticData.h"
00044
00045 using namespace std;
00046 using namespace boost::algorithm;
00047
00048 namespace Moses
00049 {
00050
00051
/**
 * Returns the directory to use for temporary files, always terminated by a
 * path separator.
 *
 * On Windows the directory is taken from the TMP environment variable,
 * falling back to TEMP and finally to the current directory (".") when
 * neither variable is set. On every other platform the fixed string "/tmp/"
 * is returned.
 */
std::string GetTempFolder()
{
#ifdef _WIN32
  // getenv() returns a null pointer for an unset variable, and constructing a
  // std::string from a null pointer is undefined behaviour, so every lookup
  // is checked before it is used.
  const char *tmpPath = getenv("TMP");
  if (tmpPath == NULL) {
    tmpPath = getenv("TEMP");
  }
  std::string str(tmpPath != NULL ? tmpPath : ".");
  // Make sure the result can be prefixed directly onto a file name.
  if (str.empty() || str[str.size() - 1] != '\\')
    str += "\\";
  return str;
#else
  return "/tmp/";
#endif
}
00064
/**
 * Lowercases a single character.
 *
 * std::tolower() is only defined for arguments representable as
 * unsigned char (or EOF). A plain char holding a non-ASCII byte may be
 * negative, so the value is converted to unsigned char before the call.
 */
static char ToLowerChar(char c)
{
  return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
}

/**
 * Returns a copy of str in which every character has been passed through
 * std::tolower() (byte by byte, using the current C locale).
 *
 * @param str text to convert; it is not modified
 * @return the lowercased copy, same length as str
 */
const std::string ToLower(const std::string& str)
{
  std::string lc(str);
  std::transform(lc.begin(), lc.end(), lc.begin(), ToLowerChar);
  return lc;
}
00071
00072 class BoolValueException : public util::Exception {};
00073
00074 template<>
00075 bool Scan<bool>(const std::string &input)
00076 {
00077 std::string lc = ToLower(input);
00078 if (lc == "yes" || lc == "y" || lc == "true" || lc == "1")
00079 return true;
00080 if (lc == "no" || lc == "n" || lc =="false" || lc == "0")
00081 return false;
00082 UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
00083 }
00084
/**
 * Reports whether filePath can be opened for reading.
 *
 * The check is done by constructing an input file stream on the path and
 * inspecting its fail state; the stream is closed again on return.
 *
 * @param filePath path to probe
 * @return true when the stream opened without setting its fail state
 */
bool FileExists(const std::string& filePath)
{
  std::ifstream probe(filePath.c_str());
  if (probe.fail()) {
    return false;
  }
  return true;
}
00090
00091 std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string &line)
00092 {
00093 std::vector< std::map<std::string, std::string> > meta;
00094 std::string lline = ToLower(line);
00095 bool check_dlt = true;
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105 while (check_dlt) {
00106 size_t start = lline.find("<dlt");
00107 if (start == std::string::npos) {
00108
00109 check_dlt = false;
00110 continue;
00111 }
00112 size_t close = lline.find("/>");
00113 if (close == std::string::npos) {
00114
00115 check_dlt = false;
00116 continue;
00117 }
00118
00119 std::string dlt = Trim(line.substr(start+4, close-start-4));
00120
00121 line.erase(start,close-start+2);
00122 lline.erase(start,close-start+2);
00123
00124 if (dlt != "") {
00125 std::map<std::string, std::string> tmp_meta;
00126
00127
00128 size_t start_type = dlt.find("type=");
00129 size_t len_type=4;
00130 if (start_type != std::string::npos) {
00131
00132
00133
00134
00135 std::string val_type;
00136 std::string label_type = dlt.substr(start_type, len_type);
00137 if (dlt[start_type+len_type+1] == '"') {
00138 val_type = dlt.substr(start_type+len_type+2);
00139 size_t close_type = val_type.find('"');
00140 val_type = val_type.substr(0, close_type);
00141 dlt.erase(start_type,start_type+len_type+2+close_type+1);
00142 } else {
00143 TRACE_ERR("DLT parse error: missing character \" for type \n");
00144 }
00145 label_type = Trim(label_type);
00146 dlt = Trim(dlt);
00147
00148 tmp_meta[label_type] = val_type;
00149 } else {
00150
00151 UTIL_THROW(util::Exception, "ProcessAndStripDLT(std::string &line): Attribute type for dlt tag is mandatory.");
00152 }
00153
00154
00155 size_t start_id = dlt.find("id=");
00156 size_t len_id=2;
00157 if (start_id != std::string::npos) {
00158
00159
00160
00161
00162 std::string val_id;
00163 std::string label_id = dlt.substr(start_id, len_id);
00164 if (dlt[start_id+len_id+1] == '"') {
00165 val_id = dlt.substr(start_id+len_id+2);
00166 size_t close_id = val_id.find('"');
00167 val_id = val_id.substr(0, close_id);
00168 dlt.erase(start_id,start_id+len_id+2+close_id+1);
00169 } else {
00170 TRACE_ERR("DLT parse error: missing character \" for id \n");
00171 }
00172 label_id = Trim(label_id);
00173 dlt = Trim(dlt);
00174
00175 tmp_meta[label_id] = val_id;
00176 } else {
00177
00178
00179 }
00180
00181 for (size_t i = 1; i < dlt.size(); i++) {
00182 if (dlt[i] == '=') {
00183 std::string label = dlt.substr(0, i);
00184 std::string val = dlt.substr(i+1);
00185 if (val[0] == '"') {
00186
00187 val = val.substr(1);
00188
00189
00190
00191 size_t close = val.rfind('"');
00192 if (close == std::string::npos) {
00193 TRACE_ERR("SGML parse error: missing \"\n");
00194 dlt = "";
00195 i = 0;
00196 } else {
00197 dlt = val.substr(close+1);
00198 val = val.substr(0, close);
00199 i = 0;
00200 }
00201 } else {
00202 size_t close = val.find(' ');
00203 if (close == std::string::npos) {
00204 dlt = "";
00205 i = 0;
00206 } else {
00207 dlt = val.substr(close+1);
00208 val = val.substr(0, close);
00209 }
00210 }
00211 label = Trim(label);
00212 dlt = Trim(dlt);
00213
00214 tmp_meta[label] = val;
00215 }
00216 }
00217
00218 meta.push_back(tmp_meta);
00219 }
00220 }
00221
00222 return meta;
00223 }
00224
00225 std::map<std::string, std::string> ProcessAndStripSGML(std::string &line)
00226 {
00227 std::map<std::string, std::string> meta;
00228 std::string lline = ToLower(line);
00229 if (lline.find("<seg")!=0) return meta;
00230 size_t close = lline.find(">");
00231 if (close == std::string::npos) return meta;
00232 size_t end = lline.find("</seg>");
00233 std::string seg = Trim(lline.substr(4, close-4));
00234 std::string text = line.substr(close+1, end - close - 1);
00235 for (size_t i = 1; i < seg.size(); i++) {
00236 if (seg[i] == '=' && seg[i-1] == ' ') {
00237 std::string less = seg.substr(0, i-1) + seg.substr(i);
00238 seg = less;
00239 i = 0;
00240 continue;
00241 }
00242 if (seg[i] == '=' && seg[i+1] == ' ') {
00243 std::string less = seg.substr(0, i+1);
00244 if (i+2 < seg.size()) less += seg.substr(i+2);
00245 seg = less;
00246 i = 0;
00247 continue;
00248 }
00249 }
00250 line = Trim(text);
00251 if (seg == "") return meta;
00252 for (size_t i = 1; i < seg.size(); i++) {
00253 if (seg[i] == '=') {
00254 std::string label = seg.substr(0, i);
00255 std::string val = seg.substr(i+1);
00256 if (val[0] == '"') {
00257 val = val.substr(1);
00258 size_t close = val.find('"');
00259 if (close == std::string::npos) {
00260 TRACE_ERR("SGML parse error: missing \"\n");
00261 seg = "";
00262 i = 0;
00263 } else {
00264 seg = val.substr(close+1);
00265 val = val.substr(0, close);
00266 i = 0;
00267 }
00268 } else {
00269 size_t close = val.find(' ');
00270 if (close == std::string::npos) {
00271 seg = "";
00272 i = 0;
00273 } else {
00274 seg = val.substr(close+1);
00275 val = val.substr(0, close);
00276 }
00277 }
00278 label = Trim(label);
00279 seg = Trim(seg);
00280 meta[label] = val;
00281 }
00282 }
00283 return meta;
00284 }
00285
00286 std::string PassthroughSGML(std::string &line, const std::string tagName, const std::string& lbrackStr, const std::string& rbrackStr)
00287 {
00288 string lbrack = lbrackStr;
00289 string rbrack = rbrackStr;
00290
00291 std::string meta = "";
00292
00293 std::string lline = ToLower(line);
00294 size_t open = lline.find(lbrack+tagName);
00295
00296 if (open == std::string::npos) return meta;
00297
00298 size_t close = lline.find(rbrack, open);
00299
00300 if (close == std::string::npos) {
00301 TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n");
00302 return meta;
00303 }
00304
00305 std::string tmp = line.substr(open, close - open + 1);
00306 meta = line.substr(open, close - open + 1);
00307
00308
00309 line = line.substr(0, open) + line.substr(close + 1, std::string::npos);
00310
00311 TRACE_ERR("The input contains a <passthrough info/> tag:" << meta << std::endl);
00312
00313 lline = ToLower(line);
00314 open = lline.find(lbrack+tagName);
00315 if (open != std::string::npos) {
00316 TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n");
00317 }
00318 return meta;
00319 }
00320
00321 void PrintFeatureWeight(const FeatureFunction* ff)
00322 {
00323 cout << ff->GetScoreProducerDescription() << "=";
00324 size_t numScoreComps = ff->GetNumScoreComponents();
00325 vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
00326 for (size_t i = 0; i < numScoreComps; ++i) {
00327 if (ff->IsTuneableComponent(i)) {
00328 cout << " " << values[i];
00329 } else {
00330 cout << " UNTUNEABLECOMPONENT";
00331 }
00332 }
00333 cout << endl;
00334
00335 }
00336
00337 void ShowWeights()
00338 {
00339 FixPrecision(cout,6);
00340 const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
00341 const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
00342
00343 for (size_t i = 0; i < sff.size(); ++i) {
00344 const StatefulFeatureFunction *ff = sff[i];
00345 if (ff->IsTuneable()) {
00346 PrintFeatureWeight(ff);
00347 } else {
00348 cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
00349 }
00350 }
00351 for (size_t i = 0; i < slf.size(); ++i) {
00352 const StatelessFeatureFunction *ff = slf[i];
00353 if (ff->IsTuneable()) {
00354 PrintFeatureWeight(ff);
00355 } else {
00356 cout << ff->GetScoreProducerDescription() << " UNTUNEABLE" << endl;
00357 }
00358 }
00359 }
00360
00361 }
00362
00363