00001 #include "PCNTools.h"
00002
00003 #include <iostream>
00004 #include <cstdlib>
00005 #include "Util.h"
00006 #include "util/exception.hh"
00007
00008 using namespace std;
00009
00010 namespace PCN
00011 {
00012
// Special characters recognised by the quoted-word parser, kept in one string:
//   chars[0] is the single quote, chars[1] is the backslash.
// `quote` and `slash` are references into that string, so they stay valid for
// as long as `chars` is alive and unmodified (it is const).
const std::string chars("'\\");
const char& quote = chars[0];
const char& slash = chars[1];
00016
00017
// Bounds-checked character access.
// Returns in[c] when c is a valid index into `in`, and the NUL character (0)
// for any negative or past-the-end position, so callers can probe freely.
inline char get(const std::string& in, int c)
{
  const bool inRange = (c >= 0) && (c < static_cast<int>(in.size()));
  return inRange ? in[static_cast<std::string::size_type>(c)] : static_cast<char>(0);
}
00023
00024
00025 inline void eatws(const std::string& in, int& c)
00026 {
00027 while (get(in,c) == ' ') {
00028 c++;
00029 }
00030 }
00031
00032 std::string getString(const std::string& in, int &c)
00033 {
00034 std::string ret;
00035 eatws(in,c);
00036 while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
00037 ret += get(in,c++);
00038 }
00039 eatws(in,c);
00040 return ret;
00041 }
00042
00043
00044 std::string getEscapedString(const std::string& in, int &c)
00045 {
00046 eatws(in,c);
00047 if (get(in,c++) != quote) return "ERROR";
00048 std::string res;
00049 char cur = 0;
00050 do {
00051 cur = get(in,c++);
00052 if (cur == slash) {
00053 res += get(in,c++);
00054 } else if (cur != quote) {
00055 res += cur;
00056 }
00057 } while (get(in,c) != quote && (c < (int)in.size()));
00058 c++;
00059 eatws(in,c);
00060 return res;
00061 }
00062
00063
00064 float getFloat(const std::string& in, int &c)
00065 {
00066 std::string tmp;
00067 eatws(in,c);
00068 while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
00069 tmp += get(in,c++);
00070 }
00071 eatws(in,c);
00072 return atof(tmp.c_str());
00073 }
00074
00075
00076 int getInt(const std::string& in, int &c)
00077 {
00078 std::string tmp;
00079 eatws(in,c);
00080 while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
00081 tmp += get(in,c++);
00082 }
00083 eatws(in,c);
00084 return atoi(tmp.c_str());
00085 }
00086
00087
00088 CNAlt getCNAlt(const std::string& in, int &c)
00089 {
00090 if (get(in,c++) != '(') {
00091 std::cerr << "PCN/PLF parse error: expected ( at start of cn alt block\n";
00092 return CNAlt();
00093 }
00094 std::string word = getEscapedString(in,c);
00095 if (get(in,c++) != ',') {
00096 std::cerr << "PCN/PLF parse error: expected , after string\n";
00097 return CNAlt();
00098 }
00099 size_t cnNext = 1;
00100
00101
00102 std::vector<string> toks;
00103 toks.push_back(getString(in,c));
00104 while (get(in,c) == ',') {
00105 c++;
00106 string tok = getString(in,c);
00107 toks.push_back(tok);
00108 }
00109
00110 std::vector<float> probs;
00111
00112
00113 size_t ind;
00114 for (ind = 0; ind < toks.size() - 1; ++ind) {
00115 const string &tok = toks[ind];
00116
00117 if (tok.find('=') == tok.npos) {
00118 float val = Moses::Scan<float>(tok);
00119 probs.push_back(val);
00120 } else {
00121
00122 break;
00123 }
00124 }
00125
00126
00127 std::map<string, float> sparseFeatures;
00128 for (; ind < toks.size() - 1; ++ind) {
00129 const string &tok = toks[ind];
00130 vector<string> keyValue = Moses::Tokenize(tok, "=");
00131 UTIL_THROW_IF2(keyValue.size() != 2, "Format error: " << tok);
00132 float prob = Moses::Scan<float>(keyValue[1]);
00133 sparseFeatures[ keyValue[0] ] = prob;
00134 }
00135
00136
00137 cnNext = Moses::Scan<size_t>(toks.back());
00138
00139 if (get(in,c++) != ')') {
00140 std::cerr << "PCN/PLF parse error: expected ) at end of cn alt block\n";
00141 return CNAlt();
00142 }
00143 eatws(in,c);
00144 return CNAlt(word, probs, sparseFeatures, cnNext);
00145 }
00146
00147
00148 CNCol getCNCol(const std::string& in, int &c)
00149 {
00150 CNCol res;
00151 if (get(in,c++) != '(') return res;
00152 eatws(in,c);
00153 while (1) {
00154 if (c > (int)in.size()) {
00155 break;
00156 }
00157 if (get(in,c) == ')') {
00158 c++;
00159 eatws(in,c);
00160 break;
00161 }
00162 if (get(in,c) == ',' && get(in,c+1) == ')') {
00163 c+=2;
00164 eatws(in,c);
00165 break;
00166 }
00167 if (get(in,c) == ',') {
00168 c++;
00169 eatws(in,c);
00170 }
00171 res.push_back(getCNAlt(in, c));
00172 }
00173 return res;
00174 }
00175
00176
00177 CN parsePCN(const std::string& in)
00178 {
00179 CN res;
00180 int c = 0;
00181 if (in[c++] != '(') return res;
00182 while (1) {
00183 if (c > (int)in.size()) {
00184 break;
00185 }
00186 if (get(in,c) == ')') {
00187 c++;
00188 eatws(in,c);
00189 break;
00190 }
00191 if (get(in,c) == ',' && get(in,c+1) == ')') {
00192 c+=2;
00193 eatws(in,c);
00194 break;
00195 }
00196 if (get(in,c) == ',') {
00197 c++;
00198 eatws(in,c);
00199 }
00200 res.push_back(getCNCol(in, c));
00201 }
00202 return res;
00203 }
00204
00205
00206 }
00207