00001 #include <fstream> 00002 #include <iostream> 00003 #include<string> 00004 #include<sstream> 00005 #include<vector> 00006 #include<map> 00007 #include "Desegmenter.h" 00008 #include <boost/algorithm/string/replace.hpp> 00009 00010 using namespace std; 00011 00012 namespace Moses 00013 { 00014 void Desegmenter::Load(const string filename) 00015 { 00016 00017 std::ifstream myFile(filename.c_str() ); 00018 if (myFile.is_open()) { 00019 cerr << "Desegmentation File open successful." << endl; 00020 string line; 00021 while (getline(myFile, line)) { 00022 stringstream ss(line); 00023 string token; 00024 vector<string> myline; 00025 while (getline(ss, token, '\t')) { 00026 myline.push_back(token); 00027 } 00028 mmDesegTable.insert(pair<string, string>(myline[2], myline[1] )); 00029 } 00030 myFile.close(); 00031 } else 00032 cerr << "open() failed: check if Desegmentation file is in right folder" << endl; 00033 } 00034 00035 00036 vector<string> Desegmenter::Search(string myKey) 00037 { 00038 multimap<string, string>::const_iterator mmiPairFound = mmDesegTable.find(myKey); 00039 vector<string> result; 00040 if (mmiPairFound != mmDesegTable.end()) { 00041 size_t nNumPairsInMap = mmDesegTable.count(myKey); 00042 for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter) { 00043 if (mmiPairFound != mmDesegTable.end()) { 00044 result.push_back(mmiPairFound->second); 00045 } 00046 ++mmiPairFound; 00047 } 00048 return result; 00049 } else { 00050 string rule_deseg ; 00051 rule_deseg = ApplyRules(myKey); 00052 result.push_back(rule_deseg); 00053 return result; 00054 } 00055 } 00056 00057 00058 string Desegmenter::ApplyRules(string & segToken) 00059 { 00060 00061 string desegToken=segToken; 00062 if (!simple) { 00063 boost::replace_all(desegToken, "l+ All", "ll"); 00064 boost::replace_all(desegToken, "l+ Al", "ll"); 00065 boost::replace_all(desegToken, "y+ y ", "y"); 00066 boost::replace_all(desegToken, "p+ ", "t"); 00067 boost::replace_all(desegToken, "' +", "}"); 00068 boost::replace_all(desegToken, "y +", "A"); 00069 boost::replace_all(desegToken, "n +n", "n"); 00070 boost::replace_all(desegToken, "mn +m", "mm"); 00071 boost::replace_all(desegToken, "En +m", "Em"); 00072 boost::replace_all(desegToken, "An +lA", "Em"); 00073 boost::replace_all(desegToken, "-LRB-", "("); 00074 boost::replace_all(desegToken, "-RRB-", ")"); 00075 } 00076 00077 boost::replace_all(desegToken, "+ +", ""); 00078 boost::replace_all(desegToken, "+ ", ""); 00079 boost::replace_all(desegToken, " +", ""); 00080 00081 return desegToken; 00082 } 00083 00084 Desegmenter::~Desegmenter() 00085 {} 00086 00087 }