00001 #include <fstream>
00002 #include <iostream>
00003 #include<string>
00004 #include<sstream>
00005 #include<vector>
00006 #include<map>
00007 #include "Desegmenter.h"
00008 #include <boost/algorithm/string/replace.hpp>
00009
00010 using namespace std;
00011
00012 namespace Moses
00013 {
00014 void Desegmenter::Load(const string filename)
00015 {
00016
00017 std::ifstream myFile(filename.c_str() );
00018 if (myFile.is_open()) {
00019 cerr << "Desegmentation File open successful." << endl;
00020 string line;
00021 while (getline(myFile, line)) {
00022 stringstream ss(line);
00023 string token;
00024 vector<string> myline;
00025 while (getline(ss, token, '\t')) {
00026 myline.push_back(token);
00027 }
00028 mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
00029 }
00030 myFile.close();
00031 } else
00032 cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
00033 }
00034
00035
00036 vector<string> Desegmenter::Search(string myKey)
00037 {
00038 multimap<string, string>::const_iterator mmiPairFound = mmDesegTable.find(myKey);
00039 vector<string> result;
00040 if (mmiPairFound != mmDesegTable.end()) {
00041 size_t nNumPairsInMap = mmDesegTable.count(myKey);
00042 for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter) {
00043 if (mmiPairFound != mmDesegTable.end()) {
00044 result.push_back(mmiPairFound->second);
00045 }
00046 ++mmiPairFound;
00047 }
00048 return result;
00049 } else {
00050 string rule_deseg ;
00051 rule_deseg = ApplyRules(myKey);
00052 result.push_back(rule_deseg);
00053 return result;
00054 }
00055 }
00056
00057
00058 string Desegmenter::ApplyRules(string & segToken)
00059 {
00060
00061 string desegToken=segToken;
00062 if (!simple) {
00063 boost::replace_all(desegToken, "l+ All", "ll");
00064 boost::replace_all(desegToken, "l+ Al", "ll");
00065 boost::replace_all(desegToken, "y+ y ", "y");
00066 boost::replace_all(desegToken, "p+ ", "t");
00067 boost::replace_all(desegToken, "' +", "}");
00068 boost::replace_all(desegToken, "y +", "A");
00069 boost::replace_all(desegToken, "n +n", "n");
00070 boost::replace_all(desegToken, "mn +m", "mm");
00071 boost::replace_all(desegToken, "En +m", "Em");
00072 boost::replace_all(desegToken, "An +lA", "Em");
00073 boost::replace_all(desegToken, "-LRB-", "(");
00074 boost::replace_all(desegToken, "-RRB-", ")");
00075 }
00076
00077 boost::replace_all(desegToken, "+ +", "");
00078 boost::replace_all(desegToken, "+ ", "");
00079 boost::replace_all(desegToken, " +", "");
00080
00081 return desegToken;
00082 }
00083
00084 Desegmenter::~Desegmenter()
00085 {}
00086
00087 }