00001 #include "DomainFeature.h"
00002 #include "ExtractionPhrasePair.h"
00003 #include "tables-core.h"
00004 #include "InputFileStream.h"
00005 #include "util/tokenize.hh"
00006
00007 using namespace std;
00008
00009 namespace MosesTraining
00010 {
00011
00012
00013 void Domain::load( const std::string &domainFileName )
00014 {
00015 Moses::InputFileStream fileS( domainFileName );
00016 istream *fileP = &fileS;
00017
00018 string line;
00019 while(getline(*fileP, line)) {
00020
00021 const vector< string > domainSpecLine = util::tokenize( line );
00022 int lineNumber;
00023 if (domainSpecLine.size() != 2 ||
00024 ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
00025 std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
00026 exit(1);
00027 }
00028
00029 const string &name = domainSpecLine[1];
00030 spec.push_back( make_pair( lineNumber, name ));
00031 if (name2id.find( name ) == name2id.end()) {
00032 name2id[ name ] = list.size();
00033 list.push_back( name );
00034 }
00035 }
00036 }
00037
00038
00039 string Domain::getDomainOfSentence( int sentenceId ) const
00040 {
00041 for(size_t i=0; i<spec.size(); i++) {
00042 if (sentenceId <= spec[i].first) {
00043 return spec[i].second;
00044 }
00045 }
00046 return "undefined";
00047 }
00048
00049 DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
00050 {
00051
00052 m_domain.load(domainFile);
00053 }
00054
00055 void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
00056 float count,
00057 int sentenceId) const
00058 {
00059 std::string value = m_domain.getDomainOfSentence(sentenceId);
00060 phrasePair.AddProperty(m_propertyKey, value, count);
00061 }
00062
00063 void DomainFeature::add(const ScoreFeatureContext& context,
00064 std::vector<float>& denseValues,
00065 std::map<std::string,float>& sparseValues) const
00066 {
00067 const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
00068 assert( domainCount != NULL );
00069 add(*domainCount,
00070 context.phrasePair.GetCount(),
00071 context.maybeLog,
00072 denseValues, sparseValues);
00073 }
00074
00075 void SubsetDomainFeature::add(const map<string,float>& domainCount,
00076 float count,
00077 const MaybeLog& maybeLog,
00078 std::vector<float>& denseValues,
00079 std::map<std::string,float>& sparseValues) const
00080 {
00081 if (m_domain.list.size() > 6) {
00082 UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
00083 "too many domains for core domain subset features");
00084 }
00085 size_t bitmap = 0;
00086 for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
00087 if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
00088 bitmap += 1 << bit;
00089 }
00090 }
00091 for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
00092 denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
00093 }
00094 }
00095
00096 void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
00097 const MaybeLog& maybeLog,
00098 std::vector<float>& denseValues,
00099 std::map<std::string,float>& sparseValues) const
00100 {
00101 typedef vector<string>::const_iterator I;
00102 ostringstream key;
00103 key << "doms";
00104 for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
00105 if (domainCount.find(*i) != domainCount.end()) {
00106 key << "_" << *i;
00107 }
00108 }
00109 sparseValues[key.str()] = 1;
00110 }
00111
00112
00113 void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
00114 const MaybeLog& maybeLog,
00115 std::vector<float>& denseValues,
00116 std::map<std::string,float>& sparseValues) const
00117 {
00118 typedef vector< string >::const_iterator I;
00119 for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
00120 map<string,float>::const_iterator dci = domainCount.find(*i);
00121 if (dci == domainCount.end() ) {
00122 denseValues.push_back(maybeLog( 1 ));
00123 } else {
00124 denseValues.push_back(maybeLog(exp( dci->second / count ) ));
00125 }
00126 }
00127 }
00128
00129
00130 void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
00131 const MaybeLog& maybeLog,
00132 std::vector<float>& denseValues,
00133 std::map<std::string,float>& sparseValues) const
00134 {
00135 typedef map< string, float >::const_iterator I;
00136 for (I i=domainCount.begin(); i != domainCount.end(); i++) {
00137 sparseValues["domr_" + i->first] = (i->second / count);
00138 }
00139 }
00140
00141
00142 void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
00143 const MaybeLog& maybeLog,
00144 std::vector<float>& denseValues,
00145 std::map<std::string,float>& sparseValues) const
00146 {
00147 typedef vector< string >::const_iterator I;
00148 for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
00149 map<string,float>::const_iterator dci = domainCount.find(*i);
00150 if (dci == domainCount.end() ) {
00151 denseValues.push_back(maybeLog( 1 ));
00152 } else {
00153 denseValues.push_back(maybeLog(2.718));
00154 }
00155 }
00156 }
00157
00158 void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
00159 const MaybeLog& maybeLog,
00160 std::vector<float>& denseValues,
00161 std::map<std::string,float>& sparseValues) const
00162 {
00163 typedef map< string, float >::const_iterator I;
00164 for (I i=domainCount.begin(); i != domainCount.end(); i++) {
00165 sparseValues["dom_" + i->first] = 1;
00166 }
00167 }
00168
00169 }
00170