00001 #pragma once
00002
00003 #include <string>
00004 #include <algorithm>
00005 #include <boost/foreach.hpp>
00006 #include "ThreadLocalByFeatureStorage.h"
00007 #include "VWFeatureSource.h"
00008 #include "moses/Util.h"
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 namespace Moses
00022 {
00023
00024 class VWFeatureSourceSenseWindow : public VWFeatureSource
00025 {
00026 public:
00027 VWFeatureSourceSenseWindow(const std::string &line)
00028 : VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) {
00029 ReadParameters();
00030
00031
00032 VWFeatureBase::UpdateRegister();
00033 }
00034
00035
00036 virtual void InitializeForInput(ttasksptr const& ttask) {
00037 InputType const& input = *(ttask->GetSource().get());
00038
00039 std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
00040 std::vector<std::string>& forms = *m_tlsForms.GetStored();
00041 senses.clear();
00042 forms.clear();
00043
00044 senses.resize(input.GetSize());
00045 forms.resize(input.GetSize());
00046
00047 for (size_t i = 0; i < input.GetSize(); i++) {
00048 senses[i] = GetSenses(input, i);
00049 forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : "";
00050 }
00051 }
00052
00053 void operator()(const InputType &input
00054 , const Range &sourceRange
00055 , Discriminative::Classifier &classifier
00056 , Discriminative::FeatureVector &outFeatures) const {
00057 int begin = sourceRange.GetStartPos();
00058 int end = sourceRange.GetEndPos() + 1;
00059 int inputLen = input.GetSize();
00060
00061 const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
00062 const std::vector<std::string>& forms = *m_tlsForms.GetStored();
00063
00064
00065 for (int i = std::max(0, begin - m_size); i < begin; i++) {
00066 BOOST_FOREACH(const Sense &sense, senses[i]) {
00067 outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
00068 outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob));
00069 }
00070 }
00071
00072
00073 for (int i = begin; i < end; i++) {
00074 BOOST_FOREACH(const Sense &sense, senses[i]) {
00075 outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
00076 outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob));
00077 }
00078 }
00079
00080
00081 for (int i = end; i < std::min(end + m_size, inputLen); i++) {
00082 BOOST_FOREACH(const Sense &sense, senses[i]) {
00083 outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
00084 outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob));
00085 }
00086 }
00087 }
00088
00089 virtual void SetParameter(const std::string& key, const std::string& value) {
00090 if (key == "size") {
00091 m_size = Scan<size_t>(value);
00092 } else if (key == "lexicalized") {
00093 m_lexicalized = Scan<bool>(value);
00094 } else {
00095 VWFeatureSource::SetParameter(key, value);
00096 }
00097 }
00098
00099 private:
00100 static const int DEFAULT_WINDOW_SIZE = 3;
00101
00102 struct Sense {
00103 std::string m_label;
00104 float m_prob;
00105 };
00106
00107 typedef std::vector<Sense> WordSenses;
00108 typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses;
00109 typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms;
00110
00111 TLSSenses m_tlsSenses;
00112 TLSWordForms m_tlsForms;
00113
00114
00115 std::vector<Sense> GetSenses(const InputType &input, size_t pos) const {
00116 std::string w = GetWord(input, pos);
00117 std::vector<std::string> senseTokens = Tokenize(w, "^");
00118
00119 std::vector<Sense> out(senseTokens.size());
00120 for (size_t i = 0; i < senseTokens.size(); i++) {
00121 std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":");
00122 if (senseColumns.size() != 2) {
00123 UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]);
00124 }
00125 out[i].m_label = senseColumns[0];
00126 out[i].m_prob = Scan<float>(senseColumns[1]);
00127 }
00128
00129 return out;
00130 }
00131
00132
00133 inline std::string GetWordForm(const InputType &input, size_t pos) const {
00134 return input.GetWord(pos).GetString(0).as_string();
00135 }
00136
00137 bool m_lexicalized;
00138 int m_size;
00139 };
00140
00141 }