00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef MERT_FOREST_RESCORE_H
00020 #define MERT_FOREST_RESCORE_H
00021
#include <cstddef>
#include <cstring>
#include <valarray>
#include <vector>

#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>

#include "BleuScorer.h"
#include "Hypergraph.h"
00029
00030 namespace MosesTuning
00031 {
00032
00033 std::ostream& operator<<(std::ostream& out, const WordVec& wordVec);
00034
00035 struct NgramHash : public std::unary_function<const WordVec&, std::size_t> {
00036 std::size_t operator()(const WordVec& ngram) const {
00037 return util::MurmurHashNative(&(ngram[0]), ngram.size() * sizeof(WordVec::value_type));
00038 }
00039 };
00040
00041 struct NgramEquals : public std::binary_function<const WordVec&, const WordVec&, bool> {
00042 bool operator()(const WordVec& first, const WordVec& second) const {
00043 if (first.size() != second.size()) return false;
00044 return memcmp(&(first[0]), &(second[0]), first.size() * sizeof(WordVec::value_type)) == 0;
00045 }
00046 };
00047
00048 typedef boost::unordered_map<WordVec, size_t, NgramHash, NgramEquals> NgramCounter;
00049
00050
00051 class ReferenceSet
00052 {
00053
00054
00055 public:
00056
00057 void AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab);
00058
00059 void Load(const std::vector<std::string>& files, Vocab& vocab);
00060
00061 size_t NgramMatches(size_t sentenceId, const WordVec&, bool clip) const;
00062
00063 size_t Length(size_t sentenceId) const {
00064 return lengths_[sentenceId];
00065 }
00066
00067 private:
00068
00069 typedef boost::unordered_map<WordVec, std::pair<std::size_t,std::size_t>, NgramHash,NgramEquals> NgramMap;
00070 std::vector<NgramMap> ngramCounts_;
00071 std::vector<size_t> lengths_;
00072
00073 };
00074
00075 struct VertexState {
00076 VertexState();
00077
00078 std::vector<FeatureStatsType> bleuStats;
00079 WordVec leftContext;
00080 WordVec rightContext;
00081 size_t targetLength;
00082 };
00083
00087 class HgBleuScorer
00088 {
00089 public:
00090 HgBleuScorer(const ReferenceSet& references, const Graph& graph, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu):
00091 references_(references), sentenceId_(sentenceId), graph_(graph), backgroundBleu_(backgroundBleu),
00092 backgroundRefLength_(backgroundBleu[kBleuNgramOrder*2]) {
00093 vertexStates_.resize(graph.VertexSize());
00094 totalSourceLength_ = graph.GetVertex(graph.VertexSize()-1).SourceCovered();
00095 }
00096
00097 FeatureStatsType Score(const Edge& edge, const Vertex& head, std::vector<FeatureStatsType>& bleuStats) ;
00098
00099 void UpdateState(const Edge& winnerEdge, size_t vertexId, const std::vector<FeatureStatsType>& bleuStats);
00100
00101
00102 private:
00103 const ReferenceSet& references_;
00104 std::vector<VertexState> vertexStates_;
00105 size_t sentenceId_;
00106 size_t totalSourceLength_;
00107 const Graph& graph_;
00108 std::vector<FeatureStatsType> backgroundBleu_;
00109 FeatureStatsType backgroundRefLength_;
00110
00111 void UpdateMatches(const NgramCounter& counter, std::vector<FeatureStatsType>& bleuStats) const;
00112 size_t GetTargetLength(const Edge& edge) const;
00113 };
00114
00115 struct HgHypothesis {
00116 SparseVector featureVector;
00117 WordVec text;
00118 std::vector<FeatureStatsType> bleuStats;
00119 };
00120
00121 void Viterbi(const Graph& graph, const SparseVector& weights, float bleuWeight, const ReferenceSet& references, size_t sentenceId, const std::vector<FeatureStatsType>& backgroundBleu, HgHypothesis* bestHypo);
00122
00123 };
00124
00125 #endif