00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
#pragma once
#include "tables-core.h"

#include <cstddef>
#include <iosfwd>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include <boost/unordered_map.hpp>
00027
00028 namespace MosesTraining
00029 {
00030
00031
// Word alignment stored as a vector of index sets: one std::set<size_t> per
// position of one side of the phrase pair.
// NOTE(review): the parameter name "targetToSourceAlignment" used with this
// type suggests the vector is indexed by target position and each set holds
// the aligned source positions -- confirm against the code that fills it.
typedef std::vector< std::set<size_t> > ALIGNMENT;
00033
00034
00035 class ExtractionPhrasePair
00036 {
00037
00038 protected:
00039
00040 typedef std::map<std::string,float> PROPERTY_VALUES;
00041 typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
00042
00043
00044 bool m_isValid;
00045
00046 const PHRASE *m_phraseSource;
00047 const PHRASE *m_phraseTarget;
00048
00049 float m_count;
00050 float m_pcfgSum;
00051
00052 std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
00053 std::map<std::string,
00054 std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
00055
00056 float m_lastCount;
00057 float m_lastPcfgSum;
00058 std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
00059
00060 public:
00061
00062 ExtractionPhrasePair( const PHRASE *phraseSource,
00063 const PHRASE *phraseTarget,
00064 ALIGNMENT *targetToSourceAlignment,
00065 float count, float pcfgSum );
00066
00067 ~ExtractionPhrasePair();
00068
00069 bool Add( ALIGNMENT *targetToSourceAlignment,
00070 float count, float pcfgSum );
00071
00072 void IncrementPrevious( float count, float pcfgSum );
00073
00074 bool Matches( const PHRASE *otherPhraseSource,
00075 const PHRASE *otherPhraseTarget,
00076 ALIGNMENT *otherTargetToSourceAlignment ) const;
00077
00078 bool Matches( const PHRASE *otherPhraseSource,
00079 const PHRASE *otherPhraseTarget,
00080 ALIGNMENT *otherTargetToSourceAlignment,
00081 bool &sourceMatch,
00082 bool &targetMatch,
00083 bool &alignmentMatch ) const;
00084
00085 bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
00086
00087 void Clear();
00088
00089 bool IsValid() const {
00090 return m_isValid;
00091 }
00092
00093
00094 const PHRASE *GetSource() const {
00095 return m_phraseSource;
00096 }
00097
00098 const PHRASE *GetTarget() const {
00099 return m_phraseTarget;
00100 }
00101
00102 float GetCount() const {
00103 return m_count;
00104 }
00105
00106 float GetPcfgScore() const {
00107 return m_pcfgSum;
00108 }
00109
00110 const size_t GetNumberOfProperties() const {
00111 return m_properties.size();
00112 }
00113
00114 const std::map<std::string,float> *GetProperty( const std::string &key ) const {
00115 std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
00116 iter = m_properties.find(key);
00117 if (iter == m_properties.end()) {
00118 return NULL;
00119 } else {
00120 return iter->second.first;
00121 }
00122 }
00123
00124 const ALIGNMENT *FindBestAlignmentTargetToSource() const;
00125
00126 const std::string *FindBestPropertyValue(const std::string &key) const;
00127
00128 std::string CollectAllPropertyValues(const std::string &key) const;
00129
00130 std::string CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
00131 std::set<std::string>& sourceLabelSet,
00132 boost::unordered_map<std::string,float>& sourceLHSCounts,
00133 boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
00134 Vocabulary &vcbT) const;
00135
00136 void CollectAllPhraseOrientations(const std::string &key,
00137 const std::vector<float> &orientationClassPriorsL2R,
00138 const std::vector<float> &orientationClassPriorsR2L,
00139 double smoothingFactor,
00140 std::ostream &out) const;
00141
00142 void UpdateVocabularyFromValueTokens(const std::string& propertyKey,
00143 std::set<std::string>& vocabulary) const;
00144
00145 void AddProperties(const std::string &str, float count);
00146
00147 void AddProperty(const std::string &key, const std::string &value, float count) {
00148 std::map<std::string,
00149 std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
00150 if ( iter == m_properties.end() ) {
00151
00152 PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
00153 std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
00154 LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
00155 m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
00156 } else {
00157 LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
00158 if ( (*lastPropertyValue)->first == value ) {
00159
00160 (*lastPropertyValue)->second += count;
00161 } else {
00162
00163
00164 PROPERTY_VALUES *propertyValues = (iter->second).first;
00165 std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
00166 if ( !insertedProperty.second ) {
00167 insertedProperty.first->second += count;
00168 }
00169 LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
00170 delete (iter->second).second;
00171 (iter->second).second = lastPropertyValue;
00172 }
00173 }
00174 }
00175
00176 };
00177
00178 }
00179