00001
00002
00003
00004 #pragma once
00005 #define PROVIDES_RANKED_SAMPLING 0
00006
00007 #include <boost/thread.hpp>
00008 #include <boost/scoped_ptr.hpp>
00009 #include <boost/intrusive_ptr.hpp>
00010
00011 #include "moses/TypeDef.h"
00012 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
00013 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
00014 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00015 #include "moses/TranslationModel/UG/generic/threading/ug_thread_pool.h"
00016
00017 #include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
00018 #include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
00019 #include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
00020 #include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
00021 #include "moses/TranslationModel/UG/mm/ug_typedefs.h"
00022 #include "moses/TranslationModel/UG/mm/tpt_pickler.h"
00023 #include "moses/TranslationModel/UG/mm/ug_bitext.h"
00024 #include "moses/TranslationModel/UG/mm/ug_bitext_sampler.h"
00025 #include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
00026
00027 #include "moses/TranslationModel/UG/TargetPhraseCollectionCache.h"
00028
00029 #ifndef NO_MOSES
00030 #include "moses/FF/LexicalReordering/LexicalReordering.h"
00031 #endif
00032
00033 #include "moses/InputFileStream.h"
00034 #include "moses/FactorTypeSet.h"
00035 #include "moses/TargetPhrase.h"
00036 #include <boost/dynamic_bitset.hpp>
00037 #include "moses/TargetPhraseCollection.h"
00038 #include "util/usage.hh"
00039 #include <map>
00040
00041 #include "moses/TranslationModel/PhraseDictionary.h"
00042 #include "sapt_phrase_scorers.h"
00043
00044
00045
00046
00047
00048
00049
00050 namespace Moses
00051 {
// Mmsapt declares a phrase table that is backed by two bitexts (btfix and
// btdyn below), a set of phrase-scoring feature functions, and sampling /
// bias settings.  In Moses builds (NO_MOSES not defined) it derives from
// PhraseDictionary and exposes the phrase lookup entry points; with NO_MOSES
// defined it is a plain class without a base.
// Only declarations are given here; every member function is defined in a
// separate translation unit, so signatures must stay in sync with it.
// NOTE(review): the name presumably abbreviates "memory-mapped suffix array
// phrase table" -- confirm against the project documentation.
00052 class Mmsapt
00053 #ifndef NO_MOSES
00054 : public PhraseDictionary
00055 #endif
00056 {
00057 class TPCOllCache; // NOTE(review): spelled differently from the TPCollCache used for m_cache below and not referenced again in this class. Do not simply "fix" the spelling: a nested declaration named TPCollCache would hide a namespace-scope type of that name.
00058 friend class Alignment;
00059 std::map<std::string,std::string> param; // string-to-string map; presumably the parsed configuration (read_config_file() below fills a map of the same type) -- confirm
00060 std::string m_name; // presumably what GetName() returns -- confirm in the .cpp
00061 #ifndef NO_MOSES
00062 // PhraseDictionaryGroup gets friend access only in Moses builds.
00063 friend class PhraseDictionaryGroup;
00064 #endif
00065 public:
00066 typedef sapt::L2R_Token<sapt::SimpleWordId> Token; // token type shared by all bitext-related aliases below
00067 typedef sapt::mmBitext<Token> mmbitext; // type of btfix
00068 typedef sapt::imBitext<Token> imbitext; // type of btdyn
00069 typedef sapt::Bitext<Token> bitext;
00070 typedef sapt::TSA<Token> tsa;
00071 typedef sapt::PhraseScorer<Token> pscorer; // element type of the feature function registries
00072 private:
00073 // The two bitexts behind this table.  NOTE(review): the names suggest a fixed, memory-mapped corpus (btfix) and a dynamically extended in-memory one (btdyn) -- confirm.
00074 SPTR<mmbitext> btfix;
00075 SPTR<imbitext> btdyn;
00076 std::string m_bname, m_extra_data, m_bias_file,m_bias_server;
00077 std::string L1; // presumably the source language tag -- confirm
00078 std::string L2; // presumably the target language tag -- confirm
00079 float m_lbop_conf;
00080 float m_lex_alpha;
00081
00082 // Sampling, threading and bias-logging settings.
00083 size_t m_default_sample_size;
00084 size_t m_min_sample_size;
00085 size_t m_workers;
00086 std::vector<std::string> m_feature_set_names;
00087 std::string m_bias_logfile;
00088 boost::scoped_ptr<std::ofstream> m_bias_logger; // owning pointer to a file stream
00089 std::ostream* m_bias_log; // raw stream pointer; NOTE(review): presumably non-owning and may alias *m_bias_logger -- confirm
00090 int m_bias_loglevel;
00091 #ifndef NO_MOSES
00092 LexicalReordering* m_lr_func; // raw pointer, Moses builds only; ownership is not visible from this header
00093 #endif
00094 std::string m_lr_func_name;
00095 sapt::sampling_method m_sampling_method;
00096 boost::scoped_ptr<ug::ThreadPool> m_thread_pool;
00097 public:
00098 void* const bias_key; // the three *_key members are const pointers, so they must be set in the constructor's initializer list
00099 void* const cache_key; // NOTE(review): presumably opaque lookup keys for per-task/per-context storage -- confirm
00100 void* const context_key;
00101 private:
00102 boost::shared_ptr<sapt::SamplingBias> m_bias;
00103 boost::shared_ptr<TPCollCache> m_cache;
00104 size_t m_cache_size;
00105
00106
00107
00108
00109
00110
00111
00112 // Per-feature metadata.  NOTE(review): isLogVal(i) / isInteger(i) below presumably index m_is_logval / m_is_integer -- confirm in the .cpp.
00113 std::vector<std::string> m_feature_names;
00114 std::vector<bool> m_is_logval;
00115 std::vector<bool> m_is_integer;
00116 // Three registries of phrase scorers; going by the names: for btfix, for btdyn, and shared by both -- confirm.
00117 std::vector<SPTR<pscorer > > m_active_ff_fix;
00118 std::vector<SPTR<pscorer > > m_active_ff_dyn;
00119 std::vector<SPTR<pscorer > > m_active_ff_common;
00120
00121
00122 bool m_track_coord;
00123
00124 // Nested vectors of shared float vectors plus a list of sizes/ids; NOTE(review): exact indexing scheme is not visible here -- confirm.
00125 std::vector<std::vector<SPTR<std::vector<float> > > > m_sid_coord_list;
00126 std::vector<size_t> m_coord_spaces;
00127 // Writes into flist; key is taken by value.
00128 void
00129 parse_factor_spec(std::vector<FactorType>& flist, std::string const key);
00130 // Takes a scorer and the registry (one of the m_active_ff_* vectors, presumably) it belongs to.
00131 void
00132 register_ff(SPTR<pscorer> const& ff, std::vector<SPTR<pscorer> > & registry);
00133 // check_ff overload without extra argument; registry is optional and defaults to NULL.
00134 template<typename fftype>
00135 void
00136 check_ff(std::string const ffname,std::vector<SPTR<pscorer> >* registry = NULL);
00137
00138 // check_ff overload that additionally takes a float (xtra); registry again defaults to NULL.
00139 template<typename fftype>
00140 void
00141 check_ff(std::string const ffname, float const xtra,
00142 std::vector<SPTR<pscorer> >* registry = NULL);
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155 // Takes the same kind of argument (a std::string line) as the constructor; presumably called from it -- confirm.
00156 void init(std::string const& line);
00157 mutable boost::shared_mutex m_lock; // mutable, so it can also be locked inside const member functions
00158
00159
00160 bool withPbwd;
00161 bool poolCounts;
00162 std::vector<FactorType> m_ifactor, m_ofactor; // presumably input / output factor lists (cf. parse_factor_spec) -- confirm
00163
00164 void setup_local_feature_functions();
00165 void setup_bias(ttasksptr const& ttask);
00166 // PROVIDES_RANKED_SAMPLING is defined as 0 at the top of this header, so the declaration below is compiled out.
00167 #if PROVIDES_RANKED_SAMPLING
00168 void
00169 set_bias_for_ranking(ttasksptr const& ttask, SPTR<sapt::Bitext<Token> const> bt);
00170 #endif
00171 private: // redundant: access is already private at this point
00172 // fname is taken by value; results are written into the caller-supplied map.
00173 void read_config_file(std::string fname, std::map<std::string,std::string>& param);
00174
00175
00176 std::vector<float> feature_weights; // presumably set through setWeights() -- confirm
00177
00178 std::vector<std::vector<tpt::id_type> > wlex21;
00179
00180 typedef sapt::mm2dTable<tpt::id_type,tpt::id_type,uint32_t,uint32_t> mm2dtable_t;
00181 mm2dtable_t COOCraw;
00182 // Returns a raw TargetPhrase pointer; ownership of the result is not visible from this header.  fix / dyn are phrase pair pointers, presumably one per bitext and possibly NULL -- confirm.
00183 TargetPhrase*
00184 mkTPhrase(ttasksptr const& ttask,
00185 Phrase const& src,
00186 sapt::PhrasePair<Token>* fix,
00187 sapt::PhrasePair<Token>* dyn,
00188 SPTR<sapt::Bitext<Token> > const& dynbt) const;
00189 // Takes statistics (stats) for one source phrase from one bitext (bt) together with a target phrase collection handle (tpcoll).
00190 void
00191 process_pstats
00192 (Phrase const& src,
00193 uint64_t const pid1,
00194 sapt::pstats const& stats,
00195 sapt::Bitext<Token> const & bt,
00196 TargetPhraseCollection::shared_ptr tpcoll
00197 ) const;
00198 // pool_pstats / combine_pstats take statistics from two bitexts (a and b).  statsa is a non-const pointer while statsb points to const, so only statsa can be modified through these parameters.
00199 bool
00200 pool_pstats
00201 (Phrase const& src,
00202 uint64_t const pid1a, sapt::pstats * statsa, sapt::Bitext<Token> const & bta,
00203 uint64_t const pid1b, sapt::pstats const* statsb, sapt::Bitext<Token> const & btb,
00204 TargetPhraseCollection::shared_ptr tpcoll) const;
00205
00206 bool
00207 combine_pstats
00208 (Phrase const& src,
00209 uint64_t const pid1a, sapt::pstats* statsa, sapt::Bitext<Token> const & bta,
00210 uint64_t const pid1b, sapt::pstats const* statsb, sapt::Bitext<Token> const & btb,
00211 TargetPhraseCollection::shared_ptr tpcoll) const;
00212
00213 void load_extra_data(std::string bname, bool locking);
00214 void load_bias(std::string bname);
00215
00216 public:
00217 // Not declared explicit, so a std::string converts implicitly to Mmsapt.  NOTE(review): line is presumably the configuration line that init() parses -- confirm.
00218 Mmsapt(std::string const& line);
00219
00220 void Load(AllOptions::ptr const& opts);
00221 void Load(AllOptions::ptr const& opts, bool with_checks);
00222 size_t SetTableLimit(size_t limit);
00223 std::string const& GetName() const; // returns a reference, not a copy
00224 // Phrase lookup and chart-decoding entry points; declared only in Moses builds.
00225 #ifndef NO_MOSES
00226 TargetPhraseCollection::shared_ptr
00227 GetTargetPhraseCollectionLEGACY(ttasksptr const& ttask, const Phrase& src) const;
00228
00229
00230
00231
00232 void
00233 GetTargetPhraseCollectionBatch
00234 (ttasksptr const& ttask, InputPathList const& inputPathQueue) const;
00235 // Two CreateRuleLookupManager overloads; unlike the lookups above they are non-const and return a raw pointer.
00237 ChartRuleLookupManager*
00238 CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &);
00239
00240 ChartRuleLookupManager*
00241 CreateRuleLookupManager(const ChartParser &, const ChartCellCollectionBase &,
00242 std::size_t);
00243 #endif
00244 // Takes three strings; NOTE(review): presumably a source sentence, a target sentence and their word alignment to be added to btdyn -- confirm.
00245 void add(std::string const& s1, std::string const& s2, std::string const& a);
00246
00247
00248 void setWeights(std::vector<float> const& w);
00249
00250
00251
00252
00253
00254
00255
00256 bool ProvidesPrefixCheck() const;
00257
00258 bool PrefixExists(ttasksptr const& ttask, Phrase const& phrase) const;
00259
00260 bool isLogVal(int i) const; // i is a signed int; presumably a feature index -- confirm
00261 bool isInteger(int i) const;
00262
00263 // Per-task hooks (both take the translation task handle); non-const, so they may modify this object.
00264 void InitializeForInput(ttasksptr const& ttask);
00265
00266 void CleanUpAfterSentenceProcessing(ttasksptr const& ttask);
00267
00268 // Takes a source and a target string and returns a shared vector of ints; NOTE(review): encoding of the alignment in that vector is not visible here -- confirm.
00269 SPTR<std::vector<int> >
00270 align(std::string const& src, std::string const& trg) const;
00271
00272 std::vector<std::string> const&
00273 GetFeatureNames() const;
00274 // Takes a name-to-float map and returns a shared DocumentBias; const, so it does not modify this object's non-mutable state.
00275 SPTR<sapt::DocumentBias>
00276 setupDocumentBias(std::map<std::string,float> const& bias) const;
00277
00278 std::vector<float> DefaultWeights() const;
00279 };
00280 }
00281