Moses: /disk4/html/www/moses/doxygen/mosesdecoder/lm/builder/pipeline.hh Source File

00001 #ifndef LM_BUILDER_PIPELINE_H
00002 #define LM_BUILDER_PIPELINE_H
00003 
00004 #include "lm/builder/adjust_counts.hh"
00005 #include "lm/builder/initial_probabilities.hh"
00006 #include "lm/builder/header_info.hh"
00007 #include "lm/lm_exception.hh"
00008 #include "lm/word_index.hh"
00009 #include "util/stream/config.hh"
00010 #include "util/file_piece.hh"
00011 
00012 #include <string>
00013 #include <cstddef>
00014 
00015 namespace lm { namespace builder {
00016 
00017 class Output;  // Forward declaration; this header only refers to Output by reference.
00018 
00019 struct PipelineConfig {  // Settings bundle handed to Pipeline() (declared below).
00020   std::size_t order;  // presumably the n-gram order of the model being built -- TODO confirm
00021   util::stream::SortConfig sort;  // Also the source of TempPrefix() and TotalMemory() below.
00022   InitialProbabilitiesConfig initial_probs;
00023   util::stream::ChainConfig read_backoffs;
00024 
00025   // Estimated vocabulary size.  Used for sizing CorpusCount memory and
00026   // initial probing hash table sizing, also in CorpusCount.
00027   lm::WordIndex vocab_estimate;
00028 
00029   // Minimum block size to tolerate.
00030   std::size_t minimum_block;
00031 
00032   // Number of blocks to use.  This will be overridden to 1 if everything fits.
00033   std::size_t block_count;
00034 
00035   // n-gram count thresholds for pruning.  A value of 0 means no pruning for
00036   // the corresponding n-gram order.
00037   std::vector<uint64_t> prune_thresholds; // mjd.  NOTE(review): <vector> and a header for uint64_t are not included directly by this file.
00038   bool prune_vocab;  // presumably enables vocabulary pruning -- TODO confirm
00039   std::string prune_vocab_file;  // presumably the vocabulary file consulted when prune_vocab is set -- TODO confirm
00040 
00041   /* Renumber the vocabulary the way the trie likes it? */
00042   bool renumber_vocabulary;
00043 
00044   // What to do with discount failures.
00045   DiscountConfig discount;
00046 
00047   // Compute collapsed q values instead of probability and backoff
00048   bool output_q;
00049 
00050   /* Computing the perplexity of LMs with different vocabularies is hard.  For
00051    * example, the lowest perplexity is attained by a unigram model that
00052    * predicts p(<unk>) = 1 and has no other vocabulary.  Also, linearly
00053    * interpolated models will sum to more than 1 because <unk> is duplicated
00054    * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
00055    * 1 but comes with its own problems).  This option will make the vocabulary
00056    * a particular size by replicating <unk> multiple times for purposes of
00057    * computing vocabulary size.  It has no effect if the actual vocabulary is
00058    * larger.  This parameter serves the same purpose as IRSTLM's "dub".
00059    */
00060   uint64_t vocab_size_for_unk;
00061 
00062   /* What to do the first time <s>, </s>, or <unk> appears in the input.  If
00063    * this is anything but THROW_UP, then the symbol will always be treated as
00064    * whitespace.
00065    */
00066   WarningAction disallowed_symbol_action;
00067   // Convenience accessors that forward to fields of the sort configuration.
00068   const std::string &TempPrefix() const { return sort.temp_prefix; }  // Reference into sort; valid only while this config object lives.
00069   std::size_t TotalMemory() const { return sort.total_memory; }  // Returned by value.
00070 };
00071 
00072 // Takes ownership of text_file.  config and output are passed by non-const reference.  NOTE(review): whether any ownership of output is assumed is not visible here -- confirm.
00073 void Pipeline(PipelineConfig &config, int text_file, Output &output);
00074 
00075 }} // namespaces
00076 #endif // LM_BUILDER_PIPELINE_H