00001 #ifndef LM_BUILDER_PIPELINE_H 00002 #define LM_BUILDER_PIPELINE_H 00003 00004 #include "lm/builder/adjust_counts.hh" 00005 #include "lm/builder/initial_probabilities.hh" 00006 #include "lm/builder/header_info.hh" 00007 #include "lm/lm_exception.hh" 00008 #include "lm/word_index.hh" 00009 #include "util/stream/config.hh" 00010 #include "util/file_piece.hh" 00011 00012 #include <string> 00013 #include <cstddef> 00014 00015 namespace lm { namespace builder { 00016 00017 class Output; 00018 00019 struct PipelineConfig { 00020 std::size_t order; 00021 util::stream::SortConfig sort; 00022 InitialProbabilitiesConfig initial_probs; 00023 util::stream::ChainConfig read_backoffs; 00024 00025 // Estimated vocabulary size. Used for sizing CorpusCount memory and 00026 // initial probing hash table sizing, also in CorpusCount. 00027 lm::WordIndex vocab_estimate; 00028 00029 // Minimum block size to tolerate. 00030 std::size_t minimum_block; 00031 00032 // Number of blocks to use. This will be overridden to 1 if everything fits. 00033 std::size_t block_count; 00034 00035 // n-gram count thresholds for pruning. 0 values means no pruning for 00036 // corresponding n-gram order 00037 std::vector<uint64_t> prune_thresholds; //mjd 00038 bool prune_vocab; 00039 std::string prune_vocab_file; 00040 00041 /* Renumber the vocabulary the way the trie likes it? */ 00042 bool renumber_vocabulary; 00043 00044 // What to do with discount failures. 00045 DiscountConfig discount; 00046 00047 // Compute collapsed q values instead of probability and backoff 00048 bool output_q; 00049 00050 /* Computing the perplexity of LMs with different vocabularies is hard. For 00051 * example, the lowest perplexity is attained by a unigram model that 00052 * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly 00053 * interpolated models will sum to more than 1 because <unk> is duplicated 00054 * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to 00055 * 1 but comes with its own problems). This option will make the vocabulary 00056 * a particular size by replicating <unk> multiple times for purposes of 00057 * computing vocabulary size. It has no effect if the actual vocabulary is 00058 * larger. This parameter serves the same purpose as IRSTLM's "dub". 00059 */ 00060 uint64_t vocab_size_for_unk; 00061 00062 /* What to do the first time <s>, </s>, or <unk> appears in the input. If 00063 * this is anything but THROW_UP, then the symbol will always be treated as 00064 * whitespace. 00065 */ 00066 WarningAction disallowed_symbol_action; 00067 00068 const std::string &TempPrefix() const { return sort.temp_prefix; } 00069 std::size_t TotalMemory() const { return sort.total_memory; } 00070 }; 00071 00072 // Takes ownership of text_file and out_arpa. 00073 void Pipeline(PipelineConfig &config, int text_file, Output &output); 00074 00075 }} // namespaces 00076 #endif // LM_BUILDER_PIPELINE_H