00001 #pragma once 00002 00003 #include <string> 00004 #include <vector> 00005 00006 #include "syntax-common/numbered_set.h" 00007 00008 #include <boost/shared_ptr.hpp> 00009 #include <boost/unordered_map.hpp> 00010 00011 #include "util/string_piece.hh" 00012 #include "util/tokenize_piece.hh" 00013 00014 #include "CfgFilter.h" 00015 00016 namespace MosesTraining 00017 { 00018 namespace Syntax 00019 { 00020 namespace FilterRuleTable 00021 { 00022 00023 // Filters a rule table, discarding rules that cannot be applied to a given 00024 // test set. The rule table must have a CFG source-side and the test sentences 00025 // must be strings. 00026 class StringCfgFilter : public CfgFilter 00027 { 00028 public: 00029 // Initialize the filter for a given set of test sentences. 00030 StringCfgFilter(const std::vector<boost::shared_ptr<std::string> > &); 00031 00032 void Filter(std::istream &in, std::ostream &out); 00033 00034 private: 00035 // Filtering works by converting the source LHSs of translation rules to 00036 // patterns containing variable length gaps and then pattern matching 00037 // against the test set. 00038 // 00039 // The algorithm is vaguely similar to Algorithm 1 from Rahman et al. (2006), 00040 // but with a slightly different definition of a pattern and designed for a 00041 // text containing sentence boundaries. Here the text is assumed to be 00042 // short (a few thousand sentences) and the number of patterns is assumed to 00043 // be large (tens of millions of rules). 00044 // 00045 // M. Sohel Rahman, Costas S. Iliopoulos, Inbok Lee, Manal Mohamed, and 00046 // William F. Smyth 00047 // "Finding Patterns with Variable Length Gaps or Don't Cares" 00048 // In proceedings of COCOON, 2006 00049 00050 // Max NGram length. 00051 static const std::size_t kMaxNGramLength; 00052 00053 // Maps words from strings to integers. 00054 typedef NumberedSet<std::string, std::size_t> Vocabulary; 00055 00056 // A NGram is a sequence of words. 00057 typedef std::vector<Vocabulary::IdType> NGram; 00058 00059 // A pattern is an alternating sequence of gaps and NGram subpatterns, 00060 // starting and ending with a gap. Every gap has a minimum width, which 00061 // can be any integer >= 0 (a gap of width 0 is really a non-gap). 00062 // 00063 // The source LHSs of translation rules are converted to patterns where each 00064 // sequence of m consecutive non-terminals is converted to a gap with minimum 00065 // width m. For example, if a rule has the source LHS: 00066 // 00067 // [NP] and all the king 's men could n't [VB] [NP] together again 00068 // 00069 // and kMaxN is set to 5 then the following pattern is used: 00070 // 00071 // * <and all the king 's> * <men could n't> * <together again> * 00072 // 00073 // where the gaps have minimum widths of 1, 0, 2, and 0. 00074 // 00075 struct Pattern { 00076 std::vector<NGram> subpatterns; 00077 std::vector<int> minGapWidths; 00078 }; 00079 00080 // A sorted (ascending) sequence of start positions. 00081 typedef std::vector<int> PositionSeq; 00082 00083 // A range of start positions. 00084 typedef std::pair<int, int> Range; 00085 00086 // A CoordinateTable records the set of sentences in which a single 00087 // n-gram occurs and for each of those sentences, the start positions 00088 struct CoordinateTable { 00089 // Sentences IDs (ascending). This contains the same values as the key set 00090 // from intraSentencePositions but sorted into ascending order. 00091 std::vector<int> sentences; 00092 // Map from sentence ID to set of intra-sentence start positions. 00093 boost::unordered_map<int, PositionSeq> intraSentencePositions; 00094 }; 00095 00096 // NGramCoordinateMap is the main search structure. It maps a NGram to 00097 // a CoordinateTable containing the positions that the n-gram occurs at 00098 // in the test set. 00099 typedef boost::unordered_map<NGram, CoordinateTable> NGramCoordinateMap; 00100 00101 // Add all n-grams and coordinates for a single sentence s with index i. 00102 void AddSentenceNGrams(const std::vector<Vocabulary::IdType> &s, 00103 std::size_t i); 00104 00105 // Calculate the range of possible start positions for subpattern i+1 00106 // assuming that subpattern i has position x. 00107 Range CalcNextRange(const Pattern &p, int i, int x, int sentenceLength) const; 00108 00109 // Generate the pattern corresponding to the given source-side of a rule. 00110 // This will fail if the rule's source-side contains any terminals that 00111 // do not occur in the test sentence vocabulary. 00112 bool GeneratePattern(const std::vector<StringPiece> &, Pattern &) const; 00113 00114 // Calculate the minimum width of the pattern suffix starting 00115 // at subpattern i. 00116 int MinWidth(const Pattern &p, int i) const; 00117 00118 bool IsNonTerminal(const StringPiece &symbol) const; 00119 00120 // Try to match the pattern p against any sentence in the test set. 00121 bool MatchPattern(const Pattern &p) const; 00122 00123 // Try to match the pattern p against the sentence with the given ID. 00124 bool MatchPattern(const Pattern &p, 00125 std::vector<const CoordinateTable *> &tables, 00126 int id) const; 00127 00128 // The main search structure constructed from the test set sentences. 00129 NGramCoordinateMap m_ngramCoordinateMap; 00130 00131 // The lengths of the test sentences. 00132 std::vector<int> m_sentenceLengths; 00133 00134 // The maximum length of any test sentence. 00135 int m_maxSentenceLength; 00136 00137 // The symbol vocabulary of the test sentences. 00138 Vocabulary m_testVocab; 00139 }; 00140 00141 } // namespace FilterRuleTable 00142 } // namespace Syntax 00143 } // namespace MosesTraining