Moses: /disk4/html/www/moses/doxygen/mosesdecoder/phrase-extract/filter-rule-table/StringCfgFilter.h Source File

00001 #pragma once
00002 
00003 #include <string>
00004 #include <vector>
00005 
00006 #include "syntax-common/numbered_set.h"
00007 
00008 #include <boost/shared_ptr.hpp>
00009 #include <boost/unordered_map.hpp>
00010 
00011 #include "util/string_piece.hh"
00012 #include "util/tokenize_piece.hh"
00013 
00014 #include "CfgFilter.h"
00015 
00016 namespace MosesTraining
00017 {
00018 namespace Syntax
00019 {
00020 namespace FilterRuleTable
00021 {
00022 
00023 // Filters a rule table, discarding rules that cannot be applied to a given
00024 // test set.  The rule table must have a CFG source-side and the test sentences
00025 // must be strings.
00026 class StringCfgFilter : public CfgFilter
00027 {
00028 public:
00029   // Initialize the filter for a given set of test sentences.
00030   StringCfgFilter(const std::vector<boost::shared_ptr<std::string> > &);
00031 
00032   void Filter(std::istream &in, std::ostream &out);
00033 
00034 private:
00035   // Filtering works by converting the source LHSs of translation rules to
00036   // patterns containing variable length gaps and then pattern matching
00037   // against the test set.
00038   //
00039   // The algorithm is vaguely similar to Algorithm 1 from Rahman et al. (2006),
00040   // but with a slightly different definition of a pattern and designed for a
00041   // text containing sentence boundaries.  Here the text is assumed to be
00042   // short (a few thousand sentences) and the number of patterns is assumed to
00043   // be large (tens of millions of rules).
00044   //
00045   //   M. Sohel Rahman, Costas S. Iliopoulos, Inbok Lee, Manal Mohamed, and
00046   //     William F. Smyth
00047   //   "Finding Patterns with Variable Length Gaps or Don't Cares"
00048   //   In proceedings of COCOON, 2006
00049 
00050   // Max NGram length.
00051   static const std::size_t kMaxNGramLength;
00052 
00053   // Maps words from strings to integers.
00054   typedef NumberedSet<std::string, std::size_t> Vocabulary;
00055 
00056   // A NGram is a sequence of words.
00057   typedef std::vector<Vocabulary::IdType> NGram;
00058 
00059   // A pattern is an alternating sequence of gaps and NGram subpatterns,
00060   // starting and ending with a gap.  Every gap has a minimum width, which
00061   // can be any integer >= 0 (a gap of width 0 is really a non-gap).
00062   //
00063   // The source LHSs of translation rules are converted to patterns where each
00064   // sequence of m consecutive non-terminals is converted to a gap with minimum
00065   // width m.  For example, if a rule has the source LHS:
00066   //
00067   //    [NP] and all the king 's men could n't [VB] [NP] together again
00068   //
00069   // and kMaxN is set to 5 then the following pattern is used:
00070   //
00071   //    * <and all the king 's> * <men could n't> * <together again> *
00072   //
00073   // where the gaps have minimum widths of 1, 0, 2, and 0.
00074   //
00075   struct Pattern {
00076     std::vector<NGram> subpatterns;
00077     std::vector<int> minGapWidths;
00078   };
00079 
00080   // A sorted (ascending) sequence of start positions.
00081   typedef std::vector<int> PositionSeq;
00082 
00083   // A range of start positions.
00084   typedef std::pair<int, int> Range;
00085 
00086   // A CoordinateTable records the set of sentences in which a single
00087   // n-gram occurs and for each of those sentences, the start positions
00088   struct CoordinateTable {
00089     // Sentences IDs (ascending).  This contains the same values as the key set
00090     // from intraSentencePositions but sorted into ascending order.
00091     std::vector<int> sentences;
00092     // Map from sentence ID to set of intra-sentence start positions.
00093     boost::unordered_map<int, PositionSeq> intraSentencePositions;
00094   };
00095 
00096   // NGramCoordinateMap is the main search structure.  It maps a NGram to
00097   // a CoordinateTable containing the positions that the n-gram occurs at
00098   // in the test set.
00099   typedef boost::unordered_map<NGram, CoordinateTable> NGramCoordinateMap;
00100 
00101   // Add all n-grams and coordinates for a single sentence s with index i.
00102   void AddSentenceNGrams(const std::vector<Vocabulary::IdType> &s,
00103                          std::size_t i);
00104 
00105   // Calculate the range of possible start positions for subpattern i+1
00106   // assuming that subpattern i has position x.
00107   Range CalcNextRange(const Pattern &p, int i, int x, int sentenceLength) const;
00108 
00109   // Generate the pattern corresponding to the given source-side of a rule.
00110   // This will fail if the rule's source-side contains any terminals that
00111   // do not occur in the test sentence vocabulary.
00112   bool GeneratePattern(const std::vector<StringPiece> &, Pattern &) const;
00113 
00114   // Calculate the minimum width of the pattern suffix starting
00115   // at subpattern i.
00116   int MinWidth(const Pattern &p, int i) const;
00117 
00118   bool IsNonTerminal(const StringPiece &symbol) const;
00119 
00120   // Try to match the pattern p against any sentence in the test set.
00121   bool MatchPattern(const Pattern &p) const;
00122 
00123   // Try to match the pattern p against the sentence with the given ID.
00124   bool MatchPattern(const Pattern &p,
00125                     std::vector<const CoordinateTable *> &tables,
00126                     int id) const;
00127 
00128   // The main search structure constructed from the test set sentences.
00129   NGramCoordinateMap m_ngramCoordinateMap;
00130 
00131   // The lengths of the test sentences.
00132   std::vector<int> m_sentenceLengths;
00133 
00134   // The maximum length of any test sentence.
00135   int m_maxSentenceLength;
00136 
00137   // The symbol vocabulary of the test sentences.
00138   Vocabulary m_testVocab;
00139 };
00140 
00141 }  // namespace FilterRuleTable
00142 }  // namespace Syntax
00143 }  // namespace MosesTraining