#include "TsgFilter.h"

#include "boost/scoped_ptr.hpp"

#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"

namespace MosesTraining
{
namespace Syntax
{
namespace FilterRuleTable
{

// Read a rule table from 'in' and filter it according to the test sentences.
//
// This involves testing TSG fragments for matches at potential match
// sites in the set of test parse trees / forests. There are a few
// optimizations that make this reasonably fast in practice:
//
// Optimization 1
// If a rule has the same TSG fragment as the previous rule then re-use the
// result of the previous filtering decision.
//
// Optimization 2
// Test if the TSG fragment contains any symbols that don't occur in the
// symbol vocabulary of the test set. If it does then the rule can be
// discarded.
//
// Optimization 3
// Prior to filtering, a map is constructed from each distinct test set tree /
// forest vertex symbol to the set of vertices having that symbol. During
// filtering, for each rule's TSG fragment the leaf with the smallest number of
// corresponding test nodes is determined. Matching is only attempted
// at those sites (this is done in MatchFragment, which has tree- and
// forest-specific implementations).
00038 // 00039 // Some statistics from real data (WMT14, English-German, tree-version): 00040 // 00041 // 4.4M Parallel sentences (source-side parsed with Berkeley parser) 00042 // 2.7K Test sentences (newstest2014) 00043 // 00044 // 73.4M Original rule table size (number of distinct, composed GHKM rules) 00045 // 22.9M Number of rules with same source-side as previous rule 00046 // 50.5M Number of rules requiring vocabulary matching test 00047 // 24.1M Number of rules requiring full tree matching test 00048 // 6.7M Number of rules retained after filtering 00049 // 00050 void TsgFilter::Filter(std::istream &in, std::ostream &out) 00051 { 00052 const util::MultiCharacter delimiter("|||"); 00053 00054 std::string line; 00055 std::string prevLine; 00056 StringPiece source; 00057 bool keep = true; 00058 int lineNum = 0; 00059 std::vector<TreeFragmentToken> tokens; 00060 std::vector<IdTree *> leaves; 00061 00062 while (std::getline(in, line)) { 00063 ++lineNum; 00064 00065 // Read the source-side of the rule. 00066 util::TokenIter<util::MultiCharacter> it(line, delimiter); 00067 00068 // Check if this rule has the same source-side as the previous rule. If 00069 // it does then we already know whether or not to keep the rule. This 00070 // optimisation is based on the assumption that the rule table is sorted 00071 // (which is the case in the standard Moses training pipeline). 00072 if (*it == source) { 00073 if (keep) { 00074 out << line << std::endl; 00075 } 00076 continue; 00077 } 00078 00079 // The source-side is different from the previous rule's. 00080 source = *it; 00081 00082 // Tokenize the source-side tree fragment. 00083 tokens.clear(); 00084 for (TreeFragmentTokenizer p(source); p != TreeFragmentTokenizer(); ++p) { 00085 tokens.push_back(*p); 00086 } 00087 00088 // Construct an IdTree representing the source-side tree fragment. 
This 00089 // will fail if the fragment contains any symbols that don't occur in 00090 // m_testVocab and in that case the rule can be discarded. In practice, 00091 // this catches a lot of discardable rules (see comment at the top of this 00092 // function). If the fragment is successfully created then we attempt to 00093 // match the tree fragment against the test trees. This test is exact, but 00094 // slow. 00095 int i = 0; 00096 leaves.clear(); 00097 boost::scoped_ptr<IdTree> fragment(BuildTree(tokens, i, leaves)); 00098 keep = fragment.get() && MatchFragment(*fragment, leaves); 00099 if (keep) { 00100 out << line << std::endl; 00101 } 00102 00103 // Retain line for the next iteration (in order that the source StringPiece 00104 // remains valid). 00105 prevLine.swap(line); 00106 } 00107 } 00108 00109 TsgFilter::IdTree *TsgFilter::BuildTree( 00110 const std::vector<TreeFragmentToken> &tokens, int &i, 00111 std::vector<IdTree *> &leaves) 00112 { 00113 // The subtree starting at tokens[i] is either: 00114 // 1. a single non-variable symbol (like NP or dog), or 00115 // 2. a variable symbol (like [NP]), or 00116 // 3. a subtree with children (like [NP [DT] [NN dog]]) 00117 00118 // First check for case 1. 00119 if (tokens[i].type == TreeFragmentToken_WORD) { 00120 Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value, 00121 StringPieceCompatibleHash(), 00122 StringPieceCompatibleEquals()); 00123 if (id == Vocabulary::NullId()) { 00124 return 0; 00125 } 00126 leaves.push_back(new IdTree(id)); 00127 return leaves.back(); 00128 } 00129 00130 // We must be dealing with either case 2 or 3. Case 2 looks like case 3 but 00131 // without the children. 00132 assert(tokens[i].type == TreeFragmentToken_LSB); 00133 00134 // Skip over the opening [ 00135 ++i; 00136 00137 // Read the root symbol of the subtree. 
00138 Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value, 00139 StringPieceCompatibleHash(), 00140 StringPieceCompatibleEquals()); 00141 if (id == Vocabulary::NullId()) { 00142 return 0; 00143 } 00144 IdTree *root = new IdTree(id); 00145 00146 // Read the children (in case 2 there won't be any). 00147 while (tokens[i].type != TreeFragmentToken_RSB) { 00148 IdTree *child = BuildTree(tokens, i, leaves); 00149 if (!child) { 00150 delete root; 00151 return 0; 00152 } 00153 root->children().push_back(child); 00154 child->parent() = root; 00155 } 00156 00157 if (root->IsLeaf()) { 00158 leaves.push_back(root); 00159 } 00160 00161 // Skip over the closing ] and we're done. 00162 ++i; 00163 return root; 00164 } 00165 00166 } // namespace FilterRuleTable 00167 } // namespace Syntax 00168 } // namespace MosesTraining