#include "TsgFilter.h"

#include "boost/scoped_ptr.hpp"

#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"

namespace MosesTraining
{
namespace Syntax
{
namespace FilterRuleTable
{

// Read a rule table from 'in' and filter it according to the test sentences.
//
// This involves testing TSG fragments for matches at potential match
// sites in the set of test parse trees / forests. There are a few
// optimizations that make this reasonably fast in practice:
//
// Optimization 1
// If a rule has the same TSG fragment as the previous rule then re-use the
// result of the previous filtering decision.
//
// Optimization 2
// Test if the TSG fragment contains any symbols that don't occur in the
// symbol vocabulary of the test set. If it does then the rule can be
// discarded.
//
// Optimization 3
// Prior to filtering, a map is constructed from each distinct test set tree /
// forest vertex symbol to the set of vertices having that symbol. During
// filtering, for each rule's TSG fragment the leaf with the smallest number of
// corresponding test nodes is determined. Matching is only attempted
// at those sites (this is done in MatchFragment, which has tree- and
// forest-specific implementations).
00038 // 00039 // Some statistics from real data (WMT14, English-German, tree-version): 00040 // 00041 // 4.4M Parallel sentences (source-side parsed with Berkeley parser) 00042 // 2.7K Test sentences (newstest2014) 00043 // 00044 // 73.4M Original rule table size (number of distinct, composed GHKM rules) 00045 // 22.9M Number of rules with same source-side as previous rule 00046 // 50.5M Number of rules requiring vocabulary matching test 00047 // 24.1M Number of rules requiring full tree matching test 00048 // 6.7M Number of rules retained after filtering 00049 // 00050 void TsgFilter::Filter(std::istream &in, std::ostream &out) 00051 { 00052 const util::MultiCharacter delimiter("|||"); 00053 00054 std::string line; 00055 std::string prevLine; 00056 StringPiece source; 00057 bool keep = true; 00058 int lineNum = 0; 00059 std::vector<TreeFragmentToken> tokens; 00060 std::vector<IdTree *> leaves; 00061 00062 while (std::getline(in, line)) { 00063 ++lineNum; 00064 00065 // Read the source-side of the rule. 00066 util::TokenIter<util::MultiCharacter> it(line, delimiter); 00067 00068 // Check if this rule has the same source-side as the previous rule. If 00069 // it does then we already know whether or not to keep the rule. This 00070 // optimisation is based on the assumption that the rule table is sorted 00071 // (which is the case in the standard Moses training pipeline). 00072 if (*it == source) { 00073 if (keep) { 00074 out << line << std::endl; 00075 } 00076 continue; 00077 } 00078 00079 // The source-side is different from the previous rule's. 00080 source = *it; 00081 00082 // Tokenize the source-side tree fragment. 00083 tokens.clear(); 00084 for (TreeFragmentTokenizer p(source); p != TreeFragmentTokenizer(); ++p) { 00085 tokens.push_back(*p); 00086 } 00087 00088 // Construct an IdTree representing the source-side tree fragment. 
This 00089 // will fail if the fragment contains any symbols that don't occur in 00090 // m_testVocab and in that case the rule can be discarded. In practice, 00091 // this catches a lot of discardable rules (see comment at the top of this 00092 // function). If the fragment is successfully created then we attempt to 00093 // match the tree fragment against the test trees. This test is exact, but 00094 // slow. 00095 int i = 0; 00096 leaves.clear(); 00097 boost::scoped_ptr<IdTree> fragment(BuildTree(tokens, i, leaves)); 00098 keep = fragment.get() && MatchFragment(*fragment, leaves); 00099 if (keep) { 00100 out << line << std::endl; 00101 } 00102 00103 // Retain line for the next iteration (in order that the source StringPiece 00104 // remains valid). 00105 prevLine.swap(line); 00106 } 00107 } 00108 00109 TsgFilter::IdTree *TsgFilter::BuildTree( 00110 const std::vector<TreeFragmentToken> &tokens, int &i, 00111 std::vector<IdTree *> &leaves) 00112 { 00113 // The subtree starting at tokens[i] is either: 00114 // 1. a single non-variable symbol (like NP or dog), or 00115 // 2. a variable symbol (like [NP]), or 00116 // 3. a subtree with children (like [NP [DT] [NN dog]]) 00117 00118 // First check for case 1. 00119 if (tokens[i].type == TreeFragmentToken_WORD) { 00120 Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value, 00121 StringPieceCompatibleHash(), 00122 StringPieceCompatibleEquals()); 00123 if (id == Vocabulary::NullId()) { 00124 return 0; 00125 } 00126 leaves.push_back(new IdTree(id)); 00127 return leaves.back(); 00128 } 00129 00130 // We must be dealing with either case 2 or 3. Case 2 looks like case 3 but 00131 // without the children. 00132 assert(tokens[i].type == TreeFragmentToken_LSB); 00133 00134 // Skip over the opening [ 00135 ++i; 00136 00137 // Read the root symbol of the subtree. 
00138 Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value, 00139 StringPieceCompatibleHash(), 00140 StringPieceCompatibleEquals()); 00141 if (id == Vocabulary::NullId()) { 00142 return 0; 00143 } 00144 IdTree *root = new IdTree(id); 00145 00146 // Read the children (in case 2 there won't be any). 00147 while (tokens[i].type != TreeFragmentToken_RSB) { 00148 IdTree *child = BuildTree(tokens, i, leaves); 00149 if (!child) { 00150 delete root; 00151 return 0; 00152 } 00153 root->children().push_back(child); 00154 child->parent() = root; 00155 } 00156 00157 if (root->IsLeaf()) { 00158 leaves.push_back(root); 00159 } 00160 00161 // Skip over the closing ] and we're done. 00162 ++i; 00163 return root; 00164 } 00165 00166 } // namespace FilterRuleTable 00167 } // namespace Syntax 00168 } // namespace MosesTraining