00001 #pragma once 00002 00003 #include <istream> 00004 #include <ostream> 00005 #include <string> 00006 #include <vector> 00007 00008 #include "syntax-common/numbered_set.h" 00009 #include "syntax-common/tree.h" 00010 #include "syntax-common/tree_fragment_tokenizer.h" 00011 00012 namespace MosesTraining 00013 { 00014 namespace Syntax 00015 { 00016 namespace FilterRuleTable 00017 { 00018 00019 // Base class for TreeTsgFilter and ForestTsgFilter, both of which filter rule 00020 // tables where the source-side is TSG. 00021 class TsgFilter 00022 { 00023 public: 00024 virtual ~TsgFilter() {} 00025 00026 // Read a rule table from 'in' and filter it according to the test sentences. 00027 void Filter(std::istream &in, std::ostream &out); 00028 00029 protected: 00030 // Maps symbols (terminals and non-terminals) from strings to integers. 00031 typedef NumberedSet<std::string, std::size_t> Vocabulary; 00032 00033 // Represents a tree using integer vocabulary values. 00034 typedef Tree<Vocabulary::IdType> IdTree; 00035 00036 // Build an IdTree (wrt m_testVocab) for the tree beginning at position i of 00037 // the token sequence or return 0 if any symbol in the fragment is not in 00038 // m_testVocab. If successful then on return, i will be set to the position 00039 // immediately after the last token of the tree and leaves will contain the 00040 // pointers to the fragment's leaves. If the build fails then i and leaves 00041 // are undefined. 00042 IdTree *BuildTree(const std::vector<TreeFragmentToken> &tokens, int &i, 00043 std::vector<IdTree *> &leaves); 00044 00045 // Try to match a fragment. The implementation depends on whether the test 00046 // sentences are trees or forests. 00047 virtual bool MatchFragment(const IdTree &, const std::vector<IdTree *> &) = 0; 00048 00049 // The symbol vocabulary of the test sentences. 00050 Vocabulary m_testVocab; 00051 }; 00052 00053 } // namespace FilterRuleTable 00054 } // namespace Syntax 00055 } // namespace MosesTraining