00001 #ifndef LM_FILTER_VOCAB_H
00002 #define LM_FILTER_VOCAB_H
00003 
00004 
00005 
00006 #include "util/multi_intersection.hh"
00007 #include "util/string_piece.hh"
00008 #include "util/string_piece_hash.hh"
00009 #include "util/tokenize_piece.hh"
00010 
00011 #include <boost/noncopyable.hpp>
00012 #include <boost/range/iterator_range.hpp>
00013 #include <boost/unordered/unordered_map.hpp>
00014 #include <boost/unordered/unordered_set.hpp>
00015 
00016 #include <string>
00017 #include <vector>
00018 
00019 namespace lm {
00020 namespace vocab {
00021 
// Declared here, defined elsewhere.  Judging by the signature, this reads
// vocabulary words from `in` and stores them in `out`, the same set type that
// Single::Words names below.
// NOTE(review): the expected input format is not visible from this header --
// confirm against the implementation.
void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out);
00023 
00024 
// Declared here, defined elsewhere.  Judging by the signature, this reads from
// `in` and fills `out`, a map from word to a list of unsigned ids; the map has
// the same type as Union::Words and Multiple::Words below.
// NOTE(review): what the ids identify and what the returned count means are
// not visible from this header -- confirm against the implementation.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out);
00026 
00027 
00028 
00029 
00030 
00031 inline bool IsTag(const StringPiece &value) {
00032   
00033   assert(!value.empty());
00034   return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>');
00035 }
00036 
00037 class Single {
00038   public:
00039     typedef boost::unordered_set<std::string> Words;
00040 
00041     explicit Single(const Words &vocab) : vocab_(vocab) {}
00042 
00043     template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) {
00044       for (Iterator i = begin; i != end; ++i) {
00045         if (IsTag(*i)) continue;
00046         if (FindStringPiece(vocab_, *i) == vocab_.end()) return false;
00047       }
00048       return true;
00049     }
00050 
00051   private:
00052     const Words &vocab_;
00053 };
00054 
00055 class Union {
00056   public:
00057     typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words;
00058 
00059     explicit Union(const Words &vocabs) : vocabs_(vocabs) {}
00060 
00061     template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) {
00062       sets_.clear();
00063 
00064       for (Iterator i(begin); i != end; ++i) {
00065         if (IsTag(*i)) continue;
00066         Words::const_iterator found(FindStringPiece(vocabs_, *i));
00067         if (vocabs_.end() == found) return false;
00068         sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end()));
00069       }
00070       return (sets_.empty() || util::FirstIntersection(sets_));
00071     }
00072 
00073   private:
00074     const Words &vocabs_;
00075 
00076     std::vector<boost::iterator_range<const unsigned int*> > sets_;
00077 };
00078 
00079 class Multiple {
00080   public:
00081     typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words;
00082 
00083     Multiple(const Words &vocabs) : vocabs_(vocabs) {}
00084 
00085   private:
00086     
00087     template <class Output> class Callback {
00088       public:
00089         Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {}
00090 
00091         void operator()(unsigned int index) {
00092           out_.SingleAddNGram(index, line_);
00093         }
00094 
00095       private:
00096         Output &out_;
00097         const StringPiece &line_;
00098     };
00099 
00100   public:
00101     template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
00102       sets_.clear();
00103       for (Iterator i(begin); i != end; ++i) {
00104         if (IsTag(*i)) continue;
00105         Words::const_iterator found(FindStringPiece(vocabs_, *i));
00106         if (vocabs_.end() == found) return;
00107         sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end()));
00108       }
00109       if (sets_.empty()) {
00110         output.AddNGram(line);
00111         return;
00112       }
00113 
00114       Callback<Output> cb(output, line);
00115       util::AllIntersection(sets_, cb);
00116     }
00117 
00118     template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
00119       AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output);
00120     }
00121 
00122     void Flush() const {}
00123 
00124   private:
00125     const Words &vocabs_;
00126 
00127     std::vector<boost::iterator_range<const unsigned int*> > sets_;
00128 };
00129 
00130 } 
00131 } 
00132 
00133 #endif // LM_FILTER_VOCAB_H