00001 #ifndef LM_FILTER_PHRASE_H
00002 #define LM_FILTER_PHRASE_H
00003 
00004 #include "util/murmur_hash.hh"
00005 #include "util/string_piece.hh"
00006 #include "util/tokenize_piece.hh"
00007 
00008 #include <boost/unordered_map.hpp>
00009 
00010 #include <iosfwd>
00011 #include <vector>
00012 
00013 #define LM_FILTER_PHRASE_METHOD(caps, lower) \
00014 bool Find##caps(Hash key, const std::vector<unsigned int> *&out) const {\
00015   Table::const_iterator i(table_.find(key));\
00016   if (i==table_.end()) return false; \
00017   out = &i->second.lower; \
00018   return true; \
00019 }
00020 
00021 namespace lm {
00022 namespace phrase {
00023 
00024 typedef uint64_t Hash;
00025 
00026 class Substrings {
00027   private:
00028     
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036     struct SentenceRelation {
00037       std::vector<unsigned int> substring, left, right, phrase;
00038     };
00039     
00040 
00041 
00042 
00043 
00044 
00045     typedef boost::unordered_map<Hash, SentenceRelation> Table;
00046 
00047   public:
00048     Substrings() {}
00049 
00050     
00051 
00052 
00053 
00054 
00055     LM_FILTER_PHRASE_METHOD(Substring, substring)
00056     LM_FILTER_PHRASE_METHOD(Left, left)
00057     LM_FILTER_PHRASE_METHOD(Right, right)
00058     LM_FILTER_PHRASE_METHOD(Phrase, phrase)
00059 
00060 #pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
00061     
00062     template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
00063       
00064       for (Iterator start = begin; start != end; ++start) {
00065         Hash hash = 0;
00066         SentenceRelation *relation;
00067         for (Iterator finish = start; finish != end; ++finish) {
00068           hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish);
00069           
00070           relation = &table_[hash];
00071           AppendSentence(relation->substring, sentence_id);
00072           if (start == begin) AppendSentence(relation->left, sentence_id);
00073         }
00074         AppendSentence(relation->right, sentence_id);
00075         if (start == begin) AppendSentence(relation->phrase, sentence_id);
00076       }
00077     }
00078 
00079   private:
00080     void AppendSentence(std::vector<unsigned int> &vec, unsigned int sentence_id) {
00081       if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id);
00082     }
00083 
00084     Table table_;
00085 };
00086 
00087 
00088 
00089 unsigned int ReadMultiple(std::istream &in, Substrings &out);
00090 
00091 namespace detail {
00092 extern const StringPiece kEndSentence;
00093 
00094 template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
00095   hashes.clear();
00096   if (i == end) return;
00097   
00098   if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
00099     ++i;
00100   }
00101   for (; i != end && (*i != kEndSentence); ++i) {
00102     hashes.push_back(util::MurmurHashNative(i->data(), i->size()));
00103   }
00104 }
00105 
00106 class Vertex;
00107 class Arc;
00108 
00109 class ConditionCommon {
00110   protected:
00111     ConditionCommon(const Substrings &substrings);
00112     ConditionCommon(const ConditionCommon &from);
00113 
00114     ~ConditionCommon();
00115 
00116     detail::Vertex &MakeGraph();
00117 
00118     
00119     std::vector<Hash> hashes_;
00120 
00121   private:
00122     std::vector<detail::Vertex> vertices_;
00123     std::vector<detail::Arc> arcs_;
00124 
00125     const Substrings &substrings_;
00126 };
00127 
00128 } 
00129 
00130 class Union : public detail::ConditionCommon {
00131   public:
00132     explicit Union(const Substrings &substrings) : detail::ConditionCommon(substrings) {}
00133 
00134     template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) {
00135       detail::MakeHashes(begin, end, hashes_);
00136       return hashes_.empty() || Evaluate();
00137     }
00138 
00139   private:
00140     bool Evaluate();
00141 };
00142 
00143 class Multiple : public detail::ConditionCommon {
00144   public:
00145     explicit Multiple(const Substrings &substrings) : detail::ConditionCommon(substrings) {}
00146 
00147     template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
00148       detail::MakeHashes(begin, end, hashes_);
00149       if (hashes_.empty()) {
00150         output.AddNGram(line);
00151       } else {
00152         Evaluate(line, output);
00153       }
00154     }
00155 
00156     template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
00157       AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output);
00158     }
00159 
00160     void Flush() const {}
00161 
00162   private:
00163     template <class Output> void Evaluate(const StringPiece &line, Output &output);
00164 };
00165 
00166 } 
00167 } 
00168 #endif // LM_FILTER_PHRASE_H