#include "util/file_stream.hh"
#include "util/file_piece.hh"
#include "util/murmur_hash.hh"
#include "util/pool.hh"
#include "util/string_piece.hh"
#include "util/string_piece_hash.hh"
#include "util/tokenize_piece.hh"

#include <boost/unordered_map.hpp>
#include <boost/unordered_set.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
00014
00015 namespace {
00016
// Hash-set key wrapping a StringPiece.  The member is `mutable` so that
// InternString::Add can repoint an element already stored in a
// boost::unordered_set at pool-owned memory: the replacement text is
// byte-identical to the original, so the element's hash and equality are
// unchanged and the set invariants hold.
struct MutablePiece {
  mutable StringPiece behind;
  bool operator==(const MutablePiece &other) const {
    return behind == other.behind;
  }
};
00023
00024 std::size_t hash_value(const MutablePiece &m) {
00025 return hash_value(m.behind);
00026 }
00027
00028 class InternString {
00029 public:
00030 const char *Add(StringPiece str) {
00031 MutablePiece mut;
00032 mut.behind = str;
00033 std::pair<boost::unordered_set<MutablePiece>::iterator, bool> res(strs_.insert(mut));
00034 if (res.second) {
00035 void *mem = backing_.Allocate(str.size() + 1);
00036 memcpy(mem, str.data(), str.size());
00037 static_cast<char*>(mem)[str.size()] = 0;
00038 res.first->behind = StringPiece(static_cast<char*>(mem), str.size());
00039 }
00040 return res.first->behind.data();
00041 }
00042
00043 private:
00044 util::Pool backing_;
00045 boost::unordered_set<MutablePiece> strs_;
00046 };
00047
00048 class TargetWords {
00049 public:
00050 void Introduce(StringPiece source) {
00051 vocab_.resize(vocab_.size() + 1);
00052 std::vector<unsigned int> temp(1, vocab_.size() - 1);
00053 Add(temp, source);
00054 }
00055
00056 void Add(const std::vector<unsigned int> &sentences, StringPiece target) {
00057 if (sentences.empty()) return;
00058 interns_.clear();
00059 for (util::TokenIter<util::SingleCharacter, true> i(target, ' '); i; ++i) {
00060 interns_.push_back(intern_.Add(*i));
00061 }
00062 for (std::vector<unsigned int>::const_iterator i(sentences.begin()); i != sentences.end(); ++i) {
00063 boost::unordered_set<const char *> &vocab = vocab_[*i];
00064 for (std::vector<const char *>::const_iterator j = interns_.begin(); j != interns_.end(); ++j) {
00065 vocab.insert(*j);
00066 }
00067 }
00068 }
00069
00070 void Print() const {
00071 util::FileStream out(1);
00072 for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) {
00073 for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) {
00074 out << *j << ' ';
00075 }
00076 out << '\n';
00077 }
00078 }
00079
00080 private:
00081 InternString intern_;
00082
00083 std::vector<boost::unordered_set<const char *> > vocab_;
00084
00085
00086 std::vector<const char *> interns_;
00087 };
00088
// Indexes every phrase of 1..max_length consecutive tokens of the source
// text, mapping the phrase's 64-bit MurmurHash to the ids of sentences that
// contain it.
class Input {
  public:
    explicit Input(std::size_t max_length)
      : max_length_(max_length), sentence_id_(0), empty_() {}

    void AddSentence(StringPiece sentence, TargetWords &targets) {
      canonical_.clear();
      starts_.clear();
      starts_.push_back(0);
      // Canonicalize whitespace: each token is followed by exactly one
      // space, and starts_ records the byte offset where each token begins
      // (plus one final entry just past the last trailing space).
      for (util::TokenIter<util::AnyCharacter, true> i(sentence, StringPiece("\0 \t", 3)); i; ++i) {
        canonical_.append(i->data(), i->size());
        canonical_ += ' ';
        starts_.push_back(canonical_.size());
      }
      targets.Introduce(canonical_);
      // Hash every phrase of up to max_length_ tokens beginning at token i.
      // The length &canonical_[starts_[j]] - start - 1 spans tokens i..j-1
      // with single internal spaces; the -1 drops the trailing space.
      for (std::size_t i = 0; i < starts_.size() - 1; ++i) {
        std::size_t subtract = starts_[i];
        const char *start = &canonical_[subtract];
        for (std::size_t j = i + 1; j < std::min(starts_.size(), i + max_length_ + 1); ++j) {
          map_[util::MurmurHash64A(start, &canonical_[starts_[j]] - start - 1)].push_back(sentence_id_);
        }
      }
      ++sentence_id_;
    }

    // Ids of sentences containing the phrase (which must use canonical
    // single-space separation with no trailing space).  Returns a reference
    // to an empty vector when nothing matched.  An id may appear more than
    // once if the phrase occurs multiple times in that sentence.
    const std::vector<unsigned int> &Matches(StringPiece phrase) const {
      Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size()));
      return i == map_.end() ? empty_ : i->second;
    }

  private:
    const std::size_t max_length_;

    // phrase hash -> ids of sentences containing that phrase.
    typedef boost::unordered_map<uint64_t, std::vector<unsigned int> > Map;
    Map map_;

    std::size_t sentence_id_;

    // Scratch storage reused across AddSentence calls.
    std::string canonical_;
    std::vector<std::size_t> starts_;

    const std::vector<unsigned int> empty_;
};
00135
00136 }
00137
00138 int main(int argc, char *argv[]) {
00139 if (argc != 2) {
00140 std::cerr << "Expected source text on the command line" << std::endl;
00141 return 1;
00142 }
00143 Input input(7);
00144 TargetWords targets;
00145 try {
00146 util::FilePiece inputs(argv[1], &std::cerr);
00147 while (true)
00148 input.AddSentence(inputs.ReadLine(), targets);
00149 } catch (const util::EndOfFileException &e) {}
00150
00151 util::FilePiece table(0, NULL, &std::cerr);
00152 StringPiece line;
00153 const StringPiece pipes("|||");
00154 while (true) {
00155 try {
00156 line = table.ReadLine();
00157 } catch (const util::EndOfFileException &e) { break; }
00158 util::TokenIter<util::MultiCharacter> it(line, pipes);
00159 StringPiece source(*it);
00160 if (!source.empty() && source[source.size() - 1] == ' ')
00161 source.remove_suffix(1);
00162 targets.Add(input.Matches(source), *++it);
00163 }
00164 targets.Print();
00165 }