00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef __ugTokenIndex_hh
00011 #define __ugTokenIndex_hh
#include <stdint.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/thread.hpp>

#include "tpt_typedefs.h"
00023
00024 namespace bio=boost::iostreams;
00025
00026 namespace sapt
00027 {
00028 class TokenIndex
00029 {
00030 typedef tpt::id_type id_type;
00032 mutable std::vector<char const*> ridx;
00034 std::string unkLabel;
00035 id_type unkId,numTokens;
00036
00038 boost::scoped_ptr<boost::mutex> lock;
00039
00040
00041 bool dynamic;
00042 boost::shared_ptr<std::map<std::string, tpt::id_type> > str2idExtra;
00043 boost::shared_ptr<std::vector<std::string> > newWords;
00044
00045
00046
00047
00048
00049
00050
00051 public:
00053 class Entry
00054 {
00055 public:
00056 uint32_t offset;
00057 id_type id;
00058 };
00059
00061 class CompFunc
00062 {
00063 public:
00064 char const* base;
00065 CompFunc();
00066 bool operator()(Entry const& A, char const* w);
00067 };
00068
00069 bio::mapped_file_source file;
00070 Entry const* startIdx;
00071 Entry const* endIdx;
00072 CompFunc comp;
00073 TokenIndex(std::string unkToken="UNK");
00074
00075 void open(std::string fname,std::string unkToken="UNK",bool dyna=false);
00076 void close();
00077
00078 id_type operator[](char const* w) const;
00079 id_type operator[](std::string const& w) const;
00080 char const* const operator[](id_type id) const;
00081 char const* const operator[](id_type id);
00082 std::vector<char const*> reverseIndex() const;
00083
00084 std::string toString(std::vector<id_type> const& v);
00085 std::string toString(std::vector<id_type> const& v) const;
00086
00087 std::string toString(id_type const* start, id_type const* const stop);
00088 std::string toString(id_type const* start, id_type const* const stop) const;
00089
00090 std::vector<id_type> toIdSeq(std::string const& line) const;
00091
00092 bool fillIdSeq(std::string const& line, std::vector<id_type> & v) const;
00093
00094 void iniReverseIndex();
00095 id_type getNumTokens() const;
00096 id_type getUnkId() const;
00097
00098
00099 id_type knownVocabSize() const;
00100 id_type totalVocabSize() const;
00101
00102 id_type ksize() const;
00103 id_type tsize() const;
00104
00105
00106 char const* const getUnkToken() const;
00107
00108 void write(std::string fname);
00109 bool isDynamic() const;
00110 bool setDynamic(bool onoff);
00111
00112 void setUnkLabel(std::string unk);
00113 };
00114
/// Write a token index to a file (defined in the implementation file).
///
/// @param tok      (token string, 32-bit id) pairs; mkTokenIndex() passes
///                 them sorted by token string
/// @param ofile    name of the output file
/// @param unkToken label of the unknown-word token
void write_tokenindex_to_disk(
    std::vector<std::pair<std::string,uint32_t> > const& tok,
    std::string const& ofile,
    std::string const& unkToken);
00118
/// Ordering predicate over (word, count) pairs.
///
/// The unknown-word label always sorts after every other word. All other
/// words are ordered by descending count, and equal counts are ordered by
/// ascending string comparison of the words.
class compWords
{
  std::string unkLabel_;  ///< word that must sort last
public:
  compWords(std::string _unk) : unkLabel_(_unk) {}

  /// @return true iff A must be placed before B
  bool
  operator()(std::pair<std::string,size_t> const& A,
             std::pair<std::string,size_t> const& B) const
  {
    // The unknown word never precedes anything (including itself) ...
    if (A.first == unkLabel_) return false;
    // ... and every other word precedes it.
    if (B.first == unkLabel_) return true;
    // Higher counts first; fall back to the words themselves on a tie.
    return (A.second != B.second) ? (A.second > B.second)
                                  : (A.first < B.first);
  }
};
00137
00138 template<class MYMAP>
00139 void
00140 mkTokenIndex(std::string ofile,MYMAP const& M,std::string unkToken)
00141 {
00142
00143 typedef std::pair<std::string,uint32_t> Token;
00144
00145
00146
00147
00148 std::vector<std::pair<std::string,size_t> > wcounts(M.size());
00149 typedef typename MYMAP::const_iterator myIter;
00150 size_t z=0;
00151 for (myIter m = M.begin(); m != M.end(); m++)
00152 {
00153
00154 wcounts[z++] = std::pair<std::string,size_t>(m->first,m->second);
00155 }
00156 compWords compFunc(unkToken);
00157 sort(wcounts.begin(),wcounts.end(),compFunc);
00158
00159
00160 std::vector<Token> tok(wcounts.size());
00161 for (size_t i = 0; i < wcounts.size(); i++)
00162 tok[i] = Token(wcounts[i].first,i);
00163
00164 sort(tok.begin(),tok.end());
00165 write_tokenindex_to_disk(tok,ofile,unkToken);
00166 }
00167
00168 template<typename Token>
00169 void
00170 fill_token_seq(TokenIndex& V, std::string const& line, std::vector<Token>& dest)
00171 {
00172 std::istringstream buf(line); std::string w;
00173 while (buf>>w) dest.push_back(Token(V[w]));
00174 }
00175 }
00176 #endif