00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef __ug_mm_ttrack
00010 #define __ug_mm_ttrack
00011
00012 #include <sstream>
00013 #include <string>
00014 #include <stdexcept>
00015
00016 #include <boost/iostreams/device/mapped_file.hpp>
00017 #include <boost/shared_ptr.hpp>
00018
00019 #include "tpt_typedefs.h"
00020 #include "ug_typedefs.h"
00021 #include "tpt_tokenindex.h"
00022 #include "ug_ttrack_base.h"
00023 #include "num_read_write.h"
00024 #include "ug_load_primer.h"
00025 #include "ug_tsa_base.h"
00026
00027 namespace sapt
00028 {
00029 namespace bio=boost::iostreams;
00030
00031 template<typename TKN=id_type>
00032 class mmTtrack : public Ttrack<TKN>
00033 {
00034 public:
00035 typedef TKN Token;
00036
00037 private:
00038 id_type numSent;
00039 id_type numWords;
00040 bio::mapped_file_source file;
00041 Token const* data;
00042 id_type const* index;
00043
00044
00045 public:
00046 mmTtrack(std::string fname);
00047 mmTtrack();
00048
00049
00050 Token const* sntStart(size_t sid) const;
00051
00052
00053 Token const* sntEnd(size_t sid) const;
00054
00055
00056 size_t size() const;
00057
00058
00059 size_t numTokens() const;
00060
00061
00062 void open(std::string fname);
00063
00064
00065
00066 void write_blank_file_header(std::ostream& out) const;
00067
00068
00069 void write_index_and_finalize(std::ostream& out,
00070 std::vector<id_type> const& idx,
00071 count_type tokenCount) const;
00072
00073
00074
00075 id_type copySentences(std::ostream& trg, id_type start, id_type stop) const;
00076
00078 id_type findSid(TKN const* t) const;
00079
00080 id_type findSid(id_type tokenOffset) const;
00081
00083 void remap(std::string const fname, std::vector<id_type const*> const & f) const;
00084
00085 };
00086
00088 template<typename TKN>
00089 void
00090 mmTtrack<TKN>::
00091 remap(std::string const fname, std::vector<id_type const*> const & f) const
00092 {
00093 bio::mapped_file myfile(fname);
00094 assert(myfile.is_open());
00095 Moses::prime(myfile);
00096 tpt::filepos_type idxOffset;
00097 const char* p = myfile.data();
00098 id_type numSent,numWords;
00099 p = tpt::numread(p,idxOffset);
00100 p = tpt::numread(p,numSent);
00101 p = tpt::numread(p,numWords);
00102 data = reinterpret_cast<TKN*>(p);
00103 for (size_t i = 0; i < numWords; ++i)
00104 data[i] = data[i].remap(f);
00105 myfile.close();
00106 }
00107
00108
00109 template<typename TKN>
00110 size_t
00111 mmTtrack<TKN>::
00112 size() const
00113 {
00114 return this->numSent;
00115 }
00116
00117 template<typename TKN>
00118 size_t
00119 mmTtrack<TKN>::
00120 numTokens() const
00121 {
00122 return this->numWords;
00123 }
00124
00125 template<typename TKN>
00126 TKN const*
00127 mmTtrack<TKN>::
00128 sntStart(size_t sid) const
00129 {
00130 if (sid >= this->numSent)
00131 {
00132 std::cerr << "Fatal error: requested sentence #"
00133 << sid <<" is beyond corpus size ("
00134 << this->numSent <<")" << std::endl;
00135 }
00136 assert(sid < this->numSent);
00137 return data+index[sid];
00138 }
00139
00140 template<typename TKN>
00141 TKN const*
00142 mmTtrack<TKN>::
00143 sntEnd(size_t sid) const
00144 {
00145 assert(sid < this->numSent);
00146 return data+index[sid+1];
00147 }
00148
00149 template<typename TKN>
00150 mmTtrack<TKN>::
00151 mmTtrack()
00152 {
00153 data = NULL;
00154 index = NULL;
00155 this->numSent = this->numWords = 0;
00156 }
00157
00158 template<typename TKN>
00159 mmTtrack<TKN>::
00160 mmTtrack(std::string fname)
00161 {
00162 open(fname);
00163 }
00164
00165 template<typename TKN>
00166 void
00167 mmTtrack<TKN>::
00168 open(std::string fname)
00169 {
00170 if (access(fname.c_str(),F_OK))
00171 {
00172 std::ostringstream msg;
00173 msg << "mmTtrack<>::open: File '" << fname << "' does not exist.";
00174 throw std::runtime_error(msg.str().c_str());
00175 }
00176 file.open(fname);
00177 if (!file.is_open())
00178 {
00179 std::cerr << "Error opening file " << fname << std::endl;
00180 assert(0);
00181 }
00182 tpt::filepos_type idxOffset;
00183 char const* p = file.data();
00184 p = tpt::numread(p, idxOffset);
00185 p = tpt::numread(p,this->numSent);
00186 p = tpt::numread(p,this->numWords);
00187 data = reinterpret_cast<Token const*>(p);
00188 index = reinterpret_cast<id_type const*>(file.data()+idxOffset);
00189 }
00190
00191 template<typename TKN>
00192 id_type
00193 mmTtrack<TKN>::
00194 findSid(TKN const* t) const
00195 {
00196 id_type tokenPos = t-data;
00197 id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos);
00198 assert(p>index);
00199 return p-index-1;
00200 }
00201
00202 template<typename TKN>
00203 id_type
00204 mmTtrack<TKN>::
00205 findSid(id_type tokenPos) const
00206 {
00207 id_type const* p = std::upper_bound(index,index+this->numSent,tokenPos);
00208 assert(p>index);
00209 return p-index-1;
00210 }
00211
00212 template<typename TKN>
00213 void
00214 mmTtrack<TKN>::
00215 write_blank_file_header(std::ostream& out) const
00216 {
00217 tpt::numwrite(out,filepos_type(0));
00218 tpt::numwrite(out,id_type(0));
00219 tpt::numwrite(out,id_type(0));
00220 }
00221
00222 template<typename TKN>
00223 void
00224 mmTtrack<TKN>::
00225 write_index_and_finalize(std::ostream& out,
00226 std::vector<id_type>const& idx,
00227 id_type tokenCount) const
00228 {
00229 id_type idxSize = idx.size();
00230 tpt::filepos_type idxStart = out.tellp();
00231 for (size_t i = 0; i < idx.size(); ++i)
00232 tpt::numwrite(out,idx[i]);
00233 out.seekp(0);
00234 tpt::numwrite(out,idxStart);
00235 tpt::numwrite(out,idxSize-1);
00236 tpt::numwrite(out,tokenCount);
00237 }
00238
00239 template<typename TKN>
00240 id_type
00241 mmTtrack<TKN>::
00242 copySentences(std::ostream& trg, id_type start, id_type stop) const
00243 {
00244 assert(stop > start);
00245 TKN const* a = sntStart(start);
00246 TKN const* z = sntEnd(stop-1);
00247 size_t len = (z-a)*sizeof(TKN);
00248 if (!len) return 0;
00249 trg.write(reinterpret_cast<char const*>(a),len);
00250 return z-a;
00251 }
00252
00253 }
00254 #endif