00001
00002
00003
00004
00005 #ifndef __ug_im_ttrack
00006 #define __ug_im_ttrack
00007
00008 #include <string>
00009 #include <iostream>
00010
00011 #include <boost/shared_ptr.hpp>
00012 #include <boost/unordered_map.hpp>
00013 #include <boost/foreach.hpp>
00014
00015 #include "tpt_typedefs.h"
00016 #include "tpt_tokenindex.h"
00017 #include "ug_ttrack_base.h"
00018 #include "tpt_tokenindex.h"
00019 #include "util/exception.hh"
00020 #include "moses/Util.h"
00021
00022
00023
00024 #define IMTTRACK_INCREMENT_SIZE 100000
00025 #define IMTSA_INCREMENT_SIZE 1000000
00026
00027 namespace sapt
00028 {
00029 namespace bio=boost::iostreams;
00030
00031 template<typename Token> class imTSA;
00032 template<typename Token> class imTtrack;
00033
00034 template<typename TOKEN>
00035 typename boost::shared_ptr<imTtrack<TOKEN> >
00036 append(typename boost::shared_ptr<imTtrack<TOKEN> > const & crp,
00037 std::vector<TOKEN> const & snt);
00038
00039 template<typename Token>
00040 class imTtrack : public Ttrack<Token>
00041 {
00042
00043 private:
00044 size_t numToks;
00045 boost::shared_ptr<typename std::vector<std::vector<Token> > > myData;
00046
00047 friend class imTSA<Token>;
00048
00049 friend
00050 typename boost::shared_ptr<imTtrack<Token> >
00051 append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, std::vector<Token> const & snt);
00052
00053 void m_check_token_count();
00054
00055 public:
00056
00057 imTtrack(boost::shared_ptr<std::vector<std::vector<Token> > > const& d);
00058 imTtrack(std::istream& in, TokenIndex& V, std::ostream* log = NULL);
00059 imTtrack(size_t reserve = 0);
00060
00061
00063 Token const* sntStart(size_t sid) const;
00064
00066 Token const* sntEnd(size_t sid) const;
00067
00068 size_t size() const;
00069 size_t numTokens() const;
00070
00071 id_type findSid(Token const* t) const;
00072
00073 };
00074
00075 template<typename Token>
00076 void
00077 imTtrack<Token>::
00078 m_check_token_count()
00079 {
00080 size_t check = 0;
00081 BOOST_FOREACH(std::vector<Token> const& s, *myData)
00082 check += s.size();
00083 UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]"
00084 << " Wrong token count after appending sentence!"
00085 << " Counted " << check << " but expected "
00086 << this->numToks << " in a total of " << myData->size()
00087 << " sentences.");
00088
00089 }
00090
00091 template<typename Token>
00092 Token const*
00093 imTtrack<Token>::
00094 sntStart(size_t sid) const
00095 {
00096 assert(sid < size());
00097 if ((*myData)[sid].size() == 0) return NULL;
00098 return &((*myData)[sid].front());
00099 }
00100
00101 template<typename Token>
00102 Token const*
00103 imTtrack<Token>::
00104 sntEnd(size_t sid) const
00105 {
00106 assert(sid < size());
00107 if ((*myData)[sid].size() == 0) return NULL;
00108 return &(*myData)[sid].back()+1;
00109 }
00110
00111 template<typename Token>
00112 size_t
00113 imTtrack<Token>::
00114 size() const
00115 {
00116
00117
00118
00119 return myData->size();
00120 }
00121
00122 template<typename Token>
00123 size_t
00124 imTtrack<Token>::
00125 numTokens() const
00126 {
00127 return numToks;
00128 }
00129
00130 template<typename Token>
00131 imTtrack<Token>::
00132 imTtrack(std::istream& in, TokenIndex& V, std::ostream* log)
00133 : numToks(0)
00134 {
00135 myData.reset(new std::vector<std::vector<Token> >());
00136 std::string line,w;
00137 size_t linectr=0;
00138 boost::unordered_map<std::string,id_type> H;
00139
00140
00141 while (getline(in,line))
00142 {
00143
00144 myData->push_back(std::vector<Token>());
00145 if (log && ++linectr%1000000==0)
00146 *log << linectr/1000000 << "M lines of input processed" << std::endl;
00147 std::istringstream buf(line);
00148
00149 while (buf>>w)
00150 {
00151 myData->back().push_back(Token(V[w]));
00152
00153
00154 }
00155
00156 numToks += myData->back().size();
00157 }
00158 }
00159
00160 template<typename Token>
00161 imTtrack<Token>::
00162 imTtrack(size_t reserve)
00163 : numToks(0)
00164 {
00165 myData.reset(new std::vector<std::vector<Token> >());
00166 if (reserve) myData->reserve(reserve);
00167 }
00168
00169 template<typename Token>
00170 imTtrack<Token>::
00171 imTtrack(boost::shared_ptr<std::vector<std::vector<Token> > > const& d)
00172 : numToks(0)
00173 {
00174 myData = d;
00175 BOOST_FOREACH(std::vector<Token> const& v, *d)
00176 numToks += v.size();
00177 }
00178
00179 template<typename Token>
00180 id_type
00181 imTtrack<Token>::
00182 findSid(Token const* t) const
00183 {
00184 id_type i;
00185 for (i = 0; i < myData->size(); ++i)
00186 {
00187 std::vector<Token> const& v = (*myData)[i];
00188 if (v.size() == 0) continue;
00189 if (&v.front() <= t && &v.back() >= t)
00190 break;
00191 }
00192 return i;
00193 }
00194
00196 template<typename TOKEN>
00197 boost::shared_ptr<imTtrack<TOKEN> >
00198 append(boost::shared_ptr<imTtrack<TOKEN> > const& crp, std::vector<TOKEN> const & snt)
00199 {
00200 #if 1
00201 if (crp) crp->m_check_token_count();
00202 #endif
00203 boost::shared_ptr<imTtrack<TOKEN> > ret;
00204 if (crp == NULL)
00205 {
00206 ret.reset(new imTtrack<TOKEN>());
00207 ret->myData->reserve(IMTTRACK_INCREMENT_SIZE);
00208 }
00209 else if (crp->myData->capacity() == crp->size())
00210 {
00211 ret.reset(new imTtrack<TOKEN>());
00212 ret->myData->reserve(crp->size() + IMTTRACK_INCREMENT_SIZE);
00213 copy(crp->myData->begin(),crp->myData->end(),ret->myData->begin());
00214 }
00215 else ret = crp;
00216 ret->myData->push_back(snt);
00217 ret->numToks += snt.size();
00218
00219 #if 1
00220 ret->m_check_token_count();
00221 #endif
00222 return ret;
00223 }
00224
00225 }
00226 #endif