00001 #include "PrefixTreeMap.h"
00002 #include "TypeDef.h"
00003
00004 #ifdef WITH_THREADS
00005 #include <boost/thread.hpp>
00006 #endif
00007
00008 using namespace std;
00009
00010 namespace Moses
00011 {
00012 void GenericCandidate::readBin(FILE* f)
00013 {
00014 m_PhraseList.clear();
00015 m_ScoreList.clear();
00016 uint32_t num_phrases;
00017 fRead(f, num_phrases);
00018 for(unsigned int i = 0; i < num_phrases; ++i) {
00019 IPhrase phrase;
00020 fReadVector(f, phrase);
00021 m_PhraseList.push_back(phrase);
00022 };
00023 uint32_t num_scores;
00024 fRead(f, num_scores);
00025 for(unsigned int j = 0; j < num_scores; ++j) {
00026 std::vector<float> score;
00027 fReadVector(f, score);
00028 m_ScoreList.push_back(score);
00029 };
00030 };
00031
00032 void GenericCandidate::writeBin(FILE* f) const
00033 {
00034
00035 fWrite(f, static_cast<uint32_t>(m_PhraseList.size()));
00036 for(size_t i = 0; i < m_PhraseList.size(); ++i) {
00037 fWriteVector(f, m_PhraseList[i]);
00038 }
00039 fWrite(f, static_cast<uint32_t>(m_ScoreList.size()));
00040 for(size_t j = 0; j < m_ScoreList.size(); ++j) {
00041 fWriteVector(f, m_ScoreList[j]);
00042 }
00043 };
00044
00045
00046 void Candidates::writeBin(FILE* f) const
00047 {
00048 uint32_t s = this->size();
00049 fWrite(f,s);
00050 for(size_t i = 0; i < s; ++i) {
00051 MyBase::operator[](i).writeBin(f);
00052 }
00053 }
00054
00055 void Candidates::readBin(FILE* f)
00056 {
00057 uint32_t s;
00058 fRead(f,s);
00059 this->resize(s);
00060 for(size_t i = 0; i<s; ++i) {
00061 MyBase::operator[](i).readBin(f);
00062 }
00063 }
00064
00065 const LabelId PrefixTreeMap::MagicWord = std::numeric_limits<LabelId>::max() - 1;
00066
00068 PrefixTreeMap::~PrefixTreeMap()
00069 {
00070 if(m_FileSrc) {
00071 fClose(m_FileSrc);
00072 }
00073 if(m_FileTgt) {
00074 fClose(m_FileTgt);
00075 }
00076 FreeMemory();
00077 }
00078
00079
00080 void PrefixTreeMap::FreeMemory()
00081 {
00082 for(Data::iterator i = m_Data.begin(); i != m_Data.end(); ++i) {
00083 i->free();
00084 }
00085
00086
00087
00088
00089 m_PtrPool.reset();
00090 }
00091
00092 WordVoc &ReadVoc(std::map<std::string,WordVoc> &vocs, const std::string& filename)
00093 {
00094 #ifdef WITH_THREADS
00095 boost::mutex mutex;
00096 boost::mutex::scoped_lock lock(mutex);
00097 #endif
00098 std::map<std::string,WordVoc>::iterator vi = vocs.find(filename);
00099 if (vi == vocs.end()) {
00100 WordVoc &voc = vocs[filename];
00101 voc.Read(filename);
00102 return voc;
00103 } else {
00104 return vi->second;
00105 }
00106 }
00107
00108 int PrefixTreeMap::Read(const std::string& fileNameStem, int numVocs)
00109 {
00110 std::string ifs(fileNameStem + ".srctree"),
00111 ift(fileNameStem + ".tgtdata"),
00112 ifi(fileNameStem + ".idx"),
00113 ifv(fileNameStem + ".voc");
00114
00115 std::vector<OFF_T> srcOffsets;
00116 FILE *ii=fOpen(ifi.c_str(),"rb");
00117 fReadVector(ii,srcOffsets);
00118 fClose(ii);
00119
00120 if (m_FileSrc) {
00121 fClose(m_FileSrc);
00122 }
00123 m_FileSrc = fOpen(ifs.c_str(),"rb");
00124 if (m_FileTgt) {
00125 fClose(m_FileTgt);
00126 }
00127 m_FileTgt = fOpen(ift.c_str(),"rb");
00128
00129 m_Data.resize(srcOffsets.size());
00130
00131 for(size_t i = 0; i < m_Data.size(); ++i) {
00132 m_Data[i] = CPT(m_FileSrc, srcOffsets[i]);
00133 }
00134
00135 if(-1 == numVocs) {
00136 char num[5];
00137 numVocs = 0;
00138 sprintf(num, "%d", numVocs);
00139 while(FileExists(ifv + num)) {
00140 ++numVocs;
00141 sprintf(num, "%d", numVocs);
00142 }
00143 }
00144 char num[5];
00145 m_Voc.resize(numVocs);
00146 for(int i = 0; i < numVocs; ++i) {
00147 sprintf(num, "%d", i);
00148
00149
00150 m_Voc[i] = &ReadVoc(m_vocs, ifv + num);
00151 }
00152
00153 TRACE_ERR("binary file loaded, default OFF_T: "<< PTF::getDefault()<<"\n");
00154 return 1;
00155 };
00156
00157
00158 void PrefixTreeMap::GetCandidates(const IPhrase& key, Candidates* cands)
00159 {
00160
00161 if(key.empty() || key[0] >= m_Data.size() || !m_Data[key[0]]) {
00162 return;
00163 }
00164 UTIL_THROW_IF2(m_Data[key[0]]->findKey(key[0]) >= m_Data[key[0]]->size(),
00165 "Key not found: " << key[0]);
00166
00167 OFF_T candOffset = m_Data[key[0]]->find(key);
00168 if(candOffset == InvalidOffT) {
00169 return;
00170 }
00171 fSeek(m_FileTgt,candOffset);
00172 cands->readBin(m_FileTgt);
00173 }
00174
00175 void PrefixTreeMap::GetCandidates(const PPimp& p, Candidates* cands)
00176 {
00177 UTIL_THROW_IF2(!p.isValid(), "Not a valid PPimp...");
00178 if(p.isRoot()) {
00179 return;
00180 };
00181 OFF_T candOffset = p.ptr()->getData(p.idx);
00182 if(candOffset == InvalidOffT) {
00183 return;
00184 }
00185 fSeek(m_FileTgt,candOffset);
00186 cands->readBin(m_FileTgt);
00187 }
00188
00189 std::vector< std::string const * > PrefixTreeMap::ConvertPhrase(const IPhrase& p, unsigned int voc) const
00190 {
00191 UTIL_THROW_IF2(voc >= m_Voc.size() || m_Voc[voc] == 0,
00192 "Invalid vocab id: " << voc);
00193 std::vector< std::string const * > result;
00194 result.reserve(p.size());
00195 for(IPhrase::const_iterator i = p.begin(); i != p.end(); ++i) {
00196 result.push_back(&(m_Voc[voc]->symbol(*i)));
00197 }
00198 return result;
00199 }
00200
00201 IPhrase PrefixTreeMap::ConvertPhrase(const std::vector< std::string >& p, unsigned int voc) const
00202 {
00203 UTIL_THROW_IF2(voc >= m_Voc.size() || m_Voc[voc] == 0,
00204 "Invalid vocab id: " << voc);
00205 IPhrase result;
00206 result.reserve(p.size());
00207 for(size_t i = 0; i < p.size(); ++i) {
00208 result.push_back(m_Voc[voc]->index(p[i]));
00209 }
00210 return result;
00211 }
00212
00213 LabelId PrefixTreeMap::ConvertWord(const std::string& w, unsigned int voc) const
00214 {
00215 UTIL_THROW_IF2(voc >= m_Voc.size() || m_Voc[voc] == 0,
00216 "Invalid vocab id: " << voc);
00217 return m_Voc[voc]->index(w);
00218 }
00219
00220 std::string PrefixTreeMap::ConvertWord(LabelId w, unsigned int voc) const
00221 {
00222 UTIL_THROW_IF2(voc >= m_Voc.size() || m_Voc[voc] == 0,
00223 "Invalid vocab id: " << voc);
00224 if(w == PrefixTreeMap::MagicWord) {
00225 return "|||";
00226 } else if (w == InvalidLabelId) {
00227 return "<invalid>";
00228 } else {
00229 return m_Voc[voc]->symbol(w);
00230 }
00231 }
00232
00233 PPimp* PrefixTreeMap::GetRoot()
00234 {
00235 return m_PtrPool.get(PPimp(0,0,1));
00236 }
00237
00238 PPimp* PrefixTreeMap::Extend(PPimp* p, LabelId wi)
00239 {
00240 UTIL_THROW_IF2(!p->isValid(), "Not a valid PPimp...");
00241
00242 if(wi == InvalidLabelId) {
00243 return 0;
00244
00245 } else if(p->isRoot()) {
00246 if(wi < m_Data.size() && m_Data[wi]) {
00247 const void* ptr = m_Data[wi]->findKeyPtr(wi);
00248 UTIL_THROW_IF2(ptr == NULL, "Null pointer");
00249 return m_PtrPool.get(PPimp(m_Data[wi],m_Data[wi]->findKey(wi),0));
00250 }
00251 } else if(PTF const* nextP = p->ptr()->getPtr(p->idx)) {
00252 return m_PtrPool.get(PPimp(nextP, nextP->findKey(wi),0));
00253 }
00254 return 0;
00255
00256 }
00257
00258 }
00259