00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef moses_CanonicalHuffman_h
00023 #define moses_CanonicalHuffman_h
00024
00025 #include <string>
00026 #include <algorithm>
00027 #include <boost/dynamic_bitset.hpp>
00028 #include <boost/unordered_map.hpp>
00029
00030 #include "ThrowingFwrite.h"
00031
00032 namespace Moses
00033 {
00034
00035 template <typename Data>
00036 class CanonicalHuffman
00037 {
00038 private:
00039 std::vector<Data> m_symbols;
00040 std::vector<size_t> m_firstCodes;
00041 std::vector<size_t> m_lengthIndex;
00042
00043 typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
00044 EncodeMap m_encodeMap;
00045
00046 struct MinHeapSorter {
00047 std::vector<size_t>& m_vec;
00048
00049 MinHeapSorter(std::vector<size_t>& vec) : m_vec(vec) { }
00050
00051 bool operator()(size_t a, size_t b) {
00052 return m_vec[a] > m_vec[b];
00053 }
00054 };
00055
00056 template <class Iterator>
00057 void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths) {
00058 size_t n = std::distance(begin, end);
00059 std::vector<size_t> A(2 * n, 0);
00060
00061 m_symbols.resize(n);
00062 size_t i = 0;
00063 for(Iterator it = begin; it != end; it++) {
00064 m_symbols[i] = it->first;
00065
00066 A[i] = n + i;
00067 A[n + i] = it->second;
00068 i++;
00069 }
00070
00071 if(n == 1) {
00072 lengths.push_back(1);
00073 return;
00074 }
00075
00076 MinHeapSorter hs(A);
00077 std::make_heap(A.begin(), A.begin() + n, hs);
00078
00079
00080 volatile size_t h = n;
00081 volatile size_t m1, m2;
00082 while(h > 1) {
00083 m1 = A[0];
00084 std::pop_heap(A.begin(), A.begin() + h, hs);
00085
00086 h--;
00087
00088 m2 = A[0];
00089 std::pop_heap(A.begin(), A.begin() + h, hs);
00090
00091 A[h] = A[m1] + A[m2];
00092 A[h-1] = h;
00093 A[m1] = A[m2] = h;
00094
00095 std::push_heap(A.begin(), A.begin() + h, hs);
00096 }
00097
00098 A[1] = 0;
00099 for(size_t i = 2; i < 2*n; i++)
00100 A[i] = A[A[i]] + 1;
00101
00102 lengths.resize(n);
00103 for(size_t i = 0; i < n; i++)
00104 lengths[i] = A[i + n];
00105 }
00106
00107 void CalcCodes(std::vector<size_t>& lengths) {
00108 std::vector<size_t> numLength;
00109 for(std::vector<size_t>::iterator it = lengths.begin();
00110 it != lengths.end(); it++) {
00111 size_t length = *it;
00112 if(numLength.size() <= length)
00113 numLength.resize(length + 1, 0);
00114 numLength[length]++;
00115 }
00116
00117 m_lengthIndex.resize(numLength.size());
00118 m_lengthIndex[0] = 0;
00119 for(size_t l = 1; l < numLength.size(); l++)
00120 m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1];
00121
00122 size_t maxLength = numLength.size() - 1;
00123
00124 m_firstCodes.resize(maxLength + 1, 0);
00125 for(size_t l = maxLength - 1; l > 0; l--)
00126 m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
00127
00128 std::vector<Data> t_symbols;
00129 t_symbols.resize(lengths.size());
00130
00131 std::vector<size_t> nextCode = m_firstCodes;
00132 for(size_t i = 0; i < lengths.size(); i++) {
00133 Data data = m_symbols[i];
00134 size_t length = lengths[i];
00135
00136 size_t pos = m_lengthIndex[length]
00137 + (nextCode[length] - m_firstCodes[length]);
00138 t_symbols[pos] = data;
00139
00140 nextCode[length] = nextCode[length] + 1;
00141 }
00142
00143 m_symbols.swap(t_symbols);
00144 }
00145
00146 void CreateCodeMap() {
00147 for(size_t l = 1; l < m_lengthIndex.size(); l++) {
00148 size_t intCode = m_firstCodes[l];
00149 size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1]
00150 : m_symbols.size()) - m_lengthIndex[l];
00151
00152 for(size_t i = 0; i < num; i++) {
00153 Data data = m_symbols[m_lengthIndex[l] + i];
00154 boost::dynamic_bitset<> bitCode(l, intCode);
00155 m_encodeMap[data] = bitCode;
00156 intCode++;
00157 }
00158 }
00159 }
00160
00161 const boost::dynamic_bitset<>& Encode(Data data) const {
00162 typename EncodeMap::const_iterator it = m_encodeMap.find(data);
00163 UTIL_THROW_IF2(it == m_encodeMap.end(), "Cannot find symbol in encoding map");
00164 return it->second;
00165 }
00166
00167 template <class BitWrapper>
00168 void PutCode(BitWrapper& bitWrapper, const boost::dynamic_bitset<>& code) {
00169 for(int j = code.size()-1; j >= 0; j--)
00170 bitWrapper.Put(code[j]);
00171 }
00172
00173 public:
00174
00175 template <class Iterator>
00176 CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) {
00177 std::vector<size_t> lengths;
00178 CalcLengths(begin, end, lengths);
00179 CalcCodes(lengths);
00180
00181 if(forEncoding)
00182 CreateCodeMap();
00183 }
00184
00185 CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) {
00186 Load(pFile);
00187
00188 if(forEncoding)
00189 CreateCodeMap();
00190 }
00191
00192 template <class BitWrapper>
00193 void Put(BitWrapper& bitWrapper, Data data) {
00194 PutCode(bitWrapper, Encode(data));
00195 }
00196
00197 template <class BitWrapper>
00198 Data Read(BitWrapper& bitWrapper) {
00199 if(bitWrapper.TellFromEnd()) {
00200 size_t intCode = bitWrapper.Read();
00201 size_t len = 1;
00202 while(intCode < m_firstCodes[len]) {
00203 intCode = 2 * intCode + bitWrapper.Read();
00204 len++;
00205 }
00206 return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])];
00207 }
00208 return Data();
00209 }
00210
00211 size_t Load(std::FILE* pFile) {
00212 size_t start = std::ftell(pFile);
00213 size_t read = 0;
00214
00215 size_t size;
00216 read += std::fread(&size, sizeof(size_t), 1, pFile);
00217 m_symbols.resize(size);
00218 read += std::fread(&m_symbols[0], sizeof(Data), size, pFile);
00219
00220 read += std::fread(&size, sizeof(size_t), 1, pFile);
00221 m_firstCodes.resize(size);
00222 read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile);
00223
00224 read += std::fread(&size, sizeof(size_t), 1, pFile);
00225 m_lengthIndex.resize(size);
00226 read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
00227
00228 return std::ftell(pFile) - start;
00229 }
00230
00231 size_t Save(std::FILE* pFile) {
00232 size_t start = std::ftell(pFile);
00233
00234 size_t size = m_symbols.size();
00235 ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
00236 ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile);
00237
00238 size = m_firstCodes.size();
00239 ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
00240 ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile);
00241
00242 size = m_lengthIndex.size();
00243 ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
00244 ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
00245
00246 return std::ftell(pFile) - start;
00247 }
00248 };
00249
00250 template <class Container = std::string>
00251 class BitWrapper
00252 {
00253 private:
00254 Container& m_data;
00255
00256 typename Container::iterator m_iterator;
00257 typename Container::value_type m_currentValue;
00258
00259 size_t m_valueBits;
00260 typename Container::value_type m_mask;
00261 size_t m_bitPos;
00262
00263 public:
00264
00265 BitWrapper(Container &data)
00266 : m_data(data), m_iterator(m_data.begin()), m_currentValue(0),
00267 m_valueBits(sizeof(typename Container::value_type) * 8),
00268 m_mask(1), m_bitPos(0) { }
00269
00270 bool Read() {
00271 if(m_bitPos % m_valueBits == 0) {
00272 if(m_iterator != m_data.end())
00273 m_currentValue = *m_iterator++;
00274 } else
00275 m_currentValue = m_currentValue >> 1;
00276
00277 m_bitPos++;
00278 return (m_currentValue & m_mask);
00279 }
00280
00281 void Put(bool bit) {
00282 if(m_bitPos % m_valueBits == 0)
00283 m_data.push_back(0);
00284
00285 if(bit)
00286 m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits);
00287
00288 m_bitPos++;
00289 }
00290
00291 size_t Tell() {
00292 return m_bitPos;
00293 }
00294
00295 size_t TellFromEnd() {
00296 if(m_data.size() * m_valueBits < m_bitPos)
00297 return 0;
00298 return m_data.size() * m_valueBits - m_bitPos;
00299 }
00300
00301 void Seek(size_t bitPos) {
00302 m_bitPos = bitPos;
00303 m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits);
00304 m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits);
00305 m_iterator++;
00306 }
00307
00308 void SeekFromEnd(size_t bitPosFromEnd) {
00309 size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd;
00310 Seek(bitPos);
00311 }
00312
00313 void Reset() {
00314 m_iterator = m_data.begin();
00315 m_currentValue = 0;
00316 m_bitPos = 0;
00317 }
00318
00319 Container& GetContainer() {
00320 return m_data;
00321 }
00322 };
00323
00324 }
00325
00326 #endif