00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef moses_ListCoders_h
00023 #define moses_ListCoders_h
00024
00025 #include <cmath>
00026 #include <cassert>
00027
00028 namespace Moses
00029 {
00030
00031 template <typename T = unsigned int>
00032 class VarIntType
00033 {
00034 private:
00035 template <typename IntType, typename OutIt>
00036 static void EncodeSymbol(IntType input, OutIt output) {
00037 if(input == 0) {
00038 *output = 0;
00039 output++;
00040 return;
00041 }
00042
00043 T msb = 1 << (sizeof(T)*8-1);
00044 IntType mask = ~msb;
00045 IntType shift = (sizeof(T)*8-1);
00046
00047 while(input) {
00048 T res = input & mask;
00049 input >>= shift;
00050 if(input)
00051 res |= msb;
00052 *output = res;
00053 output++;
00054 }
00055 };
00056
00057 template <typename InIt, typename IntType>
00058 static void DecodeSymbol(InIt &it, InIt end, IntType &output) {
00059 T msb = 1 << (sizeof(T)*8-1);
00060 IntType shift = (sizeof(T)*8-1);
00061
00062 output = 0;
00063 size_t i = 0;
00064 while(it != end && *it & msb) {
00065 IntType temp = *it & ~msb;
00066 temp <<= shift*i;
00067 output |= temp;
00068 it++;
00069 i++;
00070 }
00071 assert(it != end);
00072
00073 IntType temp = *it;
00074 temp <<= shift*i;
00075 output |= temp;
00076 it++;
00077 }
00078
00079 public:
00080
00081 template <typename InIt, typename OutIt>
00082 static void Encode(InIt it, InIt end, OutIt outIt) {
00083 while(it != end) {
00084 EncodeSymbol(*it, outIt);
00085 it++;
00086 }
00087 }
00088
00089 template <typename InIt, typename OutIt>
00090 static void Decode(InIt &it, InIt end, OutIt outIt) {
00091 while(it != end) {
00092 size_t output;
00093 DecodeSymbol(it, end, output);
00094 *outIt = output;
00095 outIt++;
00096 }
00097 }
00098
00099 template <typename InIt>
00100 static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
00101 size_t sum = 0;
00102 size_t curr = 0;
00103
00104 while(it != end && curr < num) {
00105 size_t output;
00106 DecodeSymbol(it, end, output);
00107 sum += output;
00108 curr++;
00109 }
00110
00111 return sum;
00112 }
00113
00114 };
00115
00116 typedef VarIntType<unsigned char> VarByte;
00117
00118 typedef VarByte VarInt8;
00119 typedef VarIntType<unsigned short> VarInt16;
00120 typedef VarIntType<unsigned int> VarInt32;
00121
00122 class Simple9
00123 {
00124 private:
00125 typedef unsigned int uint;
00126
00127 template <typename InIt>
00128 inline static void EncodeSymbol(uint &output, InIt it, InIt end) {
00129 uint length = end - it;
00130
00131 uint type = 0;
00132 uint bitlength = 0;
00133
00134 switch(length) {
00135 case 1:
00136 type = 1;
00137 bitlength = 28;
00138 break;
00139 case 2:
00140 type = 2;
00141 bitlength = 14;
00142 break;
00143 case 3:
00144 type = 3;
00145 bitlength = 9;
00146 break;
00147 case 4:
00148 type = 4;
00149 bitlength = 7;
00150 break;
00151 case 5:
00152 type = 5;
00153 bitlength = 5;
00154 break;
00155 case 7:
00156 type = 6;
00157 bitlength = 4;
00158 break;
00159 case 9:
00160 type = 7;
00161 bitlength = 3;
00162 break;
00163 case 14:
00164 type = 8;
00165 bitlength = 2;
00166 break;
00167 case 28:
00168 type = 9;
00169 bitlength = 1;
00170 break;
00171 }
00172
00173 output = 0;
00174 output |= (type << 28);
00175
00176 uint i = 0;
00177 while(it != end) {
00178 UTIL_THROW_IF2(*it > 268435455, "You are trying to encode " << *it
00179 << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");
00180
00181 uint l = bitlength * (length-i-1);
00182 output |= *it << l;
00183 it++;
00184 i++;
00185 }
00186 }
00187
00188 template <typename OutIt>
00189 static inline void DecodeSymbol(uint input, OutIt outIt) {
00190 uint type = (input >> 28);
00191
00192 uint bitlen = 0;
00193 uint shift = 0;
00194 uint mask = 0;
00195
00196 switch(type) {
00197 case 1:
00198 bitlen = 28;
00199 shift = 0;
00200 mask = 268435455;
00201 break;
00202 case 2:
00203 bitlen = 14;
00204 shift = 14;
00205 mask = 16383;
00206 break;
00207 case 3:
00208 bitlen = 9;
00209 shift = 18;
00210 mask = 511;
00211 break;
00212 case 4:
00213 bitlen = 7;
00214 shift = 21;
00215 mask = 127;
00216 break;
00217 case 5:
00218 bitlen = 5;
00219 shift = 20;
00220 mask = 31;
00221 break;
00222 case 6:
00223 bitlen = 4;
00224 shift = 24;
00225 mask = 15;
00226 break;
00227 case 7:
00228 bitlen = 3;
00229 shift = 24;
00230 mask = 7;
00231 break;
00232 case 8:
00233 bitlen = 2;
00234 shift = 26;
00235 mask = 3;
00236 break;
00237 case 9:
00238 bitlen = 1;
00239 shift = 27;
00240 mask = 1;
00241 break;
00242 }
00243
00244 while(shift > 0) {
00245 *outIt = (input >> shift) & mask;
00246 shift -= bitlen;
00247 outIt++;
00248 }
00249 *outIt = input & mask;
00250 outIt++;
00251 }
00252
00253 static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) {
00254 uint type = (input >> 28);
00255
00256 uint bitlen = 0;
00257 uint shift = 0;
00258 uint mask = 0;
00259
00260 switch(type) {
00261 case 1:
00262 bitlen = 28;
00263 shift = 0;
00264 mask = 268435455;
00265 break;
00266 case 2:
00267 bitlen = 14;
00268 shift = 14;
00269 mask = 16383;
00270 break;
00271 case 3:
00272 bitlen = 9;
00273 shift = 18;
00274 mask = 511;
00275 break;
00276 case 4:
00277 bitlen = 7;
00278 shift = 21;
00279 mask = 127;
00280 break;
00281 case 5:
00282 bitlen = 5;
00283 shift = 20;
00284 mask = 31;
00285 break;
00286 case 6:
00287 bitlen = 4;
00288 shift = 24;
00289 mask = 15;
00290 break;
00291 case 7:
00292 bitlen = 3;
00293 shift = 24;
00294 mask = 7;
00295 break;
00296 case 8:
00297 bitlen = 2;
00298 shift = 26;
00299 mask = 3;
00300 break;
00301 case 9:
00302 bitlen = 1;
00303 shift = 27;
00304 mask = 1;
00305 break;
00306 }
00307
00308 size_t sum = 0;
00309 while(shift > 0) {
00310 sum += (input >> shift) & mask;
00311 shift -= bitlen;
00312 if(++curr == num)
00313 return sum;
00314 }
00315 sum += input & mask;
00316 curr++;
00317 return sum;
00318 }
00319
00320 public:
00321 template <typename InIt, typename OutIt>
00322 static void Encode(InIt it, InIt end, OutIt outIt) {
00323 uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
00324
00325 uint buffer[28];
00326 for(InIt i = it; i < end; i++) {
00327 uint lastbit = 1;
00328 uint lastpos = 0;
00329 uint lastyes = 0;
00330 uint j = 0;
00331
00332 double log2 = log(2);
00333 while(j < 9 && lastpos < 28 && (i+lastpos) < end) {
00334 if(lastpos >= parts[j])
00335 j++;
00336
00337 buffer[lastpos] = *(i + lastpos);
00338
00339 uint reqbit = ceil(log(buffer[lastpos]+1)/log2);
00340 assert(reqbit <= 28);
00341
00342 uint bit = 28/floor(28/reqbit);
00343 if(lastbit < bit)
00344 lastbit = bit;
00345
00346 if(parts[j] > 28/lastbit)
00347 break;
00348 else if(lastpos == parts[j]-1)
00349 lastyes = lastpos;
00350
00351 lastpos++;
00352 }
00353 i += lastyes;
00354
00355 uint length = lastyes + 1;
00356 uint output;
00357 EncodeSymbol(output, buffer, buffer + length);
00358
00359 *outIt = output;
00360 outIt++;
00361 }
00362 }
00363
00364 template <typename InIt, typename OutIt>
00365 static void Decode(InIt &it, InIt end, OutIt outIt) {
00366 while(it != end) {
00367 DecodeSymbol(*it, outIt);
00368 it++;
00369 }
00370 }
00371
00372 template <typename InIt>
00373 static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
00374 size_t sum = 0;
00375 size_t curr = 0;
00376 while(it != end && curr < num) {
00377 sum += DecodeAndSumSymbol(*it, num, curr);
00378 it++;
00379 }
00380 assert(curr == num);
00381 return sum;
00382 }
00383 };
00384
00385 }
00386
00387 #endif