00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef moses_StringVector_h
00023 #define moses_StringVector_h
00024
00025 #include <vector>
00026 #include <algorithm>
00027 #include <string>
00028 #include <iterator>
00029 #include <cstdio>
00030 #include <cassert>
00031
00032 #include <boost/iterator/iterator_facade.hpp>
00033
00034 #include "ThrowingFwrite.h"
00035 #include "MonotonicVector.h"
00036 #include "MmapAllocator.h"
00037
00038 namespace Moses
00039 {
00040
00041
00042
00043 template <typename ValueIteratorT>
00044 class ValueIteratorRange
00045 {
00046 private:
00047 ValueIteratorT m_begin;
00048 ValueIteratorT m_end;
00049
00050 public:
00051 ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);
00052
00053 const ValueIteratorT& begin() const;
00054 const ValueIteratorT& end() const;
00055 const std::string str() const;
00056 operator const std::string() {
00057 return str();
00058 }
00059
00060 size_t size() {
00061 return std::distance(m_begin, m_end);
00062 }
00063
00064 template <typename StringT>
00065 bool operator==(const StringT& o) const;
00066 bool operator==(const char* c) const;
00067
00068 template <typename StringT>
00069 bool operator<(const StringT& o) const;
00070 bool operator<(const char* c) const;
00071 };
00072
00073
00074
00075 template <typename ValueT = unsigned char, typename PosT = unsigned int,
00076 template <typename> class Allocator = std::allocator>
00077 class StringVector
00078 {
00079 protected:
00080 bool m_sorted;
00081 bool m_memoryMapped;
00082
00083 std::vector<ValueT, Allocator<ValueT> >* m_charArray;
00084 MonotonicVector<PosT, unsigned int, 32> m_positions;
00085
00086 virtual const ValueT* value_ptr(PosT i) const;
00087
00088 public:
00089
00090 typedef ValueIteratorRange<const ValueT *> range;
00091
00092
00093
00094 class RangeIterator : public boost::iterator_facade<RangeIterator,
00095 range, std::random_access_iterator_tag, range, PosT>
00096 {
00097
00098 private:
00099 PosT m_index;
00100 StringVector<ValueT, PosT, Allocator>* m_container;
00101
00102 public:
00103 RangeIterator();
00104 RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
00105
00106 PosT get_index();
00107
00108 private:
00109 friend class boost::iterator_core_access;
00110
00111 range dereference() const;
00112 bool equal(RangeIterator const& other) const;
00113 void increment();
00114 void decrement();
00115 void advance(PosT n);
00116
00117 PosT distance_to(RangeIterator const& other) const;
00118 };
00119
00120
00121
00122 class StringIterator : public boost::iterator_facade<StringIterator,
00123 std::string, std::random_access_iterator_tag, const std::string, PosT>
00124 {
00125
00126 private:
00127 PosT m_index;
00128 StringVector<ValueT, PosT, Allocator>* m_container;
00129
00130 public:
00131 StringIterator();
00132 StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
00133
00134 PosT get_index();
00135
00136 private:
00137 friend class boost::iterator_core_access;
00138
00139 const std::string dereference() const;
00140 bool equal(StringIterator const& other) const;
00141 void increment();
00142 void decrement();
00143 void advance(PosT n);
00144 PosT distance_to(StringIterator const& other) const;
00145 };
00146
00147 typedef RangeIterator iterator;
00148 typedef StringIterator string_iterator;
00149
00150 StringVector(bool allocate = false);
00151 StringVector(Allocator<ValueT>& alloc);
00152
00153 virtual ~StringVector() {
00154 delete m_charArray;
00155 }
00156
00157 void swap(StringVector<ValueT, PosT, Allocator> &c) {
00158 m_positions.commit();
00159 m_positions.swap(c.m_positions);
00160 m_charArray->swap(*c.m_charArray);
00161
00162 bool temp = m_sorted;
00163 m_sorted = c.m_sorted;
00164 c.m_sorted = temp;
00165 }
00166
00167 bool is_sorted() const;
00168 PosT size() const;
00169 virtual PosT size2() const;
00170
00171 template<class Iterator> Iterator begin() const;
00172 template<class Iterator> Iterator end() const;
00173
00174 iterator begin() const;
00175 iterator end() const;
00176
00177 PosT length(PosT i) const;
00178
00179
00180 const ValueT* begin(PosT i) const;
00181 const ValueT* end(PosT i) const;
00182
00183 void clear() {
00184 m_charArray->clear();
00185 m_sorted = true;
00186 m_positions = MonotonicVector<PosT, unsigned int, 32>();
00187 }
00188
00189 range at(PosT i) const;
00190 range operator[](PosT i) const;
00191 range back() const;
00192
00193 template <typename StringT>
00194 void push_back(StringT s);
00195 void push_back(const char* c);
00196
00197 template <typename StringT>
00198 PosT find(StringT &s) const;
00199 PosT find(const char* c) const;
00200
00201 virtual size_t load(std::FILE* in, bool memoryMapped = false) {
00202 size_t size = 0;
00203 m_memoryMapped = memoryMapped;
00204
00205 size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
00206 size += m_positions.load(in, false);
00207
00208 size += loadCharArray(m_charArray, in, m_memoryMapped);
00209 return size;
00210 }
00211
00212 size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >*& c,
00213 std::FILE* in, bool map = false) {
00214
00215 assert(map == false);
00216
00217 size_t byteSize = 0;
00218
00219 size_t valSize;
00220 byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
00221
00222 c = new std::vector<ValueT, std::allocator<ValueT> >(valSize, 0);
00223 byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
00224
00225 return byteSize;
00226 }
00227
00228 size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >*& c,
00229 std::FILE* in, bool map = false) {
00230 size_t byteSize = 0;
00231
00232 size_t valSize;
00233 byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
00234
00235 if(map == false) {
00236
00237
00238 c = new std::vector<ValueT, MmapAllocator<ValueT> >(valSize, 0);
00239 byteSize += std::fread(&(*c)[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
00240 } else {
00241
00242
00243
00244 size_t valPos = std::ftell(in);
00245 Allocator<ValueT> alloc(in, valPos);
00246 c = new std::vector<ValueT, Allocator<ValueT> >(alloc);
00247 c->resize(valSize, 0);
00248
00249 byteSize += valSize * sizeof(ValueT);
00250 }
00251
00252 return byteSize;
00253 }
00254
00255 size_t load(std::string filename, bool memoryMapped = false) {
00256 std::FILE* pFile = fopen(filename.c_str(), "r");
00257 size_t byteSize = load(pFile, memoryMapped);
00258 fclose(pFile);
00259 return byteSize;
00260 }
00261
00262 size_t save(std::FILE* out) {
00263 size_t byteSize = 0;
00264 byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
00265
00266 byteSize += m_positions.save(out);
00267
00268 size_t valSize = size2();
00269 byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
00270 byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
00271
00272 return byteSize;
00273 }
00274
00275 size_t save(std::string filename) {
00276 std::FILE* pFile = fopen(filename.c_str(), "w");
00277 size_t byteSize = save(pFile);
00278 fclose(pFile);
00279 return byteSize;
00280 }
00281
00282 };
00283
00284
00285
00286
00287
00288 template <typename ValueIteratorT>
00289 ValueIteratorRange<ValueIteratorT>::ValueIteratorRange(ValueIteratorT begin,
00290 ValueIteratorT end) : m_begin(begin), m_end(end) { }
00291
00292 template <typename ValueIteratorT>
00293 const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::begin() const
00294 {
00295 return m_begin;
00296 }
00297
00298 template <typename ValueIteratorT>
00299 const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::end() const
00300 {
00301 return m_end;
00302 }
00303
00304 template <typename ValueIteratorT>
00305 const std::string ValueIteratorRange<ValueIteratorT>::str() const
00306 {
00307 std::string dummy;
00308 for(ValueIteratorT it = m_begin; it != m_end; it++)
00309 dummy.push_back(*it);
00310 return dummy;
00311 }
00312
00313 template <typename ValueIteratorT>
00314 template <typename StringT>
00315 bool ValueIteratorRange<ValueIteratorT>::operator==(const StringT& o) const
00316 {
00317 if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end()))
00318 return std::equal(m_begin, m_end, o.begin());
00319 else
00320 return false;
00321 }
00322
00323 template <typename ValueIteratorT>
00324 bool ValueIteratorRange<ValueIteratorT>::operator==(const char* c) const
00325 {
00326 return *this == std::string(c);
00327 }
00328
00329 template <typename ValueIteratorT>
00330 template <typename StringT>
00331 bool ValueIteratorRange<ValueIteratorT>::operator<(const StringT &s2) const
00332 {
00333 return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
00334 std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
00335 }
00336
00337 template <typename ValueIteratorT>
00338 bool ValueIteratorRange<ValueIteratorT>::operator<(const char* c) const
00339 {
00340 return *this < std::string(c);
00341 }
00342
00343 template <typename StringT, typename ValueIteratorT>
00344 bool operator<(const StringT &s1, const ValueIteratorRange<ValueIteratorT> &s2)
00345 {
00346 return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
00347 std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
00348 }
00349
00350 template <typename ValueIteratorT>
00351 bool operator<(const char* c, const ValueIteratorRange<ValueIteratorT> &s2)
00352 {
00353 size_t len = std::char_traits<char>::length(c);
00354 return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
00355 std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
00356 }
00357
00358 template <typename OStream, typename ValueIteratorT>
00359 OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
00360 {
00361 ValueIteratorT it = cr.begin();
00362 while(it != cr.end())
00363 os << *(it++);
00364 return os;
00365 }
00366
00367
00368
00369 template<typename ValueT, typename PosT, template <typename> class Allocator>
00370 StringVector<ValueT, PosT, Allocator>::StringVector(bool allocate)
00371 : m_sorted(true), m_memoryMapped(false),
00372 m_charArray(allocate ? new std::vector<ValueT, Allocator<ValueT> >() : 0) { }
00373
00374 template<typename ValueT, typename PosT, template <typename> class Allocator>
00375 StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> &alloc)
00376 : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
00377
00378 template<typename ValueT, typename PosT, template <typename> class Allocator>
00379 template <typename StringT>
00380 void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
00381 {
00382 if(is_sorted() && size() && !(back() < s))
00383 m_sorted = false;
00384
00385 m_positions.push_back(size2());
00386 std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
00387 }
00388
00389 template<typename ValueT, typename PosT, template <typename> class Allocator>
00390 void StringVector<ValueT, PosT, Allocator>::push_back(const char* c)
00391 {
00392 std::string dummy(c);
00393 push_back(dummy);
00394 }
00395
00396 template<typename ValueT, typename PosT, template <typename> class Allocator>
00397 template <typename Iterator>
00398 Iterator StringVector<ValueT, PosT, Allocator>::begin() const
00399 {
00400 return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
00401 }
00402
00403 template<typename ValueT, typename PosT, template <typename> class Allocator>
00404 template <typename Iterator>
00405 Iterator StringVector<ValueT, PosT, Allocator>::end() const
00406 {
00407 return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), size());
00408 }
00409
00410 template<typename ValueT, typename PosT, template <typename> class Allocator>
00411 typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::begin() const
00412 {
00413 return begin<iterator>();
00414 };
00415
00416 template<typename ValueT, typename PosT, template <typename> class Allocator>
00417 typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::end() const
00418 {
00419 return end<iterator>();
00420 };
00421
00422 template<typename ValueT, typename PosT, template <typename> class Allocator>
00423 bool StringVector<ValueT, PosT, Allocator>::is_sorted() const
00424 {
00425 return m_sorted;
00426 }
00427
00428 template<typename ValueT, typename PosT, template <typename> class Allocator>
00429 PosT StringVector<ValueT, PosT, Allocator>::size() const
00430 {
00431 return m_positions.size();
00432 }
00433
00434 template<typename ValueT, typename PosT, template <typename> class Allocator>
00435 PosT StringVector<ValueT, PosT, Allocator>::size2() const
00436 {
00437 return m_charArray->size();
00438 }
00439
00440 template<typename ValueT, typename PosT, template <typename> class Allocator>
00441 typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::at(PosT i) const
00442 {
00443 return range(begin(i), end(i));
00444 }
00445
00446 template<typename ValueT, typename PosT, template <typename> class Allocator>
00447 typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::operator[](PosT i) const
00448 {
00449 return at(i);
00450 }
00451
00452 template<typename ValueT, typename PosT, template <typename> class Allocator>
00453 typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::back() const
00454 {
00455 return at(size()-1);
00456 }
00457
00458 template<typename ValueT, typename PosT, template <typename> class Allocator>
00459 PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
00460 {
00461 if(i+1 < size())
00462 return m_positions[i+1] - m_positions[i];
00463 else
00464 return size2() - m_positions[i];
00465 }
00466
00467 template<typename ValueT, typename PosT, template <typename> class Allocator>
00468 const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
00469 {
00470 return &(*m_charArray)[m_positions[i]];
00471 }
00472
00473 template<typename ValueT, typename PosT, template <typename> class Allocator>
00474
00475 const ValueT* StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
00476 {
00477
00478 return value_ptr(i);
00479 }
00480
00481 template<typename ValueT, typename PosT, template <typename> class Allocator>
00482
00483 const ValueT* StringVector<ValueT, PosT, Allocator>::end(PosT i) const
00484 {
00485
00486 return value_ptr(i) + length(i);
00487 }
00488
00489 template<typename ValueT, typename PosT, template <typename> class Allocator>
00490 template <typename StringT>
00491 PosT StringVector<ValueT, PosT, Allocator>::find(StringT &s) const
00492 {
00493 if(m_sorted)
00494 return std::distance(begin(), std::lower_bound(begin(), end(), s));
00495 return std::distance(begin(), std::find(begin(), end(), s));
00496 }
00497
00498 template<typename ValueT, typename PosT, template <typename> class Allocator>
00499 PosT StringVector<ValueT, PosT, Allocator>::find(const char* c) const
00500 {
00501 std::string s(c);
00502 return find(s);
00503 }
00504
00505
00506
00507 template<typename ValueT, typename PosT, template <typename> class Allocator>
00508 StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator() : m_index(0), m_container(0) { }
00509
00510 template<typename ValueT, typename PosT, template <typename> class Allocator>
00511 StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index)
00512 : m_index(index), m_container(&sv) { }
00513
00514 template<typename ValueT, typename PosT, template <typename> class Allocator>
00515 PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::get_index()
00516 {
00517 return m_index;
00518 }
00519
00520 template<typename ValueT, typename PosT, template <typename> class Allocator>
00521 typename StringVector<ValueT, PosT, Allocator>::range
00522 StringVector<ValueT, PosT, Allocator>::RangeIterator::dereference() const
00523 {
00524 return typename StringVector<ValueT, PosT, Allocator>::range(
00525 m_container->begin(m_index),
00526 m_container->end(m_index)
00527 );
00528 }
00529
00530 template<typename ValueT, typename PosT, template <typename> class Allocator>
00531 bool StringVector<ValueT, PosT, Allocator>::RangeIterator::equal(
00532 StringVector<ValueT, PosT, Allocator>::RangeIterator const& other) const
00533 {
00534 return m_index == other.m_index && m_container == other.m_container;
00535 }
00536
00537 template<typename ValueT, typename PosT, template <typename> class Allocator>
00538 void StringVector<ValueT, PosT, Allocator>::RangeIterator::increment()
00539 {
00540 m_index++;
00541 }
00542
00543 template<typename ValueT, typename PosT, template <typename> class Allocator>
00544 void StringVector<ValueT, PosT, Allocator>::RangeIterator::decrement()
00545 {
00546 m_index--;
00547 }
00548
00549 template<typename ValueT, typename PosT, template <typename> class Allocator>
00550 void StringVector<ValueT, PosT, Allocator>::RangeIterator::advance(PosT n)
00551 {
00552 m_index += n;
00553 }
00554
00555 template<typename ValueT, typename PosT, template <typename> class Allocator>
00556 PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::distance_to(
00557 StringVector<ValueT, PosT, Allocator>::RangeIterator const& other) const
00558 {
00559 return other.m_index - m_index;
00560 }
00561
00562
00563
00564 template<typename ValueT, typename PosT, template <typename> class Allocator>
00565 StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator()
00566 : m_index(0), m_container(0) { }
00567
00568 template<typename ValueT, typename PosT, template <typename> class Allocator>
00569 StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator(
00570 StringVector<ValueT, PosT, Allocator> &sv, PosT index) : m_index(index),
00571 m_container(&sv) { }
00572
00573 template<typename ValueT, typename PosT, template <typename> class Allocator>
00574 PosT StringVector<ValueT, PosT, Allocator>::StringIterator::get_index()
00575 {
00576 return m_index;
00577 }
00578
00579 template<typename ValueT, typename PosT, template <typename> class Allocator>
00580 const std::string StringVector<ValueT, PosT, Allocator>::StringIterator::dereference() const
00581 {
00582 return StringVector<ValueT, PosT, Allocator>::range(m_container->begin(m_index),
00583 m_container->end(m_index)).str();
00584 }
00585
00586 template<typename ValueT, typename PosT, template <typename> class Allocator>
00587 bool StringVector<ValueT, PosT, Allocator>::StringIterator::equal(
00588 StringVector<ValueT, PosT, Allocator>::StringIterator const& other) const
00589 {
00590 return m_index == other.m_index && m_container == other.m_container;
00591 }
00592
00593 template<typename ValueT, typename PosT, template <typename> class Allocator>
00594 void StringVector<ValueT, PosT, Allocator>::StringIterator::increment()
00595 {
00596 m_index++;
00597 }
00598
00599 template<typename ValueT, typename PosT, template <typename> class Allocator>
00600 void StringVector<ValueT, PosT, Allocator>::StringIterator::decrement()
00601 {
00602 m_index--;
00603 }
00604
00605 template<typename ValueT, typename PosT, template <typename> class Allocator>
00606 void StringVector<ValueT, PosT, Allocator>::StringIterator::advance(PosT n)
00607 {
00608 m_index += n;
00609 }
00610
00611 template<typename ValueT, typename PosT, template <typename> class Allocator>
00612 PosT StringVector<ValueT, PosT, Allocator>::StringIterator::distance_to(
00613 StringVector<ValueT, PosT, Allocator>::StringIterator const& other) const
00614 {
00615 return other.m_index - m_index;
00616 }
00617
00618
00619
00620 typedef StringVector<unsigned char, unsigned int> MediumStringVector;
00621 typedef StringVector<unsigned char, unsigned long> LongStringVector;
00622
00623 }
00624
00625 #endif