00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef moses_BlockHashIndex_h
00023 #define moses_BlockHashIndex_h
00024
00025 #include <iostream>
00026 #include <sstream>
00027 #include <string>
00028 #include <vector>
00029 #include <queue>
00030 #include <cstring>
00031 #include <cstdio>
00032
00033 #include "MurmurHash3.h"
00034 #include "StringVector.h"
00035 #include "PackedArray.h"
00036 #include "util/exception.hh"
00037 #include "util/string_stream.hh"
00038
00039 #ifdef WITH_THREADS
00040 #include "moses/ThreadPool.h"
00041 #else
00042 #include <ctime>
00043 #endif
00044
00045 #include <boost/shared_ptr.hpp>
00046
00047 namespace Moses
00048 {
00049
00050 class BlockHashIndex
00051 {
00052 private:
00053 std::priority_queue<int> m_queue;
00054
00055 size_t m_orderBits;
00056 size_t m_fingerPrintBits;
00057
00058 std::FILE* m_fileHandle;
00059 size_t m_fileHandleStart;
00060
00061 StringVector<unsigned char, unsigned long> m_landmarks;
00062
00063 std::vector<void*> m_hashes;
00064 std::vector<clock_t> m_clocks;
00065 std::vector<PairedPackedArray<>*> m_arrays;
00066
00067 std::vector<size_t> m_seekIndex;
00068
00069 size_t m_size;
00070 int m_lastSaved;
00071 int m_lastDropped;
00072 size_t m_numLoadedRanges;
00073
00074 #ifdef WITH_THREADS
00075 ThreadPool m_threadPool;
00076 boost::mutex m_mutex;
00077
00078 template <typename Keys>
00079 class HashTask : public Task
00080 {
00081 public:
00082 HashTask(int id, BlockHashIndex& hash, Keys& keys)
00083 : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
00084
00085 virtual void Run() {
00086 m_hash.CalcHash(m_id, *m_keys);
00087 }
00088
00089 virtual ~HashTask() {
00090 delete m_keys;
00091 }
00092
00093 private:
00094 int m_id;
00095 BlockHashIndex& m_hash;
00096 Keys* m_keys;
00097 };
00098 #endif
00099
00100 size_t GetFprint(const char* key) const;
00101 size_t GetHash(size_t i, const char* key);
00102
00103 public:
00104 #ifdef WITH_THREADS
00105 BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
00106 size_t threadsNum = 2);
00107 #else
00108 BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
00109 #endif
00110
00111 ~BlockHashIndex();
00112
00113 size_t GetHash(const char* key);
00114 size_t GetHash(std::string key);
00115
00116 size_t operator[](std::string key);
00117 size_t operator[](char* key);
00118
00119 void BeginSave(std::FILE* mphf);
00120 void SaveRange(size_t i);
00121 void SaveLastRange();
00122 size_t FinalizeSave();
00123
00124 #ifdef WITH_THREADS
00125 void WaitAll();
00126 #endif
00127
00128 void DropRange(size_t i);
00129 void DropLastRange();
00130
00131 size_t LoadIndex(std::FILE* mphf);
00132 void LoadRange(size_t i);
00133
00134 size_t Save(std::string filename);
00135 size_t Save(std::FILE * mphf);
00136
00137 size_t Load(std::string filename);
00138 size_t Load(std::FILE * mphf);
00139
00140 size_t GetSize() const;
00141
00142 void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
00143
00144 template <typename Keys>
00145 void AddRange(Keys &keys) {
00146 size_t current = m_landmarks.size();
00147
00148 if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) {
00149 util::StringStream strme;
00150 strme << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n";
00151 strme << "1: " << m_landmarks.back().str() << "\n";
00152 strme << "2: " << keys[0] << "\n";
00153 UTIL_THROW2(strme.str());
00154 }
00155
00156 m_landmarks.push_back(keys[0]);
00157 m_size += keys.size();
00158
00159 if(keys.size() == 1) {
00160
00161 keys.push_back("###DUMMY_KEY###");
00162 }
00163
00164 #ifdef WITH_THREADS
00165
00166 boost::shared_ptr<HashTask<Keys> >
00167 ht(new HashTask<Keys>(current, *this, keys));
00168 m_threadPool.Submit(ht);
00169 #else
00170 CalcHash(current, keys);
00171 #endif
00172 }
00173
00174 template <typename Keys>
00175 void CalcHash(size_t current, Keys &keys) {
00176 #ifdef HAVE_CMPH
00177 void* source = vectorAdapter(keys);
00178 CalcHash(current, source);
00179 #endif
00180 }
00181
00182 void CalcHash(size_t current, void* source);
00183
00184 #ifdef HAVE_CMPH
00185 void* vectorAdapter(std::vector<std::string>& v);
00186 void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
00187 void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
00188 #endif
00189 };
00190
00191 }
00192 #endif