00001
00002
00003 #pragma once
00004
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <queue>
#include <string>
#include <vector>
00013
00014 #ifdef WITH_THREADS
00015 #include <boost/thread/shared_mutex.hpp>
00016 #endif
00017
namespace tmmt
{
/// A word in its string form.
typedef std::string WORD;
/// Numeric identifier of a word; GetWord() uses it as an index into
/// Vocabulary::vocab.
typedef unsigned int WORD_ID;

/**
 * Holds a string-to-id map (`lookup`) together with an id-indexed list of
 * words (`vocab`).
 *
 * Only GetWord() is defined in this header; StoreIfNew(), GetWordID() and
 * Tokenize() are declared here and implemented elsewhere.
 * NOTE(review): presumably those functions keep `lookup` and `vocab` in sync
 * (lookup[vocab[i]] == i) -- confirm against their definitions.
 */
class Vocabulary
{
public:
  /// Maps a word to its id.
  std::map<WORD, WORD_ID> lookup;
  /// Words indexed by WORD_ID (see GetWord()).
  std::vector<WORD> vocab;

  /// Declared only; judging by the name, returns the id of the given word,
  /// adding the word first if it is unknown -- confirm in the definition.
  WORD_ID StoreIfNew( const WORD& );

  /// Declared only; judging by the name, returns the id of the given word.
  /// Behaviour for unknown words is not visible from this header.
  WORD_ID GetWordID( const WORD& );

  /// Declared only; takes a C string and returns a sequence of word ids.
  /// The splitting rules are not visible from this header.
  std::vector<WORD_ID> Tokenize( const char[] );

  /**
   * Returns the word stored at position `id` of `vocab`.
   *
   * The function is const but deliberately hands back a mutable reference
   * into `vocab`, so existing callers that modify the word keep working.
   * The const-ness is removed with an explicit const_cast so the intent is
   * visible and greppable.
   *
   * @param id index into `vocab`; must be smaller than vocab.size()
   *           (checked by assert in debug builds only).
   * @return reference to the stored word; it is invalidated by any operation
   *         that reallocates `vocab`.
   */
  inline WORD &GetWord( WORD_ID id ) const {
    assert( id < vocab.size() );
    return const_cast<WORD&>( vocab[ id ] );
  }

protected:
#ifdef WITH_THREADS
  /// Reader/writer lock, only present in WITH_THREADS builds. It is mutable
  /// so const member functions can acquire it. Nothing in this header takes
  /// the lock; GetWord() in particular is not synchronized.
  mutable boost::shared_mutex m_accessLock;
#endif
};

}
00046