00001
00002 #include "Vocabulary.h"
00003 #ifdef WITH_THREADS
00004 #include <boost/thread/locks.hpp>
00005 #endif
00006
00007 using namespace std;
00008
00009 namespace tmmt
00010 {
00011
00012
00013 vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
00014 {
00015 vector< WORD_ID > token;
00016 bool betweenWords = true;
00017 int start=0;
00018 int i=0;
00019 for(; input[i] != '\0'; i++) {
00020 bool isSpace = (input[i] == ' ' || input[i] == '\t');
00021
00022 if (!isSpace && betweenWords) {
00023 start = i;
00024 betweenWords = false;
00025 } else if (isSpace && !betweenWords) {
00026 token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
00027 betweenWords = true;
00028 }
00029 }
00030 if (!betweenWords)
00031 token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
00032 return token;
00033 }
00034
00035 WORD_ID Vocabulary::StoreIfNew( const WORD& word )
00036 {
00037
00038 {
00039
00040 #ifdef WITH_THREADS
00041 boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
00042 #endif
00043 map<WORD, WORD_ID>::iterator i = lookup.find( word );
00044
00045 if( i != lookup.end() )
00046 return i->second;
00047 }
00048
00049 #ifdef WITH_THREADS
00050 boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
00051 #endif
00052 WORD_ID id = vocab.size();
00053 vocab.push_back( word );
00054 lookup[ word ] = id;
00055 return id;
00056 }
00057
00058 WORD_ID Vocabulary::GetWordID( const WORD &word )
00059 {
00060 #ifdef WITH_THREADS
00061 boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
00062 #endif
00063 map<WORD, WORD_ID>::iterator i = lookup.find( word );
00064 if( i == lookup.end() )
00065 return 0;
00066 WORD_ID w= (WORD_ID) i->second;
00067 return w;
00068 }
00069
00070 }
00071