00001 #include "Vocabulary.h" 00002 00003 #pragma once 00004 00005 #define LINE_MAX_LENGTH 10000 00006 00007 namespace tmmt 00008 { 00009 00010 class SuffixArray 00011 { 00012 public: 00013 typedef unsigned int INDEX; 00014 00015 private: 00016 std::vector< std::vector< WORD_ID > > corpus; 00017 00018 WORD_ID *m_array; 00019 INDEX *m_index; 00020 INDEX *m_buffer; 00021 char *m_wordInSentence; 00022 size_t *m_sentence; 00023 char *m_sentenceLength; 00024 WORD_ID m_endOfSentence; 00025 Vocabulary m_vcb; 00026 INDEX m_size; 00027 00028 public: 00029 SuffixArray( std::string fileName ); 00030 ~SuffixArray(); 00031 00032 void Sort(INDEX start, INDEX end); 00033 int CompareIndex( INDEX a, INDEX b ) const; 00034 inline int CompareWord( WORD_ID a, WORD_ID b ) const; 00035 int Count( const std::vector< WORD > &phrase ); 00036 bool MinCount( const std::vector< WORD > &phrase, INDEX min ); 00037 bool Exists( const std::vector< WORD > &phrase ); 00038 int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); 00039 int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); 00040 INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); 00041 INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); 00042 int Match( const std::vector< WORD > &phrase, INDEX index ); 00043 void List( INDEX start, INDEX end ); 00044 inline INDEX GetPosition( INDEX index ) { 00045 return m_index[ index ]; 00046 } 00047 inline size_t GetSentence( INDEX position ) { 00048 return m_sentence[position]; 00049 } 00050 inline char GetWordInSentence( INDEX position ) { 00051 return m_wordInSentence[position]; 00052 } 00053 inline char GetSentenceLength( size_t sentenceId ) { 00054 return m_sentenceLength[sentenceId]; 00055 } 00056 inline INDEX GetSize() { 00057 return m_size; 00058 } 00059 00060 Vocabulary &GetVocabulary() { 00061 return m_vcb; 00062 } 00063 const std::vector< std::vector< WORD_ID > > &GetCorpus() const { 00064 return corpus; 00065 } 00066 }; 00067 00068 } 00069