00001 #include "Vocabulary.h"
00002
00003 #pragma once
00004
// Upper bound on a line's length, in characters. Not referenced anywhere in
// this header; presumably consumed by the implementation file as a read-buffer
// size -- TODO confirm against the .cpp before changing the value.
#define LINE_MAX_LENGTH 10000
00006
00007 namespace tmmt
00008 {
00009
00010 class SuffixArray
00011 {
00012 public:
00013 typedef unsigned int INDEX;
00014
00015 private:
00016 std::vector< std::vector< WORD_ID > > corpus;
00017
00018 WORD_ID *m_array;
00019 INDEX *m_index;
00020 INDEX *m_buffer;
00021 char *m_wordInSentence;
00022 size_t *m_sentence;
00023 char *m_sentenceLength;
00024 WORD_ID m_endOfSentence;
00025 Vocabulary m_vcb;
00026 INDEX m_size;
00027
00028 public:
00029 SuffixArray( std::string fileName );
00030 ~SuffixArray();
00031
00032 void Sort(INDEX start, INDEX end);
00033 int CompareIndex( INDEX a, INDEX b ) const;
00034 inline int CompareWord( WORD_ID a, WORD_ID b ) const;
00035 int Count( const std::vector< WORD > &phrase );
00036 bool MinCount( const std::vector< WORD > &phrase, INDEX min );
00037 bool Exists( const std::vector< WORD > &phrase );
00038 int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
00039 int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
00040 INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
00041 INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
00042 int Match( const std::vector< WORD > &phrase, INDEX index );
00043 void List( INDEX start, INDEX end );
00044 inline INDEX GetPosition( INDEX index ) {
00045 return m_index[ index ];
00046 }
00047 inline size_t GetSentence( INDEX position ) {
00048 return m_sentence[position];
00049 }
00050 inline char GetWordInSentence( INDEX position ) {
00051 return m_wordInSentence[position];
00052 }
00053 inline char GetSentenceLength( size_t sentenceId ) {
00054 return m_sentenceLength[sentenceId];
00055 }
00056 inline INDEX GetSize() {
00057 return m_size;
00058 }
00059
00060 Vocabulary &GetVocabulary() {
00061 return m_vcb;
00062 }
00063 const std::vector< std::vector< WORD_ID > > &GetCorpus() const {
00064 return corpus;
00065 }
00066 };
00067
00068 }
00069