00001 #include "SuffixArray.h"
00002 #include <string>
00003 #include <stdlib.h>
00004 #include <cstring>
00005
00006 using namespace std;
00007
00008 namespace tmmt
00009 {
00010
00011 SuffixArray::SuffixArray( string fileName )
00012 {
00013 m_vcb.StoreIfNew( "<uNk>" );
00014 m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
00015
00016 ifstream extractFile;
00017
00018
00019 extractFile.open(fileName.c_str());
00020 istream *fileP = &extractFile;
00021 m_size = 0;
00022 size_t sentenceCount = 0;
00023 string line;
00024 while(getline(*fileP, line)) {
00025
00026 vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
00027 m_size += words.size() + 1;
00028 sentenceCount++;
00029 }
00030 extractFile.close();
00031 cerr << m_size << " words (incl. sentence boundaries)" << endl;
00032
00033
00034 m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
00035 m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
00036 m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
00037 m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
00038 m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
00039
00040
00041 int wordIndex = 0;
00042 int sentenceId = 0;
00043 extractFile.open(fileName.c_str());
00044 fileP = &extractFile;
00045 while(getline(*fileP, line)) {
00046 vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
00047
00048
00049 corpus.push_back(words);
00050
00051
00052
00053 vector< WORD_ID >::const_iterator i;
00054 for( i=words.begin(); i!=words.end(); i++) {
00055 m_index[ wordIndex ] = wordIndex;
00056 m_sentence[ wordIndex ] = sentenceId;
00057 m_wordInSentence[ wordIndex ] = i-words.begin();
00058 m_array[ wordIndex++ ] = *i;
00059 }
00060 m_index[ wordIndex ] = wordIndex;
00061 m_array[ wordIndex++ ] = m_endOfSentence;
00062 m_sentenceLength[ sentenceId++ ] = words.size();
00063 }
00064 extractFile.close();
00065 cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
00066
00067
00068
00069 m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
00070 Sort( 0, m_size-1 );
00071 free( m_buffer );
00072 cerr << "done sorting" << endl;
00073 }
00074
00075
00076 void SuffixArray::Sort(INDEX start, INDEX end)
00077 {
00078 if (start == end) return;
00079 INDEX mid = (start+end+1)/2;
00080 Sort( start, mid-1 );
00081 Sort( mid, end );
00082
00083
00084 size_t i = start;
00085 size_t j = mid;
00086 size_t k = 0;
00087 size_t length = end-start+1;
00088 while( k<length ) {
00089 if (i == mid ) {
00090 m_buffer[ k++ ] = m_index[ j++ ];
00091 } else if (j > end ) {
00092 m_buffer[ k++ ] = m_index[ i++ ];
00093 } else {
00094 if (CompareIndex( m_index[i], m_index[j] ) < 0) {
00095 m_buffer[ k++ ] = m_index[ i++ ];
00096 } else {
00097 m_buffer[ k++ ] = m_index[ j++ ];
00098 }
00099 }
00100 }
00101
00102 memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
00103 ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
00104 }
00105
00106 SuffixArray::~SuffixArray()
00107 {
00108 free(m_index);
00109 free(m_array);
00110 }
00111
00112 int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
00113 {
00114
00115 INDEX offset = 0;
00116 while( a+offset < m_size &&
00117 b+offset < m_size &&
00118 m_array[ a+offset ] == m_array[ b+offset ] ) {
00119 offset++;
00120 }
00121
00122 if( a+offset == m_size ) return -1;
00123 if( b+offset == m_size ) return 1;
00124 return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
00125 }
00126
00127 inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
00128 {
00129
00130 return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
00131 }
00132
00133 int SuffixArray::Count( const vector< WORD > &phrase )
00134 {
00135 INDEX dummy;
00136 return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
00137 }
00138
00139 bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
00140 {
00141 INDEX dummy;
00142 return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
00143 }
00144
00145 bool SuffixArray::Exists( const vector< WORD > &phrase )
00146 {
00147 INDEX dummy;
00148 return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
00149 }
00150
00151 int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
00152 {
00153 return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
00154 }
00155
00156 int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
00157 {
00158
00159 INDEX start = search_start;
00160 INDEX end = (search_end == -1) ? (m_size-1) : search_end;
00161 INDEX mid = FindFirst( phrase, start, end );
00162
00163 if (mid == m_size) return 0;
00164 if (min == 1) return 1;
00165
00166 int matchCount = 1;
00167
00168
00169 firstMatch = FindLast( phrase, mid, start, -1 );
00170 matchCount += mid - firstMatch;
00171
00172
00173 lastMatch = FindLast( phrase, mid, end, 1 );
00174 matchCount += lastMatch - mid;
00175
00176 return matchCount;
00177 }
00178
00179 SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
00180 {
00181 end += direction;
00182 while(true) {
00183 INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
00184
00185 int match = Match( phrase, mid );
00186 int matchNext = Match( phrase, mid+direction );
00187
00188
00189 if (match == 0 && matchNext != 0) return mid;
00190
00191 if (match == 0)
00192 start = mid;
00193 else
00194 end = mid;
00195 }
00196 }
00197
00198 SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
00199 {
00200 while(true) {
00201 INDEX mid = ( start + end + 1 )/2;
00202
00203 int match = Match( phrase, mid );
00204
00205 if (match == 0) return mid;
00206 if (start >= end && match != 0 ) return m_size;
00207
00208 if (match > 0)
00209 start = mid+1;
00210 else
00211 end = mid-1;
00212 }
00213 }
00214
00215 int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
00216 {
00217 INDEX pos = m_index[ index ];
00218 for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {
00219 int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
00220
00221 if (match != 0)
00222 return match;
00223 }
00224 return 0;
00225 }
00226
00227 void SuffixArray::List(INDEX start, INDEX end)
00228 {
00229 for(INDEX i=start; i<=end; i++) {
00230 INDEX pos = m_index[ i ];
00231
00232 for(int j=0; j<5 && j+pos<m_size; j++) {
00233
00234 }
00235
00236 }
00237 }
00238
00239 }
00240