00001 // $Id: n_gram.h 3461 2010-08-27 10:17:34Z bertoldi $ 00002 00003 /****************************************************************************** 00004 IrstLM: IRST Language Model Toolkit 00005 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy 00006 00007 This library is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU Lesser General Public 00009 License as published by the Free Software Foundation; either 00010 version 2.1 of the License, or (at your option) any later version. 00011 00012 This library is distributed in the hope that it will be useful, 00013 but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 Lesser General Public License for more details. 00016 00017 You should have received a copy of the GNU Lesser General Public 00018 License along with this library; if not, write to the Free Software 00019 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 00021 ******************************************************************************/ 00022 00023 // n-gram tables 00024 // by M. Federico 00025 // Copyright Marcello Federico, ITC-irst, 1998 00026 00027 #ifndef MF_NGRAM_H 00028 #define MF_NGRAM_H 00029 00030 #include <fstream> 00031 #include <cassert> 00032 #include "dictionary.h" 00033 00034 #ifdef MYMAXNGRAM 00035 #define MAX_NGRAM MYMAXNGRAM 00036 #else 00037 #define MAX_NGRAM 20 00038 #endif 00039 00040 class dictionary; 00041 00042 //typedef int code; 00043 00044 class ngram 00045 { 00046 int word[MAX_NGRAM]; //encoded ngram 00047 public: 00048 dictionary *dict; // dictionary 00049 char* link; // ngram-tree pointer 00050 char* succlink; // pointer to the first successor 00051 int midx[MAX_NGRAM]; // ngram-tree scan pointer 00052 char* path[MAX_NGRAM]; // path in the ngram-trie 00053 float bowv[MAX_NGRAM]; // vector of bow found in the trie 00054 00055 int lev; // ngram-tree level 00056 int size; // ngram size 00057 long long freq; // ngram frequency or integer prob 00058 int succ; // number of successors 00059 float bow; // back-off weight 00060 float prob; // probability 00061 00062 unsigned char info; // ngram-tree info flags 00063 unsigned char pinfo; // ngram-tree parent info flags 00064 int isym; // last interruption symbol 00065 00066 ngram(dictionary* d,int sz=0); 00067 ngram(ngram& ng); 00068 00069 int *wordp() { // n-gram pointer 00070 return wordp(size); 00071 } 00072 int *wordp(int k) { // n-gram pointer 00073 return size>=k?&word[MAX_NGRAM-k]:0; 00074 } 00075 const int *wordp() const { // n-gram pointer 00076 return wordp(size); 00077 } 00078 const int *wordp(int k) const { // n-gram pointer 00079 return size>=k?&word[MAX_NGRAM-k]:0; 00080 } 00081 00082 00083 int containsWord(const char* s,int lev) { 00084 00085 int c=dict->encode(s); 00086 if (c == -1) return 0; 00087 00088 assert(lev <= size); 00089 for (int i=0; i<lev; i++) { 00090 if (*wordp(size-i)== c) return 1; 00091 } 00092 return 0; 00093 } 00094 00095 00096 void trans(const ngram& ng); 00097 void invert (const ngram& ng); 00098 void shift (); 00099 void shift (int sz); 00100 00101 friend std::ifstream& operator>> (std::ifstream& fi,ngram& ng); 00102 friend std::ofstream& operator<< (std::ofstream& fi,ngram& ng); 00103 friend std::istream& operator>> (std::istream& fi,ngram& ng); 00104 friend std::ostream& operator<< (std::ostream& fi,ngram& ng); 00105 00106 inline bool operator==(const ngram &compare) const { 00107 if ( size != compare.size || dict != compare.dict) 00108 return false; 00109 else 00110 for (int i=size; i>0; i--) 00111 if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) 00112 return false; 00113 return true; 00114 } 00115 00116 inline bool operator!=(const ngram &compare) const { 00117 if ( size != compare.size || dict != compare.dict) 00118 return true; 00119 else 00120 for (int i=size; i>0; i--) 00121 if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) 00122 return true; 00123 return false; 00124 } 00125 00126 00127 00128 inline int ckhisto(int sz) { 00129 00130 for (int i=sz; i>1; i--) 00131 if (*wordp(i)==dict->oovcode()) 00132 return 0; 00133 return 1; 00134 } 00135 00136 int pushc(int c); 00137 int pushc(int* codes, int sz); 00138 int pushw(const char* w); 00139 00140 //~ngram(); 00141 00142 00143 00144 }; 00145 00146 #endif 00147 00148 00149