00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdio.h>
00023 #include <cstdlib>
00024 #include <stdlib.h>
00025 #include <iostream>
00026 #include <stdexcept>
00027 #include <string>
00028 #include <cassert>
00029 #include "lmContainer.h"
00030 #include "lmtable.h"
00031 #include "lmmacro.h"
00032 #include "lmclass.h"
00033 #include "lmInterpolation.h"
00034
00035 using namespace std;
00036
/**
 * Report a fatal condition.
 *
 * Writes `message` followed by a newline to std::cerr and then throws a
 * std::runtime_error carrying the same text. This function never returns
 * normally.
 *
 * @param message  NUL-terminated description of the failure.
 * @throws std::runtime_error always.
 */
inline void error(const char* message)
{
  // Emit the text before throwing so it is visible even when the
  // exception is not caught by anyone.
  std::cerr << message;
  std::cerr << "\n";

  throw std::runtime_error(message);
}
00042
00043 lmContainer::lmContainer()
00044 {
00045 requiredMaxlev=1000;
00046 }
00047
00048 int lmContainer::getLanguageModelType(std::string filename)
00049 {
00050 fstream inp(filename.c_str(),ios::in|ios::binary);
00051
00052 if (!inp.good()) {
00053 std::cerr << "Failed to open " << filename << "!" << std::endl;
00054 exit(1);
00055 }
00056
00057 std::string header;
00058 inp >> header;
00059 inp.close();
00060
00061 VERBOSE(1,"LM header:|" << header << "|" << std::endl);
00062
00063 int type=_IRSTLM_LMUNKNOWN;
00064 VERBOSE(1,"type: " << type << std::endl);
00065 if (header == "lmminterpolation" || header == "LMINTERPOLATION") {
00066 type = _IRSTLM_LMINTERPOLATION;
00067 } else if (header == "lmmacro" || header == "LMMACRO") {
00068 type = _IRSTLM_LMMACRO;
00069 } else if (header == "lmclass" || header == "LMCLASS") {
00070 type = _IRSTLM_LMCLASS;
00071 } else {
00072 type = _IRSTLM_LMTABLE;
00073 }
00074 VERBOSE(1,"type: " << type << std::endl);
00075
00076 return type;
00077 };
00078
00079 lmContainer* lmContainer::CreateLanguageModel(const std::string infile, float nlf, float dlf)
00080 {
00081 int type = getLanguageModelType(infile);
00082 std::cerr << "Language Model Type of " << infile << " is " << type << std::endl;
00083
00084 return CreateLanguageModel(type, nlf, dlf);
00085 }
00086
00087 lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf)
00088 {
00089
00090 std::cerr << "Language Model Type is " << type << std::endl;
00091
00092 lmContainer* lm=NULL;
00093
00094 switch (type) {
00095
00096 case _IRSTLM_LMTABLE:
00097 lm = new lmtable(nlf, dlf);
00098 break;
00099
00100 case _IRSTLM_LMMACRO:
00101 lm = new lmmacro(nlf, dlf);
00102 break;
00103
00104 case _IRSTLM_LMCLASS:
00105 lm = new lmclass(nlf, dlf);
00106 break;
00107
00108 case _IRSTLM_LMINTERPOLATION:
00109 lm = new lmInterpolation(nlf, dlf);
00110 break;
00111
00112 }
00113
00114 if (lm == NULL) {
00115 std::cerr << "This language model type is unknown!" << std::endl;
00116 exit(1);
00117 }
00118
00119 lm->setLanguageModelType(type);
00120 return lm;
00121 }
00122
00123 bool lmContainer::filter(const string sfilter, lmContainer*& sublmC, const string skeepunigrams)
00124 {
00125 if (lmtype == _IRSTLM_LMTABLE) {
00126 sublmC = sublmC->CreateLanguageModel(lmtype,((lmtable*) this)->GetNgramcacheLoadFactor(),((lmtable*) this)->GetDictioanryLoadFactor());
00127
00128
00129 sublmC->is_inverted(is_inverted());
00130 sublmC->setMaxLoadedLevel(getMaxLoadedLevel());
00131 sublmC->maxlevel(maxlevel());
00132
00133 bool res=((lmtable*) this)->filter(sfilter, (lmtable*) sublmC, skeepunigrams);
00134
00135 return res;
00136 }
00137 return false;
00138 };
00139