00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <cstdio>
00023 #include <cstdlib>
00024 #include <cstring>
00025 #include <climits>
00026 #include <sys/types.h>
00027 #include <unistd.h>
00028 #include <dirent.h>
00029
00030 #include <fstream>
00031 #include <string>
00032 #include <iterator>
00033 #include <algorithm>
00034 #include "Loader.h"
00035 #include "LoaderFactory.h"
00036 #include "PhraseDictionaryFuzzyMatch.h"
00037 #include "moses/FactorCollection.h"
00038 #include "moses/Word.h"
00039 #include "moses/Util.h"
00040 #include "moses/InputFileStream.h"
00041 #include "moses/StaticData.h"
00042 #include "moses/Range.h"
00043 #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"
00044 #include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h"
00045 #include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
00046 #include "moses/TranslationTask.h"
00047 #include "util/file.hh"
00048 #include "util/exception.hh"
00049 #include "util/random.hh"
00050
00051 using namespace std;
00052
00053 #if defined __MINGW32__ && !defined mkdtemp
00054 #include <windows.h>
00055 #include <cerrno>
00056 char *mkdtemp(char *tempbuf)
00057 {
00058 int rand_value = 0;
00059 char* tempbase = NULL;
00060 char tempbasebuf[MAX_PATH] = "";
00061
00062 if (strcmp(&tempbuf[strlen(tempbuf)-6], "XXXXXX")) {
00063 errno = EINVAL;
00064 return NULL;
00065 }
00066
00067 util::rand_init();
00068 rand_value = util::rand_excl(1e6);
00069 tempbase = strrchr(tempbuf, '/');
00070 tempbase = tempbase ? tempbase+1 : tempbuf;
00071 strcpy(tempbasebuf, tempbase);
00072 sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value);
00073 ::GetTempPath(MAX_PATH, tempbuf);
00074 strcat(tempbuf, tempbasebuf);
00075 ::CreateDirectory(tempbuf, NULL);
00076 return tempbuf;
00077 }
00078 #endif
00079
00080 namespace Moses
00081 {
00082
00083 PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
00084 :PhraseDictionary(line, true)
00085 ,m_config(3)
00086 ,m_FuzzyMatchWrapper(NULL)
00087 {
00088 ReadParameters();
00089 }
00090
00091 PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch()
00092 {
00093 delete m_FuzzyMatchWrapper;
00094 }
00095
00096 void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts)
00097 {
00098 m_options = opts;
00099 SetFeaturesToApply();
00100
00101 m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
00102 }
00103
00104 ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
00105 const ChartParser &parser,
00106 const ChartCellCollectionBase &cellCollection,
00107 std::size_t )
00108 {
00109 return new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this);
00110 }
00111
00112 void
00113 PhraseDictionaryFuzzyMatch::
00114 SetParameter(const std::string& key, const std::string& value)
00115 {
00116 if (key == "source") {
00117 m_config[0] = value;
00118 } else if (key == "target") {
00119 m_config[1] = value;
00120 } else if (key == "alignment") {
00121 m_config[2] = value;
00122 } else {
00123 PhraseDictionary::SetParameter(key, value);
00124 }
00125 }
00126
/**
 * Delete a directory together with everything beneath it.
 *
 * Symbolic links are unlinked, never followed. Entries whose type the
 * filesystem does not report (DT_UNKNOWN) are first removed as a file or empty
 * directory and only descended into if that fails.
 *
 * @param dirname  path of the directory to delete.
 * @return 1 if the directory was removed, 0 otherwise (NULL path, directory
 *         could not be opened, or something inside could not be deleted).
 *         On MinGW nothing is deleted and 1 is returned.
 */
int removedirectoryrecursively(const char *dirname)
{
#if defined __MINGW32__
  // Not implemented for MinGW builds.
  (void) dirname;
  return 1;
#else
  if (dirname == NULL) {
    return 0;
  }

  DIR *dir = opendir(dirname);
  if (dir == NULL) {
    perror("Error opendir()");
    return 0;
  }

  struct dirent *entry;
  while ((entry = readdir(dir)) != NULL) {
    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
      continue;
    }

    // Build the child path in a std::string so that long names are never
    // truncated (a truncated path could name a different file).
    const std::string path = std::string(dirname) + "/" + entry->d_name;

    if (entry->d_type == DT_DIR) {
      // The recursive call removes the sub-directory itself as well.
      removedirectoryrecursively(path.c_str());
      continue;
    }

    // Files, links and empty directories go away with a single remove().
    if (remove(path.c_str()) == 0) {
      continue;
    }

    // Type not reported and remove() failed: it may be a non-empty directory.
    if (entry->d_type == DT_UNKNOWN) {
      removedirectoryrecursively(path.c_str());
    }
  }
  closedir(dir);

  // rmdir() only succeeds once the directory is empty, so its result tells
  // whether the whole tree was deleted.
  return rmdir(dirname) == 0 ? 1 : 0;
#endif
}
00176
00177 void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask)
00178 {
00179 InputType const& inputSentence = *ttask->GetSource();
00180 #if defined __MINGW32__
00181 char dirName[] = "moses.XXXXXX";
00182 #else
00183 char dirName[] = "/tmp/moses.XXXXXX";
00184 #endif // defined
00185 char *temp = mkdtemp(dirName);
00186 UTIL_THROW_IF2(temp == NULL,
00187 "Couldn't create temporary directory " << dirName);
00188
00189 string dirNameStr(dirName);
00190
00191 string inFileName(dirNameStr + "/in");
00192
00193 ofstream inFile(inFileName.c_str());
00194
00195 for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
00196 inFile << inputSentence.GetWord(i);
00197 }
00198 inFile << endl;
00199 inFile.close();
00200
00201 long translationId = inputSentence.GetTranslationId();
00202 string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
00203
00204
00205 PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
00206 FormatType format = MosesFormat;
00207
00208
00209 InputFileStream inStream(ptFileName);
00210
00211
00212 PrintUserTime("Start loading fuzzy-match phrase model");
00213
00214 const StaticData &staticData = StaticData::Instance();
00215
00216
00217 string lineOrig;
00218 size_t count = 0;
00219
00220 while(getline(inStream, lineOrig)) {
00221 const string *line;
00222 if (format == HieroFormat) {
00223 UTIL_THROW(util::Exception, "Cannot be Hiero format");
00224
00225 } else {
00226
00227 line = &lineOrig;
00228 }
00229
00230 vector<string> tokens;
00231 vector<float> scoreVector;
00232
00233 TokenizeMultiCharSeparator(tokens, *line , "|||" );
00234
00235 if (tokens.size() != 4 && tokens.size() != 5) {
00236 UTIL_THROW2("Syntax error at " << ptFileName << ":" << count);
00237 }
00238
00239 const string &sourcePhraseString = tokens[0]
00240 , &targetPhraseString = tokens[1]
00241 , &scoreString = tokens[2]
00242 , &alignString = tokens[3];
00243
00244 bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
00245 if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) {
00246 TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
00247 continue;
00248 }
00249
00250 Tokenize<float>(scoreVector, scoreString);
00251 const size_t numScoreComponents = GetNumScoreComponents();
00252 if (scoreVector.size() != numScoreComponents) {
00253 UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
00254 << numScoreComponents << ") of score components on line " << count);
00255 }
00256
00257 UTIL_THROW_IF2(scoreVector.size() != numScoreComponents,
00258 "Number of scores incorrectly specified");
00259
00260
00261
00262
00263 Word *sourceLHS;
00264 Word *targetLHS;
00265
00266
00267 Phrase sourcePhrase( 0);
00268 sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);
00269
00270
00271 TargetPhrase *targetPhrase = new TargetPhrase(this);
00272 targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);
00273
00274
00275 targetPhrase->SetAlignmentInfo(alignString);
00276 targetPhrase->SetTargetLHS(targetLHS);
00277
00278
00279
00280 std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
00281 std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
00282
00283 targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
00284 targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
00285
00286 TargetPhraseCollection::shared_ptr phraseColl
00287 = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase,
00288 *targetPhrase, sourceLHS);
00289 phraseColl->Add(targetPhrase);
00290
00291 count++;
00292
00293 if (format == HieroFormat) {
00294 delete line;
00295 } else {
00296
00297 }
00298
00299 }
00300
00301
00302 SortAndPrune(rootNode);
00303
00304
00305 }
00306
00307 TargetPhraseCollection::shared_ptr
00308 PhraseDictionaryFuzzyMatch::
00309 GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
00310 , const Phrase &source
00311 , const TargetPhrase &target
00312 , const Word *sourceLHS)
00313 {
00314 PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
00315 return currNode.GetTargetPhraseCollection();
00316 }
00317
00318 PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
00319 , const Phrase &source
00320 , const TargetPhrase &target
00321 , const Word *sourceLHS)
00322 {
00323 cerr << source << endl << target << endl;
00324 const size_t size = source.GetSize();
00325
00326 const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
00327 AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
00328
00329 PhraseDictionaryNodeMemory *currNode = &rootNode;
00330 for (size_t pos = 0 ; pos < size ; ++pos) {
00331 const Word& word = source.GetWord(pos);
00332
00333 if (word.IsNonTerminal()) {
00334
00335 const Word &sourceNonTerm = word;
00336
00337 UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
00338 "No alignment for non-term at position " << pos);
00339 UTIL_THROW_IF2(iterAlign->first != pos,
00340 "Alignment info incorrect at position " << pos);
00341
00342 size_t targetNonTermInd = iterAlign->second;
00343 ++iterAlign;
00344 const Word &targetNonTerm = target.GetWord(targetNonTermInd);
00345
00346 #if defined(UNLABELLED_SOURCE)
00347 currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
00348 #else
00349 currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
00350 #endif
00351 } else {
00352 currNode = currNode->GetOrCreateChild(word);
00353 }
00354
00355 UTIL_THROW_IF2(currNode == NULL,
00356 "Node not found at position " << pos);
00357
00358 }
00359
00360
00361
00362
00363 return *currNode;
00364 }
00365
00366 void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
00367 {
00368 if (GetTableLimit()) {
00369 rootNode.Sort(GetTableLimit());
00370 }
00371 }
00372
00373 void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
00374 {
00375 m_collection.erase(source.GetTranslationId());
00376 }
00377
00378 const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
00379 {
00380 std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(translationId);
00381 UTIL_THROW_IF2(iter == m_collection.end(),
00382 "Couldn't find root node for input: " << translationId);
00383 return iter->second;
00384 }
00385 PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
00386 {
00387 long transId = source.GetTranslationId();
00388 std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
00389 UTIL_THROW_IF2(iter == m_collection.end(),
00390 "Couldn't find root node for input: " << transId);
00391 return iter->second;
00392 }
00393
00394 TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
00395
00396
00397 ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
00398 {
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412
00413
00414 return out;
00415 }
00416
00417 }