Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h Source File

00001 //
00002 //  FuzzyMatchWrapper.h
00003 //  moses
00004 //
00005 //  Created by Hieu Hoang on 26/07/2012.
00006 //  Copyright 2012 __MyCompanyName__. All rights reserved.
00007 //
00008 
00009 #ifndef moses_FuzzyMatchWrapper_h
00010 #define moses_FuzzyMatchWrapper_h
00011 
00012 #ifdef WITH_THREADS
00013 #include <boost/thread/shared_mutex.hpp>
00014 #endif
00015 
00016 #include <fstream>
00017 #include <string>
00018 #include "SuffixArray.h"
00019 #include "Vocabulary.h"
00020 #include "Match.h"
00021 #include "moses/InputType.h"
00022 
00023 namespace tmmt
00024 {
// Forward declarations of the tmmt types that FuzzyMatchWrapper's member and
// method declarations refer to by name.
// NOTE(review): "Match.h" is already included by this header, so the Match
// forward declaration is presumably redundant - confirm.
00025 class Match;
00026 struct SentenceAlignment;
00027 
// FuzzyMatchWrapper: class interface declared in namespace tmmt.
// The public surface is the three-string constructor and Extract(); all other
// members are protected state and helper routines. Apart from the inline
// GetVocabulary(), the member functions are only declared here.
// NOTE(review): the class stores a raw tmmt::SuffixArray pointer and declares
// no destructor or copy operations - confirm where suffixArray is allocated
// and released.
// NOTE(review): std::vector, std::map and std::pair are used below, but
// <vector>, <map> and <utility> are not among this header's direct includes;
// presumably they arrive through the project headers - confirm.
00028 class FuzzyMatchWrapper
00029 {
00030 public:
        // Builds a wrapper from three strings. They are presumably file names
        // of the source, target and alignment data (cf. the load_* helpers,
        // which take a fileName) - TODO confirm against the definition.
00031   FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);
00032 
        // Returns a string for the given translationId and dirNameStr. What the
        // string denotes is not visible in this header - see the definition.
00033   std::string Extract(long translationId, const std::string &dirNameStr);
00034 
00035 protected:
00036   // tm-mt state
        // Nested vectors of SentenceAlignment; filled through the load_target /
        // load_alignment signatures below, which take the same container type.
00037   std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
        // Raw pointer, dereferenced without a null check in GetVocabulary().
00038   tmmt::SuffixArray *suffixArray;
        // Integer settings. Their values and exact meaning are assigned outside
        // this header. NOTE(review): the *_flag members look like on/off
        // switches stored as int - confirm.
00039   int basic_flag;
00040   int lsed_flag;
00041   int refined_flag;
00042   int length_filter_flag;
00043   int parse_flag;
00044   int min_match;
00045   int multiple_flag;
00046   int multiple_slack;
00047   int multiple_max;
00048 
        // Maps a WORD_ID to a list of ints (presumably positions of that word
        // in an input sentence - TODO confirm).
00049   typedef std::map< WORD_ID,std::vector< int > > WordIndex;
00050 
00051   // global cache for word pairs: (WORD_ID, WORD_ID) -> unsigned int
00052   std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
00053 #ifdef WITH_THREADS
00054   // reader-writer lock; only compiled in when WITH_THREADS is defined.
        // Declared mutable so that const member functions can lock it.
        // NOTE(review): presumably guards m_lsed - confirm in the definitions
        // of GetLSEDCache / SetLSEDCache.
00055   mutable boost::shared_mutex m_accessLock;
00056 #endif
00057 
        // Loaders: each takes a file name and a container passed by non-const
        // reference, which the loader can therefore modify.
00058   void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
00059   void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
00060   void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
00061 
        // NOTE(review): both arguments are taken by value, so every call copies
        // the complete vector-of-vectors - confirm that this is intended.
00063   void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
00064                           std::vector< std::vector< tmmt::WORD_ID > > input ) ;
00065 
        // Helper routines; only the signatures are visible here.
        // sed(): best_path is a non-const reference, presumably an output
        // describing the chosen edit path - TODO confirm.
00068   unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
00069   unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
00070   unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
        // Match-handling helpers. wordIndex and match are non-const references
        // (except in prune_matches, which takes match by const reference and
        // returns a new vector); parse_matches may also update best_cost, which
        // it receives by non-const reference.
00071   void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
00072   int short_match_max_length( int input_length );
00073   void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
00074   std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
00075   int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );
00076 
        // Takes an std::ofstream by reference; presumably writes the extract
        // for one sentence to outputFile - TODO confirm.
00077   void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string  &path, std::ofstream &outputFile);
00078 
00079   std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
        // Forwards to suffixArray->GetVocabulary(). suffixArray is dereferenced
        // unconditionally, so it must point to a valid object before this is
        // called.
00080   Vocabulary &GetVocabulary() {
00081     return suffixArray->GetVocabulary();
00082   }
00083 
        // Accessors for a cache keyed by a WORD_ID pair (presumably m_lsed -
        // confirm). GetLSEDCache is const and receives value by non-const
        // reference; presumably the bool result reports whether the key was
        // found and value is filled on a hit - TODO confirm.
00084   bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
00085   void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
00086 
00087 };
00088 
00089 }
00090 
00091 #endif