00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 #ifndef moses_LexicalReorderingTableCreator_h
00023 #define moses_LexicalReorderingTableCreator_h
00024 
00025 #include "PhraseTableCreator.h"
00026 
00027 namespace Moses
00028 {
00029 
00030 class LexicalReorderingTableCreator
00031 {
00032 private:
00033   std::string m_inPath;
00034   std::string m_outPath;
00035   std::string m_tempfilePath;
00036 
00037   std::FILE* m_outFile;
00038 
00039   size_t m_orderBits;
00040   size_t m_fingerPrintBits;
00041 
00042   size_t m_numScoreComponent;
00043 
00044   bool m_multipleScoreTrees;
00045   bool m_quantize;
00046 
00047   std::string m_separator;
00048 
00049   BlockHashIndex m_hash;
00050 
00051   typedef Counter<float> ScoreCounter;
00052   typedef CanonicalHuffman<float> ScoreTree;
00053 
00054   std::vector<ScoreCounter*> m_scoreCounters;
00055   std::vector<ScoreTree*> m_scoreTrees;
00056 
00057   StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
00058   StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
00059 
00060   std::priority_queue<PackedItem> m_queue;
00061   long m_lastFlushedLine;
00062   long m_lastFlushedSourceNum;
00063   std::string m_lastFlushedSourcePhrase;
00064   std::vector<std::string> m_lastRange;
00065 
00066 #ifdef WITH_THREADS
00067   size_t m_threads;
00068 #endif
00069 
00070   void PrintInfo();
00071 
00072   void EncodeScores();
00073   void CalcHuffmanCodes();
00074   void CompressScores();
00075   void Save();
00076 
00077   std::string MakeSourceTargetKey(std::string&, std::string&);
00078 
00079   std::string EncodeLine(std::vector<std::string>& tokens);
00080   void AddEncodedLine(PackedItem& pi);
00081   void FlushEncodedQueue(bool force = false);
00082 
00083   std::string CompressEncodedScores(std::string &encodedScores);
00084   void AddCompressedScores(PackedItem& pi);
00085   void FlushCompressedQueue(bool force = false);
00086 
00087 public:
00088   LexicalReorderingTableCreator(std::string inPath,
00089                                 std::string outPath,
00090                                 std::string tempfilePath,
00091                                 size_t orderBits = 10,
00092                                 size_t fingerPrintBits = 16,
00093                                 bool multipleScoreTrees = true,
00094                                 size_t quantize = 0
00095 #ifdef WITH_THREADS
00096                                     , size_t threads = 2
00097 #endif
00098                                );
00099 
00100   ~LexicalReorderingTableCreator();
00101 
00102   friend class EncodingTaskReordering;
00103   friend class CompressionTaskReordering;
00104 };
00105 
00106 class EncodingTaskReordering
00107 {
00108 private:
00109 #ifdef WITH_THREADS
00110   static boost::mutex m_mutex;
00111   static boost::mutex m_fileMutex;
00112 #endif
00113   static size_t m_lineNum;
00114   static size_t m_sourcePhraseNum;
00115   static std::string m_lastSourcePhrase;
00116 
00117   InputFileStream& m_inFile;
00118   LexicalReorderingTableCreator& m_creator;
00119 
00120 public:
00121   EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
00122   void operator()();
00123 };
00124 
00125 class CompressionTaskReordering
00126 {
00127 private:
00128 #ifdef WITH_THREADS
00129   static boost::mutex m_mutex;
00130 #endif
00131   static size_t m_scoresNum;
00132   StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
00133   LexicalReorderingTableCreator &m_creator;
00134 
00135 public:
00136   CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
00137                             m_encodedScores, LexicalReorderingTableCreator& creator);
00138   void operator()();
00139 };
00140 
00141 }
00142 
00143 #endif