00001 #pragma once
00002
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
00008
00009 namespace MosesTraining
00010 {
00011
00012 class WordCount
00013 {
00014 friend std::ostream& operator<<(std::ostream&, const WordCount&);
00015 public:
00016 float m_count;
00017
00018 std::map<const std::string*, WordCount> m_coll;
00019
00020 WordCount()
00021 :m_count(0) {
00022 }
00023
00024
00025
00026 WordCount(float count)
00027 :m_count(count) {
00028 }
00029
00030 void AddCount(float incr);
00031
00032 std::map<const std::string*, WordCount> &GetColl() {
00033 return m_coll;
00034 }
00035 const std::map<const std::string*, WordCount> &GetColl() const {
00036 return m_coll;
00037 }
00038
00039 const float GetCount() const {
00040 return m_count;
00041 }
00042
00043 };
00044
00045 class Vocab
00046 {
00047 std::set<std::string> m_coll;
00048 public:
00049 const std::string *GetOrAdd(const std::string &word);
00050 };
00051
00052 class ExtractLex
00053 {
00054 Vocab m_vocab;
00055 std::map<const std::string*, WordCount> m_collS2T, m_collT2S;
00056
00057 void Process(const std::string *target, const std::string *source);
00058 void Process(WordCount &wcIn, const std::string *out);
00059 void ProcessUnaligned(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource
00060 , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned);
00061
00062 void Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream);
00063
00064 public:
00065 void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign, size_t lineCount);
00066 void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S);
00067
00068 };
00069
00070 }