00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #pragma once
00021
00022 #include <map>
00023 #include <ostream>
00024 #include <set>
00025 #include <string>
00026 #include <vector>
00027
00028 #include "OutputFileStream.h"
00029 #include "SyntaxTree.h"
00030
00031 #include "syntax-common/tool.h"
00032
00033 namespace MosesTraining
00034 {
00035 namespace Syntax
00036 {
00037 namespace GHKM
00038 {
00039
00040 struct Options;
00041
00042 class ExtractGHKM : public Tool
00043 {
00044 public:
00045 ExtractGHKM() : Tool("extract-ghkm") {}
00046
00047 virtual int Main(int argc, char *argv[]);
00048
00049 private:
00050 void RecordTreeLabels(const SyntaxTree &, std::set<std::string> &);
00051 void CollectWordLabelCounts(SyntaxTree &,
00052 const Options &,
00053 std::map<std::string, int> &,
00054 std::map<std::string, std::string> &);
00055 void WriteUnknownWordLabel(const std::map<std::string, int> &,
00056 const std::map<std::string, std::string> &,
00057 const Options &,
00058 std::ostream &,
00059 bool writeCounts=false) const;
00060 void WriteUnknownWordSoftMatches(const std::set<std::string> &,
00061 std::ostream &) const;
00062 void WriteGlueGrammar(const std::set<std::string> &,
00063 const std::map<std::string, int> &,
00064 const std::map<std::string,size_t> &,
00065 const Options &,
00066 std::ostream &) const;
00067 void WriteSourceLabelSet(const std::map<std::string,size_t> &,
00068 std::ostream &) const;
00069 void StripBitParLabels(const std::set<std::string> &labelSet,
00070 const std::map<std::string, int> &topLabelSet,
00071 std::set<std::string> &outLabelSet,
00072 std::map<std::string, int> &outTopLabelSet) const;
00073
00074 std::vector<std::string> ReadTokens(const std::string &) const;
00075 std::vector<std::string> ReadTokens(const SyntaxTree &root) const;
00076
00077 void ProcessOptions(int, char *[], Options &) const;
00078 };
00079
00080 }
00081 }
00082 }