00001 #include <iostream>
00002 #include <cstdlib>
00003 #include <boost/program_options.hpp>
00004
00005 #include "Main.h"
00006 #include "InputFileStream.h"
00007 #include "OutputFileStream.h"
00008 #include "AlignedSentence.h"
00009 #include "AlignedSentenceSyntax.h"
00010 #include "Parameter.h"
00011 #include "Rules.h"
00012
00013 using namespace std;
00014
00015 bool g_debug = false;
00016
00017 int main(int argc, char** argv)
00018 {
00019 cerr << "Starting" << endl;
00020
00021 Parameter params;
00022
00023 namespace po = boost::program_options;
00024 po::options_description desc("Options");
00025 desc.add_options()
00026 ("help", "Print help messages")
00027 ("MaxSpan", po::value<int>()->default_value(params.maxSpan), "Max (source) span of a rule. ie. number of words in the source")
00028 ("MinSpan", po::value<int>()->default_value(params.minSpan), "Min (source) span of a rule.")
00029 ("GlueGrammar", po::value<string>()->default_value(params.gluePath), "Output glue grammar to here")
00030 ("SentenceOffset", po::value<long>()->default_value(params.sentenceOffset), "Starting sentence id. Not used")
00031 ("GZOutput", "Compress extract files")
00032 ("MaxNonTerm", po::value<int>()->default_value(params.maxNonTerm), "Maximum number of non-terms allowed per rule")
00033 ("MaxHieroNonTerm", po::value<int>()->default_value(params.maxHieroNonTerm), "Maximum number of Hiero non-term. Usually, --MaxNonTerm is the normal constraint")
00034 ("MinHoleSource", po::value<int>()->default_value(params.minHoleSource), "Minimum source span for a non-term.")
00035 ("MinHoleSourceSyntax", po::value<int>()->default_value(params.minHoleSourceSyntax), "Minimum source span for a syntactic non-term (source or target).")
00036
00037 ("SourceSyntax", "Source sentence is a parse tree")
00038 ("TargetSyntax", "Target sentence is a parse tree")
00039 ("MixedSyntaxType", po::value<int>()->default_value(params.mixedSyntaxType), "Hieu's Mixed syntax type. 0(default)=no mixed syntax, 1=add [X] only if no syntactic label. 2=add [X] everywhere")
00040 ("MultiLabel", po::value<int>()->default_value(params.multiLabel), "What to do with multiple labels on the same span. 0(default)=keep them all, 1=keep only top-most, 2=keep only bottom-most")
00041 ("HieroSourceLHS", "Always use Hiero source LHS? Default = 0")
00042 ("MaxSpanFreeNonTermSource", po::value<int>()->default_value(params.maxSpanFreeNonTermSource), "Max number of words covered by beginning/end NT. Default = 0 (no limit)")
00043 ("NoNieceTerminal", "Don't extract rule if 1 of the non-term covers the same word as 1 of the terminals")
00044 ("MaxScope", po::value<int>()->default_value(params.maxScope), "maximum scope (see Hopkins and Langmead (2010)). Default is HIGH")
00045 ("MinScope", po::value<int>()->default_value(params.minScope), "min scope.")
00046
00047 ("SpanLength", "Property - span length of each LHS non-term")
00048 ("RuleLength", "Property - length of entire rule. Only for rules with NTs")
00049
00050 ("NonTermContext", "Property - (source) left and right, inside and outside words of each non-term ")
00051 ("NonTermContextTarget", "Property - (target) left and right, inside and outside words of each non-term")
00052 ("NonTermContextFactor", po::value<int>()->default_value(params.nonTermContextFactor), "Factor to use for non-term context property.")
00053
00054 ("NumSourceFactors", po::value<int>()->default_value(params.numSourceFactors), "Number of source factors.")
00055 ("NumTargetFactors", po::value<int>()->default_value(params.numTargetFactors), "Number of target factors.")
00056
00057 ("HieroNonTerm", po::value<string>()->default_value(params.hieroNonTerm), "Hiero non-terminal label, including bracket")
00058 ("ScopeSpan", po::value<string>()->default_value(params.scopeSpanStr), "Min and max span for rules of each scope. Format is min,max:min,max...")
00059
00060 ("NonTermConsecSource", "Allow consecutive non-terms on the source side")
00061 ("NonTermConsecSourceMixedSyntax", po::value<int>()->default_value(params.nonTermConsecSourceMixedSyntax), "In mixed syntax mode, what nt can be consecutive. 0=don't allow consec nt. 1(default)=hiero+syntax. 2=syntax+syntax. 3=always allow");
00062
00063
00064 po::variables_map vm;
00065 try {
00066 po::store(po::parse_command_line(argc, argv, desc),
00067 vm);
00068
00071 if ( vm.count("help") || argc < 5 ) {
00072 std::cout << argv[0] << " target source alignment [options...]" << std::endl
00073 << desc << std::endl;
00074 return EXIT_SUCCESS;
00075 }
00076
00077 po::notify(vm);
00078
00079 } catch(po::error& e) {
00080 std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
00081 std::cerr << desc << std::endl;
00082 return EXIT_FAILURE;
00083 }
00084
00085 if (vm.count("MaxSpan")) params.maxSpan = vm["MaxSpan"].as<int>();
00086 if (vm.count("MinSpan")) params.minSpan = vm["MinSpan"].as<int>();
00087 if (vm.count("GZOutput")) params.gzOutput = true;
00088 if (vm.count("GlueGrammar")) params.gluePath = vm["GlueGrammar"].as<string>();
00089 if (vm.count("SentenceOffset")) params.sentenceOffset = vm["SentenceOffset"].as<long>();
00090 if (vm.count("MaxNonTerm")) params.maxNonTerm = vm["MaxNonTerm"].as<int>();
00091 if (vm.count("MaxHieroNonTerm")) params.maxHieroNonTerm = vm["MaxHieroNonTerm"].as<int>();
00092 if (vm.count("MinHoleSource")) params.minHoleSource = vm["MinHoleSource"].as<int>();
00093 if (vm.count("MinHoleSourceSyntax")) params.minHoleSourceSyntax = vm["MinHoleSourceSyntax"].as<int>();
00094
00095 if (vm.count("SourceSyntax")) params.sourceSyntax = true;
00096 if (vm.count("TargetSyntax")) params.targetSyntax = true;
00097 if (vm.count("MixedSyntaxType")) params.mixedSyntaxType = vm["MixedSyntaxType"].as<int>();
00098 if (vm.count("MultiLabel")) params.multiLabel = vm["MultiLabel"].as<int>();
00099 if (vm.count("HieroSourceLHS")) params.hieroSourceLHS = true;
00100 if (vm.count("MaxSpanFreeNonTermSource")) params.maxSpanFreeNonTermSource = vm["MaxSpanFreeNonTermSource"].as<int>();
00101 if (vm.count("NoNieceTerminal")) params.nieceTerminal = false;
00102 if (vm.count("MaxScope")) params.maxScope = vm["MaxScope"].as<int>();
00103 if (vm.count("MinScope")) params.minScope = vm["MinScope"].as<int>();
00104
00105
00106 if (vm.count("SpanLength")) params.spanLength = true;
00107 if (vm.count("RuleLength")) params.ruleLength = true;
00108 if (vm.count("NonTermContext")) params.nonTermContext = true;
00109 if (vm.count("NonTermContextTarget")) params.nonTermContextTarget = true;
00110 if (vm.count("NonTermContextFactor")) params.nonTermContextFactor = vm["NonTermContextFactor"].as<int>();
00111
00112 if (vm.count("NumSourceFactors")) params.numSourceFactors = vm["NumSourceFactors"].as<int>();
00113 if (vm.count("NumTargetFactors")) params.numTargetFactors = vm["NumTargetFactors"].as<int>();
00114
00115 if (vm.count("HieroNonTerm")) params.hieroNonTerm = vm["HieroNonTerm"].as<string>();
00116 if (vm.count("ScopeSpan")) {
00117 params.SetScopeSpan(vm["ScopeSpan"].as<string>());
00118 }
00119
00120 if (vm.count("NonTermConsecSource")) params.nonTermConsecSource = true;
00121 if (vm.count("NonTermConsecSourceMixedSyntax")) params.nonTermConsecSourceMixedSyntax = vm["NonTermConsecSourceMixedSyntax"].as<int>();
00122
00123
00124
00125 string pathTarget = argv[1];
00126 string pathSource = argv[2];
00127 string pathAlignment = argv[3];
00128
00129 string pathExtract = argv[4];
00130 string pathExtractInv = pathExtract + ".inv";
00131 if (params.gzOutput) {
00132 pathExtract += ".gz";
00133 pathExtractInv += ".gz";
00134 }
00135
00136 Moses::InputFileStream strmTarget(pathTarget);
00137 Moses::InputFileStream strmSource(pathSource);
00138 Moses::InputFileStream strmAlignment(pathAlignment);
00139 Moses::OutputFileStream extractFile(pathExtract);
00140 Moses::OutputFileStream extractInvFile(pathExtractInv);
00141
00142
00143
00144 int lineNum = 1;
00145 string lineTarget, lineSource, lineAlignment;
00146 while (getline(strmTarget, lineTarget)) {
00147 if (lineNum % 10000 == 0) {
00148 cerr << lineNum << " ";
00149 }
00150
00151 if (!getline(strmSource, lineSource)) {
00152 throw "Couldn't read source";
00153 }
00154 if (!getline(strmAlignment, lineAlignment)) {
00155 throw "Couldn't read alignment";
00156 }
00157
00158
00159
00160
00161
00162
00163
00164 AlignedSentence *alignedSentence;
00165
00166 if (params.sourceSyntax || params.targetSyntax) {
00167 alignedSentence = new AlignedSentenceSyntax(lineNum, lineSource, lineTarget, lineAlignment);
00168 } else {
00169 alignedSentence = new AlignedSentence(lineNum, lineSource, lineTarget, lineAlignment);
00170 }
00171
00172 alignedSentence->Create(params);
00173
00174
00175 Rules rules(*alignedSentence);
00176 rules.Extend(params);
00177 rules.Consolidate(params);
00178
00179
00180 rules.Output(extractFile, true, params);
00181 rules.Output(extractInvFile, false, params);
00182
00183 delete alignedSentence;
00184
00185 ++lineNum;
00186 }
00187
00188 if (!params.gluePath.empty()) {
00189 Moses::OutputFileStream glueFile(params.gluePath);
00190 CreateGlueGrammar(glueFile);
00191 }
00192
00193 cerr << "Finished" << endl;
00194 }
00195
00196 void CreateGlueGrammar(Moses::OutputFileStream &glueFile)
00197 {
00198 glueFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl
00199 << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl
00200 << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
00201
00202 }