00001
00002
00003
00004
00005
00006
00007 #include <string>
00008
00009 #include <unicode/stringpiece.h>
00010 #include <unicode/utypes.h>
00011 #include <unicode/unistr.h>
00012 #include <unicode/uchar.h>
00013 #include <unicode/utf8.h>
00014
00015 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00016 #include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
00017 #include <boost/unordered_map.hpp>
00018 #include "moses/TranslationModel/UG/mm/tpt_pickler.h"
00019 #include "moses/TranslationModel/UG/mm/ug_mm_2d_table.h"
00020
00021 using namespace std;
00022 using namespace ugdiss;
00023
00024 typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
00025
00026 class IBM1
00027 {
00028 public:
00029 table_t COOC;
00030 TokenIndex V1,V2;
00031
00032 void
00033 align(string const& s1, string const& s2, vector<int>& aln) const;
00034
00035 void
00036 align(vector<id_type> const& x1,
00037 vector<id_type> const& x2,
00038 vector<int>& aln) const;
00039
00040 void
00041 fill_amatrix(vector<id_type> const& x1,
00042 vector<id_type> const& x2,
00043 vector<vector<int> >& aln) const;
00044
00045 void
00046 open(string const base, string const L1, string const L2);
00047 };
00048
00049 void
00050 IBM1::
00051 open(string const base, string const L1, string const L2)
00052 {
00053 V1.open(base+L1+".tdx");
00054 V2.open(base+L2+".tdx");
00055 COOC.open(base+L1+"-"+L2+".lex");
00056 }
00057
00058 void
00059 IBM1::
00060 align(string const& s1, string const& s2, vector<int>& aln) const
00061 {
00062 vector<id_type> x1,x2;
00063 V1.fillIdSeq(s1,x1);
00064 V2.fillIdSeq(s2,x2);
00065 align(x1,x2,aln);
00066 }
00067
00068 static UnicodeString apos = UnicodeString::fromUTF8(StringPiece("'"));
00069
00070 string
00071 u(StringPiece str, size_t start, size_t stop)
00072 {
00073 string ret;
00074 UnicodeString::fromUTF8(str).tempSubString(start,stop).toUTF8String(ret);
00075 return ret;
00076 }
00077
00078 void
00079 IBM1::
00080 fill_amatrix(vector<id_type> const& x1,
00081 vector<id_type> const& x2,
00082 vector<vector<int> >& aln) const
00083 {
00084 aln.assign(x1.size(),vector<int>(x2.size()));
00085 for (size_t i = 0; i < x1.size(); ++i)
00086 for (size_t k = 0; k < x2.size(); ++k)
00087 aln[i][k] = COOC[x1[i]][x2[k]];
00088 #if 0
00089 cout << setw(10) << " ";
00090 for (size_t k = 0; k < x2.size(); ++k)
00091 cout << setw(7) << right << u(V2[x2[k]],0,6);
00092 cout << endl;
00093 for (size_t i = 0; i < x1.size(); ++i)
00094 {
00095 cout << setw(10) << u(V1[x1[i]],0,10);
00096 for (size_t k = 0; k < x2.size(); ++k)
00097 {
00098 if (aln[i][k] > 999999)
00099 cout << setw(7) << aln[i][k]/1000 << " K";
00100 else
00101 cout << setw(7) << aln[i][k];
00102 }
00103 cout << endl;
00104 }
00105 #endif
00106 }
00107
00108
00109 void
00110 IBM1::
00111 align(vector<id_type> const& x1,
00112 vector<id_type> const& x2,
00113 vector<int>& aln) const
00114 {
00115 vector<vector<int> > M;
00116
00117 vector<int> i1(x1.size(),0), max1(x1.size(),0);
00118 vector<int> i2(x2.size(),0), max2(x2.size(),0);
00119 aln.clear();
00120 for (size_t i = 0; i < i1.size(); ++i)
00121 {
00122 for (size_t k = 0; k < i2.size(); ++k)
00123 {
00124 int c = COOC[x1[i]][x2[k]];
00125 if (c > max1[i]) { i1[i] = k; max1[i] = c; }
00126 if (c >= max2[k]) { i2[k] = i; max2[k] = c; }
00127 }
00128 }
00129 for (size_t i = 0; i < i1.size(); ++i)
00130 {
00131 if (max1[i] && i2[i1[i]] == i)
00132 {
00133 aln.push_back(i);
00134 aln.push_back(i1[i]);
00135 }
00136 }
00137 }
00138
00139 int main(int argc, char* argv[])
00140 {
00141 IBM1 ibm1;
00142 ibm1.open(argv[1],argv[2],argv[3]);
00143 string line1,line2,sid;
00144 while (getline(cin,sid))
00145 {
00146 if (!getline(cin,line1)) assert(false);
00147 if (!getline(cin,line2)) assert(false);
00148 vector<int> a;
00149 vector<id_type> s1,s2;
00150 ibm1.V1.fillIdSeq(line1,s1);
00151 ibm1.V2.fillIdSeq(line2,s2);
00152 ibm1.align(s1,s2,a);
00153 cout << sid;
00154 for (size_t i = 0; i < a.size(); i += 2)
00155 cout << " " << a[i] << ":" << a[i+1] << ":unspec";
00156 cout << endl;
00157
00158
00159
00160
00161
00162 }
00163
00164 }