Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/UG/filter-pt.cc Source File

00001 // -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
00002 // significance filtering for phrase tables as described in
00003 // H. Johnson, et al. (2007) Improving Translation Quality
00004 // by Discarding Most of the Phrasetable. EMNLP 2007.
00005 // Implemented by Marcin Junczys-Dowmunt
00006 // recommended use: -l a+e -n <ttable-limit>
00007 #include <cstring> 
00008 #include <cassert>
00009 #include <cstdio>
00010 #include <cstdlib>
00011 #include <algorithm>
00012 #include <fstream>
00013 #include <sstream>
00014 
00015 #include <vector>
00016 #include <iostream>
00017 #include <set>
00018 
00019 #include <boost/thread/tss.hpp>
00020 #include <boost/thread.hpp> 
00021 #include <boost/unordered_map.hpp>
00022 #include <boost/program_options.hpp>
00023 #include <boost/shared_ptr.hpp>
00024 #include <boost/foreach.hpp>
00025 
00026 #ifdef WIN32
00027 #include "WIN32_functions.h"
00028 #else
00029 #include <unistd.h>
00030 #endif
00031 
00032 #include "mm/ug_bitext.h"
00033 
// constants

// Sentence-id sets with at least this many entries are stored in the phrase
// cache by lookup_phrase(); smaller sets are recomputed on every lookup.
// Increase this to reduce memory usage, reduce it for speed.
const size_t MINIMUM_SIZE_TO_KEEP = 10000;
// Field separator of the phrase table lines parsed by PTEntry.
const std::string SEPARATOR       = " ||| ";

// Placeholders stored in sig_filter_limit when "-l a+e" / "-l a-e" is given;
// main() replaces them by p_111 + 0.001 / p_111 - 0.001 once the corpus size
// (and therefore alpha) is known.
const double ALPHA_PLUS_EPS  = -1000.0;        // dummy value
const double ALPHA_MINUS_EPS = -2000.0;        // dummy value

// configuration params
int pfe_filter_limit = 0;               // 0 = don't filter anything based on P(f|e)
bool print_cooc_counts = false;         // add cooc counts to phrase table?
bool print_neglog_significance = false; // add -log(p) to phrase table?
double sig_filter_limit = 0;            // keep phrase pairs with -log(sig) > sig_filter_limit
//    higher = filter-more
bool pef_filter_only = false;           // only filter based on pef
bool hierarchical = false;              // input is a hierarchical rule table (option -x)

double p_111 = 0.0;                     // alpha: -log(fisher_exact(1,1,1)), set in main()
// Running statistics, updated by the filter threads and reported on stderr.
size_t pt_lines = 0;                    // phrase table lines processed so far
size_t nremoved_sigfilter = 0;          // entries dropped by the significance filter
size_t nremoved_pfefilter = 0;          // entries dropped by the pfe_filter_limit cut-off

// Shorthands for the sapt types used to query the training corpora.
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
typedef sapt::mmTtrack<Token> ttrack_t;
typedef sapt::mmTSA<Token> tsa_t;
typedef sapt::TokenIndex tind_t;

// Number of lines in the training corpus; set in main() and read by
// fisher_exact() as the total of the contingency table.
int num_lines;

// Locks used by filter_thread(): in_mutex guards reading from the input
// stream, out_mutex guards writing to the output stream, err_mutex guards
// the progress output on std::cerr.
boost::mutex in_mutex;
boost::mutex out_mutex;
boost::mutex err_mutex;

typedef size_t TextLenType;

// Shared handle to a list of sentence ids (filled by lookup_phrase()).
typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet;
00070 
00071 class Cache {
00072   typedef std::pair<SentIdSet, clock_t> ClockedSet;
00073   typedef boost::unordered_map<std::string, ClockedSet> ClockedMap;
00074   
00075   public:
00076     
00077     SentIdSet get(const std::string& phrase) {
00078       boost::shared_lock<boost::shared_mutex> lock(m_mutex);
00079       if(m_cont.count(phrase)) {
00080         ClockedSet& set = m_cont[phrase];
00081         set.second = clock();
00082         return set.first;
00083       }
00084       return SentIdSet( new SentIdSet::element_type() );
00085     }
00086     
00087     void put(const std::string& phrase, const SentIdSet set) {
00088       boost::unique_lock<boost::shared_mutex> lock(m_mutex);
00089       m_cont[phrase] = std::make_pair(set, clock());
00090     }
00091     
00092     static void set_max_cache(size_t max_cache) {
00093       s_max_cache = max_cache;
00094     }
00095     
00096     void prune() {
00097       if(s_max_cache > 0) {
00098         boost::upgrade_lock<boost::shared_mutex> lock(m_mutex);
00099         if(m_cont.size() > s_max_cache) {
00100           std::vector<clock_t> clocks;
00101           for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++) 
00102             clocks.push_back(it->second.second);
00103           
00104           std::sort(clocks.begin(), clocks.end());
00105           clock_t out = clocks[m_cont.size() - s_max_cache];
00106           
00107           boost::upgrade_to_unique_lock<boost::shared_mutex> uniq_lock(lock);
00108           for(ClockedMap::iterator it = m_cont.begin(); it != m_cont.end(); it++)
00109             if(it->second.second < out)
00110               m_cont.erase(it);
00111         }
00112       }
00113     }
00114   
00115   private:
00116     ClockedMap m_cont;
00117     boost::shared_mutex m_mutex;
00118     static size_t s_max_cache;
00119 };
00120 
00121 size_t Cache::s_max_cache = 0;
00122 
// Everything needed to look up phrases in one corpus file: main() opens
// V from "<name>.tdx", T from "<name>.mct" and I from "<name>.sfa" (on top
// of T). Each corpus has its own phrase cache.
struct SA {
  tind_t V;                       // token index, used to map a phrase to token ids
  boost::shared_ptr<ttrack_t> T;  // token track; its size() is the corpus line count
  tsa_t I;                        // suffix array index searched by lookup_phrase()
  Cache cache;                    // phrase -> sentence-id-set cache for this corpus
};

// One SA per file given with --english / --french, in command line order.
// compute_cooc_stats_and_filter() intersects f_sas[j] results with e_sas[j].
std::vector<boost::shared_ptr<SA> > e_sas;
std::vector<boost::shared_ptr<SA> > f_sas;
00132 
00133 #undef min
00134 
// Writes the short program description to std::cerr. Does not terminate the
// program; callers decide whether to exit.
void usage()
{
  static const char* const description =
    "\nFilter phrase table using significance testing as described\n"
    "in H. Johnson, et al. (2007) Improving Translation Quality\n"
    "by Discarding Most of the Phrasetable. EMNLP 2007.\n";
  std::cerr << description;
}
00141 
00142 struct PTEntry {
00143   PTEntry(const std::string& str, int index);
00144   std::string f_phrase;
00145   std::string e_phrase;
00146   std::string extra;
00147   std::string scores;
00148   float pfe;
00149   int cf;
00150   int ce;
00151   int cfe;
00152   float nlog_pte;
00153   void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
00154     cfe = _cef;
00155     cf = _cf;
00156     ce = _ce;
00157     nlog_pte = nlp;
00158   }
00159 
00160 };
00161 
00162 PTEntry::PTEntry(const std::string& str, int index) :
00163   cf(0), ce(0), cfe(0), nlog_pte(0.0)
00164 {
00165   size_t pos = 0;
00166   std::string::size_type nextPos = str.find(SEPARATOR, pos);
00167   this->f_phrase = str.substr(pos,nextPos);
00168 
00169   pos = nextPos + SEPARATOR.size();
00170   nextPos = str.find(SEPARATOR, pos);
00171   this->e_phrase = str.substr(pos,nextPos-pos);
00172 
00173   pos = nextPos + SEPARATOR.size();
00174   nextPos = str.find(SEPARATOR, pos);
00175   if (nextPos < str.size()) {
00176     this->scores = str.substr(pos,nextPos-pos);
00177 
00178     pos = nextPos + SEPARATOR.size();
00179     this->extra = str.substr(pos);
00180   }
00181   else {
00182     this->scores = str.substr(pos,str.size()-pos);
00183   }
00184 
00185   int c = 0;
00186   std::string::iterator i=scores.begin();
00187   if (index > 0) {
00188     for (; i != scores.end(); ++i) {
00189       if ((*i) == ' ') {
00190         c++;
00191         if (c == index) break;
00192       }
00193     }
00194   }
00195   if (i != scores.end()) {
00196     ++i;
00197   }
00198   char f[24];
00199   char *fp=f;
00200   while (i != scores.end() && *i != ' ') {
00201     *fp++=*i++;
00202   }
00203   *fp++=0;
00204 
00205   this->pfe = atof(f);
00206 }
00207 
00208 struct PfeComparer {
00209   bool operator()(const PTEntry* a, const PTEntry* b) const {
00210     return a->pfe > b->pfe;
00211   }
00212 };
00213 
00214 struct NlogSigThresholder {
00215   NlogSigThresholder(float threshold) : t(threshold) {}
00216   float t;
00217   bool operator()(const PTEntry* a) const {
00218     if (a->nlog_pte < t) {
00219       delete a;
00220       return true;
00221     } else return false;
00222   }
00223 };
00224 
00225 std::ostream& operator << (std::ostream& os, const PTEntry& pp)
00226 {
00227   os << pp.f_phrase << " ||| " << pp.e_phrase;
00228   os << " ||| " << pp.scores;
00229   if (pp.extra.size()>0) os << " ||| " << pp.extra;
00230   if (print_cooc_counts) os << " ||| " << pp.cfe << " " << pp.cf << " " << pp.ce;
00231   if (print_neglog_significance) os << " ||| " << pp.nlog_pte;
00232   return os;
00233 }
00234 
// Debug helper: dumps a 2x2 table (a b / c d) to std::cerr together with the
// probability p and the ratio b*c / ((a+1)*(d+1)).
void print(int a, int b, int c, int d, float p)
{
  const double xf =
    (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);

  std::cerr << a << "\t" << b << "\t P=" << p << "\n";
  std::cerr << c << "\t" << d << "\t xf=" << xf << "\n\n";
}
00241 
00242 // 2x2 (one-sided) Fisher's exact test
00243 // see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
00244 double fisher_exact(int cfe, int ce, int cf)
00245 {
00246   assert(cfe <= ce);
00247   assert(cfe <= cf);
00248 
00249   int a = cfe;
00250   int b = (cf - cfe);
00251   int c = (ce - cfe);
00252   int d = (num_lines - ce - cf + cfe);
00253   int n = a + b + c + d;
00254 
00255   double cp = exp(lgamma(1+a+c) + lgamma(1+b+d) + lgamma(1+a+b) + lgamma(1+c+d)
00256                   - lgamma(1+n) - lgamma(1+a) - lgamma(1+b) - lgamma(1+c)
00257                   - lgamma(1+d));
00258   double total_p = 0.0;
00259   int tc = std::min(b,c);
00260   for (int i=0; i<=tc; i++) {
00261     total_p += cp;
00262     double coef = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
00263     cp *= coef;
00264     ++a;
00265     --c;
00266     ++d;
00267     --b;
00268   }
00269   return total_p;
00270 }
00271 
// Intersects two sorted sequences, given through pointer-like handles, and
// inserts the common elements (in order) at the front of *out.
template <class setType>
void ordered_set_intersect(setType& out, const setType set_1, const setType set_2)
{
    std::set_intersection(set_1->begin(), set_1->end(),
                          set_2->begin(), set_2->end(),
                          std::inserter(*out, out->begin()));
}
00278 
00279 
00280 void lookup_phrase(SentIdSet& ids, const std::string& phrase,
00281                    tsa_t &my_sa, tind_t &my_v, Cache& cache)
00282 {
00283     ids = cache.get(phrase);
00284     if(ids->empty()) {
00285       
00286       std::vector<sapt::id_type> snt;
00287       my_v.fillIdSeq(phrase, snt);
00288 
00289       tsa_t::tree_iterator m(&my_sa);
00290       size_t k = 0;
00291       while (k < snt.size() && m.extend(snt[k])) ++k;
00292       if(k == snt.size()) {
00293         ids->reserve(m.approxOccurrenceCount()+10);
00294         sapt::tsa::ArrayEntry I(m.lower_bound(-1));
00295         char const* stop = m.upper_bound(-1);
00296         do {
00297           m.root->readEntry(I.next,I);
00298           ids->push_back(I.sid);
00299         } while (I.next != stop);
00300         
00301         std::sort(ids->begin(), ids->end());
00302         SentIdSet::element_type::iterator it =
00303           std::unique(ids->begin(), ids->end());
00304         ids->resize(it - ids->begin());
00305         
00306         if(ids->size() >= MINIMUM_SIZE_TO_KEEP)
00307           cache.put(phrase, ids);
00308       }
00309     }
00310 }
00311 
00312 void lookup_multiple_phrases(SentIdSet& ids, std::vector<std::string> & phrases,
00313                              tsa_t & my_sa, tind_t &my_v,
00314                              const std::string & rule, Cache& cache) 
00315 { 
00316 
00317     if (phrases.size() == 1) {
00318         lookup_phrase(ids, phrases.front(), my_sa, my_v, cache);
00319     }
00320     else {
00321         SentIdSet main_set( new SentIdSet::element_type() );
00322         bool first = true;
00323         SentIdSet first_set( new SentIdSet::element_type() );
00324         lookup_phrase(first_set, phrases.front(), my_sa, my_v, cache);
00325         for (std::vector<std::string>::iterator phrase=phrases.begin()+1;
00326              phrase != phrases.end(); ++phrase) {
00327             SentIdSet temp_set( new SentIdSet::element_type() );
00328             lookup_phrase(temp_set, *phrase, my_sa, my_v, cache);
00329             if (first) {
00330                 ordered_set_intersect(main_set, first_set, temp_set);
00331                 first = false;
00332             }
00333             else {
00334                 SentIdSet new_set( new SentIdSet::element_type() );
00335                 ordered_set_intersect(new_set, main_set, temp_set);
00336                 main_set->swap(*new_set);
00337             }
00338         }
00339         ids->swap(*main_set);
00340     }
00341 }
00342 
00343 
00344 void find_occurrences(SentIdSet& ids, const std::string& rule,
00345                       tsa_t& my_sa, tind_t &my_v, Cache& cache)
00346 {
00347     // we search for hierarchical rules by stripping away NT and looking for terminals sequences
00348     // if a rule contains multiple sequences of terminals, we intersect their occurrences.
00349     if (hierarchical) {
00350         //   std::cerr << "splitting up phrase: " << phrase << "\n";
00351         int pos = 0;
00352         int NTStartPos, NTEndPos;
00353         std::vector<std::string> phrases;
00354         while (rule.find("] ", pos) < rule.size()) {
00355             NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT
00356             NTEndPos = rule.find("] ",pos);
00357             if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
00358                 pos = NTEndPos + 2;
00359                 continue;
00360             }
00361             phrases.push_back(rule.substr(pos,NTStartPos-pos));
00362             pos = NTEndPos + 2;
00363         }
00364 
00365         NTStartPos = rule.find("[",pos) - 1; // LHS of rule
00366         if (NTStartPos > pos) {
00367             phrases.push_back(rule.substr(pos,NTStartPos-pos));
00368         }
00369 
00370         lookup_multiple_phrases(ids, phrases, my_sa, my_v, rule, cache);
00371     }
00372     else {
00373         lookup_phrase(ids, rule, my_sa, my_v, cache);
00374     }
00375 }
00376 
00377 
00378 // input: unordered list of translation options for a single source phrase
00379 void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
00380 {
00381   if (pfe_filter_limit > 0 && options.size() > pfe_filter_limit) {
00382     nremoved_pfefilter += (options.size() - pfe_filter_limit);
00383     std::nth_element(options.begin(), options.begin() + pfe_filter_limit,
00384                      options.end(), PfeComparer());
00385     for (std::vector<PTEntry*>::iterator i = options.begin() + pfe_filter_limit;
00386          i != options.end(); ++i)
00387       delete *i;
00388     options.erase(options.begin() + pfe_filter_limit,options.end());
00389   }
00390   
00391   if (pef_filter_only)
00392     return;
00393   
00394   if (options.empty())
00395     return;
00396   
00397   size_t cf = 0;
00398   std::vector<SentIdSet> fsets;
00399   BOOST_FOREACH(boost::shared_ptr<SA>& f_sa, f_sas) {
00400     fsets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) );
00401     find_occurrences(fsets.back(), options.front()->f_phrase, f_sa->I, f_sa->V, f_sa->cache);
00402     cf += fsets.back()->size();
00403   }
00404   
00405   for (std::vector<PTEntry*>::iterator i = options.begin();
00406        i != options.end(); ++i) {
00407     const std::string& e_phrase = (*i)->e_phrase;
00408     
00409     size_t ce = 0;
00410     std::vector<SentIdSet> esets;
00411     BOOST_FOREACH(boost::shared_ptr<SA>& e_sa,  e_sas) {
00412       esets.push_back( boost::shared_ptr<SentIdSet::element_type>(new SentIdSet::element_type()) );
00413       find_occurrences(esets.back(), e_phrase, e_sa->I, e_sa->V, e_sa->cache);
00414       ce += esets.back()->size();
00415     }
00416       
00417     size_t cef = 0;
00418     for(size_t j = 0; j < fsets.size(); ++j) {
00419       SentIdSet efset( new SentIdSet::element_type() );
00420       ordered_set_intersect(efset, fsets[j], esets[j]);
00421       cef += efset->size();
00422     }
00423     
00424     double nlp = -log(fisher_exact(cef, cf, ce));
00425     (*i)->set_cooc_stats(cef, cf, ce, nlp);
00426   }
00427   
00428   std::vector<PTEntry*>::iterator new_end =
00429     std::remove_if(options.begin(), options.end(),
00430                    NlogSigThresholder(sig_filter_limit));
00431   nremoved_sigfilter += (options.end() - new_end);
00432   options.erase(new_end,options.end());
00433 }
00434 
00435 void filter_thread(std::istream* in, std::ostream* out, int pfe_index) {
00436       
00437   std::vector<std::string> lines;
00438   std::string prev = "";
00439   std::vector<PTEntry*> options;
00440   while(true) {
00441     {
00442       boost::mutex::scoped_lock lock(in_mutex);
00443       if(in->eof())
00444         break;
00445       
00446       lines.clear();
00447       std::string line;
00448       while(getline(*in, line) && lines.size() < 500000)
00449         lines.push_back(line);
00450     }
00451     
00452     std::stringstream out_temp;
00453     for(std::vector<std::string>::iterator it = lines.begin(); it != lines.end(); it++) {
00454       size_t tmp_lines = ++pt_lines;
00455       if(tmp_lines % 10000 == 0) {
00456         boost::mutex::scoped_lock lock(err_mutex);
00457         std::cerr << ".";
00458       
00459         if(tmp_lines % 500000 == 0)
00460           std::cerr << "[n:" << tmp_lines << "]\n";
00461   
00462         if(tmp_lines % 10000000 == 0) {
00463           float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
00464           float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
00465           std::cerr << "------------------------------------------------------\n"
00466                     << "  unfiltered phrases pairs: " << pt_lines << "\n"
00467                     << "\n"
00468                     << "     P(f|e) filter [first]: " << nremoved_pfefilter << "   (" << pfefper << "%)\n"
00469                     << "       significance filter: " << nremoved_sigfilter << "   (" << sigfper << "%)\n"
00470                     << "            TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << "   (" << (sigfper + pfefper) << "%)\n"
00471                     << "\n"
00472                     << "     FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << "   (" << (100.0-sigfper - pfefper) << "%)\n"
00473                     << "------------------------------------------------------\n";
00474         }
00475       }
00476       
00477       if(pt_lines % 10000 == 0) {
00478         BOOST_FOREACH(boost::shared_ptr<SA> f_sa, f_sas)
00479           f_sa->cache.prune();
00480         BOOST_FOREACH(boost::shared_ptr<SA> e_sa, e_sas)
00481           e_sa->cache.prune();
00482       }
00483       
00484       if(it->length() > 0) {
00485         PTEntry* pp = new PTEntry(it->c_str(), pfe_index);
00486         if (prev != pp->f_phrase) {
00487           prev = pp->f_phrase;
00488   
00489           if (!options.empty()) {  // always true after first line
00490             compute_cooc_stats_and_filter(options);
00491           }
00492           
00493           for (std::vector<PTEntry*>::iterator i = options.begin();
00494                i != options.end(); ++i) {
00495             out_temp << **i << '\n';
00496             delete *i;
00497           }
00498         
00499           options.clear();
00500           options.push_back(pp);
00501   
00502         } else {
00503           options.push_back(pp);
00504         }
00505       }
00506     }
00507     boost::mutex::scoped_lock lock(out_mutex);
00508     *out << out_temp.str() << std::flush;
00509   }
00510   compute_cooc_stats_and_filter(options);
00511   
00512   boost::mutex::scoped_lock lock(out_mutex);
00513   for (std::vector<PTEntry*>::iterator i = options.begin();
00514        i != options.end(); ++i) {
00515     *out << **i << '\n';
00516     delete *i;
00517   }
00518   *out << std::flush;
00519 }
00520 
00521 namespace po = boost::program_options;
00522 
00523 int main(int argc, char * argv[])
00524 {
00525   bool help;
00526   std::vector<std::string> efiles;
00527   std::vector<std::string> ffiles;
00528   int pfe_index = 2;
00529   int threads = 1;
00530   size_t max_cache = 0;
00531   std::string str_sig_filter_limit;
00532    
00533   po::options_description general("General options");
00534   general.add_options()
00535     ("english,e", po::value<std::vector<std::string> >(&efiles)->multitoken(),
00536      "english.suf-arr")
00537     ("french,f", po::value<std::vector<std::string> >(&ffiles)->multitoken(),
00538      "french.suf-arr")
00539     ("pfe-index,i", po::value(&pfe_index)->default_value(2),
00540      "Index of P(f|e) in phrase table")
00541     ("pfe-filter-limit,n", po::value(&pfe_filter_limit)->default_value(0),
00542      "0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements")
00543     ("threads,t", po::value(&threads)->default_value(1),
00544      "number of threads to use")
00545     ("max-cache,m", po::value(&max_cache)->default_value(0),
00546      "limit cache to  arg  most recent phrases")
00547     ("print-cooc,c", po::value(&print_cooc_counts)->zero_tokens()->default_value(false),
00548      "add the coocurrence counts to the phrase table")
00549     ("print-significance,p", po::value(&print_neglog_significance)->zero_tokens()->default_value(false),
00550      "add -log(significance) to the phrase table")
00551     ("hierarchical,x", po::value(&hierarchical)->zero_tokens()->default_value(false),
00552      "filter hierarchical rule table")
00553     ("sig-filter-limit,l", po::value(&str_sig_filter_limit),
00554      ">0.0, a+e, or a-e: keep values that have a -log significance > this")
00555     ("help,h", po::value(&help)->zero_tokens()->default_value(false),
00556      "display this message")
00557   ;
00558 
00559   po::options_description cmdline_options("Allowed options");
00560   cmdline_options.add(general);
00561   po::variables_map vm;
00562   
00563   try { 
00564     po::store(po::command_line_parser(argc,argv).
00565               options(cmdline_options).run(), vm);
00566     po::notify(vm);
00567   }
00568   catch (std::exception& e) {
00569     std::cout << "Error: " << e.what() << std::endl << std::endl;
00570     
00571     usage();
00572     std::cout << cmdline_options << std::endl;
00573     exit(0);
00574   }
00575   
00576   if(vm["help"].as<bool>()) {
00577     usage();
00578     std::cout << cmdline_options << std::endl;
00579     exit(0);
00580   }
00581    
00582   if(vm.count("pfe-filter-limit"))
00583     std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
00584   if(vm.count("threads"))
00585     std::cerr << "Using threads: " << threads << std::endl;  
00586   if(vm.count("max-cache"))
00587     std::cerr << "Using max phrases in caches: " << max_cache << std::endl;
00588     
00589   if (strcmp(str_sig_filter_limit.c_str(),"a+e") == 0) {
00590     sig_filter_limit = ALPHA_PLUS_EPS;
00591   } else if (strcmp(str_sig_filter_limit.c_str(),"a-e") == 0) {
00592     sig_filter_limit = ALPHA_MINUS_EPS;
00593   } else {
00594     char *x;
00595     sig_filter_limit = strtod(str_sig_filter_limit.c_str(), &x);
00596     if (sig_filter_limit < 0.0) {
00597       std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
00598       usage();
00599     }
00600   }
00601     
00602   if (sig_filter_limit == 0.0) pef_filter_only = true;
00603   //-----------------------------------------------------------------------------
00604   if (optind != argc || ((efiles.empty() || ffiles.empty()) && !pef_filter_only)) {
00605     usage();
00606   }
00607   
00608   if (!pef_filter_only) {
00609     size_t elines = 0;
00610     BOOST_FOREACH(std::string& efile, efiles) {
00611       e_sas.push_back(boost::shared_ptr<SA>(new SA()));
00612       e_sas.back()->V.open(efile + ".tdx");
00613       e_sas.back()->T.reset(new ttrack_t());  
00614       e_sas.back()->T->open(efile + ".mct");
00615       e_sas.back()->I.open(efile + ".sfa", e_sas.back()->T);
00616       elines += e_sas.back()->T->size(); 
00617     }
00618     
00619     size_t flines = 0;
00620     BOOST_FOREACH(std::string& ffile, ffiles) {
00621       f_sas.push_back(boost::shared_ptr<SA>(new SA()));
00622       f_sas.back()->V.open(ffile + ".tdx");
00623       f_sas.back()->T.reset(new ttrack_t());  
00624       f_sas.back()->T->open(ffile + ".mct");
00625       f_sas.back()->I.open(ffile + ".sfa", f_sas.back()->T);
00626       flines += f_sas.back()->T->size(); 
00627     }
00628     
00629     if (elines != flines) {
00630       std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
00631       usage();
00632       exit(1);
00633     } else {
00634       std::cerr << "Training corpus: " << elines << " lines\n";
00635       num_lines = elines;
00636     }
00637     p_111 = -log(fisher_exact(1,1,1));
00638     std::cerr << "\\alpha = " << p_111 << "\n";
00639     if (sig_filter_limit == ALPHA_MINUS_EPS) {
00640       sig_filter_limit = p_111 - 0.001;
00641     } else if (sig_filter_limit == ALPHA_PLUS_EPS) {
00642       sig_filter_limit = p_111 + 0.001;
00643     }
00644     std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
00645   } else {
00646     std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
00647   }
00648 
00649   Cache::set_max_cache(max_cache);
00650   std::ios_base::sync_with_stdio(false);
00651   
00652   boost::thread_group threadGroup;
00653   for(int i = 0; i < threads; i++) 
00654     threadGroup.add_thread(new boost::thread(filter_thread, &std::cin, &std::cout, pfe_index));
00655   threadGroup.join_all();
00656 
00657   float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
00658   float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
00659   
00660   std::cerr << "\n\n------------------------------------------------------\n"
00661             << "  unfiltered phrases pairs: " << pt_lines << "\n"
00662             << "\n"
00663             << "     P(f|e) filter [first]: " << nremoved_pfefilter << "   (" << pfefper << "%)\n"
00664             << "       significance filter: " << nremoved_sigfilter << "   (" << sigfper << "%)\n"
00665             << "            TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << "   (" << (sigfper + pfefper) << "%)\n"
00666             << "\n"
00667             << "     FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << "   (" << (100.0-sigfper - pfefper) << "%)\n"
00668             << "------------------------------------------------------\n";  
00669 }