#include <cstring>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <fstream>
#include <sstream>
#include <vector>
#include <iostream>
#include <set>
#include <boost/thread/tss.hpp>
#include <boost/thread.hpp>
#include <boost/unordered_map.hpp>
#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <unistd.h>
#include "mm/ug_bitext.h"
Go to the source code of this file.
Classes | |
| class | Cache |
| struct | SA |
| struct | PTEntry |
| struct | PfeComparer |
| struct | NlogSigThresholder |
Typedefs | |
| typedef sapt::L2R_Token < sapt::SimpleWordId > | Token |
| typedef sapt::mmTtrack< Token > | ttrack_t |
| typedef sapt::mmTSA< Token > | tsa_t |
| typedef sapt::TokenIndex | tind_t |
| typedef size_t | TextLenType |
| typedef boost::shared_ptr < std::vector< TextLenType > > | SentIdSet |
Functions | |
| void | usage () |
| std::ostream & | operator<< (std::ostream &os, const PTEntry &pp) |
| void | print (int a, int b, int c, int d, float p) |
| double | fisher_exact (int cfe, int ce, int cf) |
| template<class setType > | |
| void | ordered_set_intersect (setType &out, const setType set_1, const setType set_2) |
| void | lookup_phrase (SentIdSet &ids, const std::string &phrase, tsa_t &my_sa, tind_t &my_v, Cache &cache) |
| void | lookup_multiple_phrases (SentIdSet &ids, std::vector< std::string > &phrases, tsa_t &my_sa, tind_t &my_v, const std::string &rule, Cache &cache) |
| void | find_occurrences (SentIdSet &ids, const std::string &rule, tsa_t &my_sa, tind_t &my_v, Cache &cache) |
| void | compute_cooc_stats_and_filter (std::vector< PTEntry * > &options) |
| void | filter_thread (std::istream *in, std::ostream *out, int pfe_index) |
| int | main (int argc, char *argv[]) |
Variables | |
| const size_t | MINIMUM_SIZE_TO_KEEP = 10000 |
| const std::string | SEPARATOR = " ||| " |
| const double | ALPHA_PLUS_EPS = -1000.0 |
| const double | ALPHA_MINUS_EPS = -2000.0 |
| int | pfe_filter_limit = 0 |
| bool | print_cooc_counts = false |
| bool | print_neglog_significance = false |
| double | sig_filter_limit = 0 |
| bool | pef_filter_only = false |
| bool | hierarchical = false |
| double | p_111 = 0.0 |
| size_t | pt_lines = 0 |
| size_t | nremoved_sigfilter = 0 |
| size_t | nremoved_pfefilter = 0 |
| int | num_lines |
| boost::mutex | in_mutex |
| boost::mutex | out_mutex |
| boost::mutex | err_mutex |
| std::vector< boost::shared_ptr < SA > > | e_sas |
| std::vector< boost::shared_ptr < SA > > | f_sas |
| typedef boost::shared_ptr<std::vector<TextLenType> > SentIdSet |
Definition at line 69 of file filter-pt.cc.
| typedef size_t TextLenType |
Definition at line 67 of file filter-pt.cc.
| typedef sapt::TokenIndex tind_t |
Definition at line 59 of file filter-pt.cc.
| typedef sapt::L2R_Token<sapt::SimpleWordId> Token |
Definition at line 56 of file filter-pt.cc.
| typedef sapt::mmTSA<Token> tsa_t |
Definition at line 58 of file filter-pt.cc.
| typedef sapt::mmTtrack<Token> ttrack_t |
Definition at line 57 of file filter-pt.cc.
| void compute_cooc_stats_and_filter | ( | std::vector< PTEntry * > & | options | ) |
Definition at line 379 of file filter-pt.cc.
References e_sas, f_sas, find_occurrences(), fisher_exact(), nremoved_pfefilter, nremoved_sigfilter, ordered_set_intersect(), pef_filter_only, pfe_filter_limit, and sig_filter_limit.
Referenced by filter_thread().


| void filter_thread | ( | std::istream * | in, | |
| std::ostream * | out, | |||
| int | pfe_index | |||
| ) |
Definition at line 435 of file filter-pt.cc.
References compute_cooc_stats_and_filter(), e_sas, err_mutex, PTEntry::f_phrase, f_sas, in_mutex, nremoved_pfefilter, nremoved_sigfilter, out_mutex, and pt_lines.
Referenced by main().


| void find_occurrences | ( | SentIdSet & | ids, | |
| const std::string & | rule, | |||
| tsa_t & | my_sa, | |||
| tind_t & | my_v, | |||
| Cache & | cache | |||
| ) |
Definition at line 344 of file filter-pt.cc.
References hierarchical, lookup_multiple_phrases(), and lookup_phrase().
Referenced by compute_cooc_stats_and_filter().


| double fisher_exact | ( | int | cfe, | |
| int | ce, | |||
| int | cf | |||
| ) |
Definition at line 244 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().

| void lookup_multiple_phrases | ( | SentIdSet & | ids, | |
| std::vector< std::string > & | phrases, | |||
| tsa_t & | my_sa, | |||
| tind_t & | my_v, | |||
| const std::string & | rule, | |||
| Cache & | cache | |||
| ) |
Definition at line 312 of file filter-pt.cc.
References lookup_phrase(), and ordered_set_intersect().
Referenced by find_occurrences().


| void lookup_phrase | ( | SentIdSet & | ids, | |
| const std::string & | phrase, | |||
| tsa_t & | my_sa, | |||
| tind_t & | my_v, | |||
| Cache & | cache | |||
| ) |
Definition at line 280 of file filter-pt.cc.
References sapt::TokenIndex::fillIdSeq(), Cache::get(), I, k, m, MINIMUM_SIZE_TO_KEEP, Cache::put(), sort(), stop, and unique().
Referenced by find_occurrences(), and lookup_multiple_phrases().


| int main | ( | int | argc, | |
| char * | argv[] | |||
| ) |
Definition at line 523 of file filter-pt.cc.
References ALPHA_MINUS_EPS, ALPHA_PLUS_EPS, e_sas, f_sas, filter_thread(), fisher_exact(), hierarchical, nremoved_pfefilter, nremoved_sigfilter, num_lines, optind, p_111, pef_filter_only, pfe_filter_limit, print_cooc_counts, print_neglog_significance, pt_lines, Cache::set_max_cache(), sig_filter_limit, store(), and usage().

| std::ostream& operator<< | ( | std::ostream & | os, | |
| const PTEntry & | pp | |||
| ) |
Definition at line 225 of file filter-pt.cc.
References PTEntry::ce, PTEntry::cf, PTEntry::cfe, PTEntry::e_phrase, PTEntry::extra, PTEntry::f_phrase, PTEntry::nlog_pte, print_cooc_counts, print_neglog_significance, and PTEntry::scores.
| template<class setType > |
| void ordered_set_intersect | ( | setType & | out, | |
| const setType | set_1, | |||
| const setType | set_2 | |||
| ) | [inline] |
Definition at line 273 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and lookup_multiple_phrases().

| void print | ( | int | a, | |
| int | b, | |||
| int | c, | |||
| int | d, | |||
| float | p | |||
| ) |
Definition at line 235 of file filter-pt.cc.
| void usage | ( | ) |
Definition at line 135 of file filter-pt.cc.
Referenced by main().

| const double ALPHA_MINUS_EPS = -2000.0 |
| const double ALPHA_PLUS_EPS = -1000.0 |
Definition at line 130 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), filter_thread(), and main().
| boost::mutex err_mutex |
Definition at line 131 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), filter_thread(), and main().
| bool hierarchical = false |
| boost::mutex in_mutex |
| const size_t MINIMUM_SIZE_TO_KEEP = 10000 |
| size_t nremoved_pfefilter = 0 |
Definition at line 54 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), filter_thread(), and main().
| size_t nremoved_sigfilter = 0 |
Definition at line 53 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), filter_thread(), and main().
| int num_lines |
| boost::mutex out_mutex |
| double p_111 = 0.0 |
| bool pef_filter_only = false |
Definition at line 48 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().
| int pfe_filter_limit = 0 |
Definition at line 43 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().
| bool print_cooc_counts = false |
| bool print_neglog_significance = false |
| size_t pt_lines = 0 |
| const std::string SEPARATOR = " ||| " |
| double sig_filter_limit = 0 |
Definition at line 46 of file filter-pt.cc.
Referenced by compute_cooc_stats_and_filter(), and main().
1.5.9