00001
00002
00003
00004 template<typename Token>
00005 void
00006 Bitext<Token>::agenda
00007 ::worker
00008 ::operator()()
00009 {
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 uint64_t sid=0, offset=0;
00022 size_t s1=0, s2=0, e1=0, e2=0;
00023 std::vector<unsigned char> aln;
00024 while(SPTR<job> j = ag.get_job())
00025 {
00026 j->stats->register_worker();
00027 bitvector full_alignment(100*100);
00028 while (j->nextSample(sid,offset))
00029 {
00030 aln.clear();
00031 int po_fwd = LRModel::NONE;
00032 int po_bwd = LRModel::NONE;
00033 int docid = j->m_bias ? j->m_bias->GetClass(sid) : -1;
00034 bitvector* full_aln = j->fwd ? &full_alignment : NULL;
00035
00036
00037 bool good = (ag.bt.find_trg_phr_bounds
00038 (sid, offset, offset + j->len,
00039 s1, s2, e1, e2, po_fwd, po_bwd,
00040 &aln, full_aln, !j->fwd));
00041
00042 if (!good)
00043 {
00044 j->stats->count_sample(docid, 0, po_fwd, po_bwd);
00045 continue;
00046 }
00047
00048
00049 size_t num_pairs = (s2-s1+1) * (e2-e1+1);
00050 j->stats->count_sample(docid, num_pairs, po_fwd, po_bwd);
00051
00052 #if 0
00053 Token const* t = ag.bt.T2->sntStart(sid);
00054 Token const* eos = ag.bt.T2->sntEnd(sid);
00055 cerr << "[" << j->stats->good + 1 << "] ";
00056 while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " ";
00057 cerr << "[" << docid << "]" << std::endl;
00058 #endif
00059
00060 float sample_weight = 1./num_pairs;
00061 Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
00062
00063
00064 for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1;
00065
00066 std::vector<uint64_t> seen; seen.reserve(10);
00067
00068
00069
00070
00071
00072
00073
00074 for (size_t s = s1; s <= s2; ++s)
00075 {
00076 TSA<Token> const& I = j->fwd ? *ag.bt.I2 : *ag.bt.I1;
00077 SPTR<iter> b = I.find(o + s, e1 - s);
00078 UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found");
00079
00080 for (size_t i = e1; i <= e2; ++i)
00081 {
00082 uint64_t tpid = b->getPid();
00083
00084
00085 size_t s = 0;
00086 while (s < seen.size() && seen[s] != tpid) ++s;
00087 if (s < seen.size()) continue;
00088 seen.push_back(tpid);
00089
00090 size_t raw2 = b->approxOccurrenceCount();
00091 float bwgt = j->m_bias ? (*j->m_bias)[sid] : 1;
00092 j->stats->add(tpid, sample_weight, bwgt, aln, raw2,
00093 po_fwd, po_bwd, docid, sid);
00094 bool ok = (i == e2) || b->extend(o[i].id());
00095 UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
00096 }
00097 if (s < s2)
00098 for (size_t k = 1; k < aln.size(); k += 2)
00099 --aln[k];
00100 }
00101 }
00102 j->stats->release();
00103 }
00104 }