00001 #include "HypPackEnumerator.h"
00002
00003 #include <cassert>
00004 #include <algorithm>
00005 #include <boost/unordered_set.hpp>
00006
00007 using namespace std;
00008
00009 namespace MosesTuning
00010 {
00011
00012
00013 StreamingHypPackEnumerator::StreamingHypPackEnumerator
00014 (
00015 vector<std::string> const& featureFiles,
00016 vector<std::string> const& scoreFiles
00017 )
00018 : m_featureFiles(featureFiles),
00019 m_scoreFiles(scoreFiles)
00020 {
00021 if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
00022 cerr << "No data to process" << endl;
00023 exit(0);
00024 }
00025
00026 if (featureFiles.size() != scoreFiles.size()) {
00027 cerr << "Error: Number of feature files (" << featureFiles.size() <<
00028 ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
00029 exit(1);
00030 }
00031
00032 m_num_lists = scoreFiles.size();
00033 m_primed = false;
00034 m_iNumDense = -1;
00035 }
00036
00037 size_t StreamingHypPackEnumerator::num_dense() const
00038 {
00039 if(m_iNumDense<0) {
00040 cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
00041 exit(1);
00042 }
00043 return (size_t) m_iNumDense;
00044 }
00045
00046 void StreamingHypPackEnumerator::prime()
00047 {
00048 m_current_indexes.clear();
00049 m_current_featureVectors.clear();
00050 boost::unordered_set<FeatureDataItem> seen;
00051 m_primed = true;
00052
00053 for (size_t i = 0; i < m_num_lists; ++i) {
00054 if (m_featureDataIters[i] == FeatureDataIterator::end()) {
00055 cerr << "Error: Feature file " << i << " ended prematurely" << endl;
00056 exit(1);
00057 }
00058 if (m_scoreDataIters[i] == ScoreDataIterator::end()) {
00059 cerr << "Error: Score file " << i << " ended prematurely" << endl;
00060 exit(1);
00061 }
00062 if (m_featureDataIters[i]->size() != m_scoreDataIters[i]->size()) {
00063 cerr << "Error: For sentence " << m_sentenceId << " features and scores have different size" << endl;
00064 exit(1);
00065 }
00066 for (size_t j = 0; j < m_featureDataIters[i]->size(); ++j) {
00067 const FeatureDataItem& item = m_featureDataIters[i]->operator[](j);
00068
00069 if(seen.find(item)==seen.end()) {
00070 seen.insert(item);
00071
00072 int iDense = item.dense.size();
00073 if(m_iNumDense != iDense) {
00074 if(m_iNumDense==-1) m_iNumDense = iDense;
00075 else {
00076 cerr << "Error: expecting constant number of dense features: "
00077 << m_iNumDense << " != " << iDense << endl;
00078 exit(1);
00079 }
00080 }
00081
00082 m_current_indexes.push_back(pair<size_t,size_t>(i,j));
00083 m_current_featureVectors.push_back(MiraFeatureVector(item));
00084 }
00085 }
00086 }
00087 }
00088
00089 void StreamingHypPackEnumerator::reset()
00090 {
00091 m_featureDataIters.clear();
00092 m_scoreDataIters.clear();
00093 for (size_t i = 0; i < m_num_lists; ++i) {
00094 m_featureDataIters.push_back(FeatureDataIterator(m_featureFiles[i]));
00095 m_scoreDataIters.push_back(ScoreDataIterator(m_scoreFiles[i]));
00096 }
00097 m_sentenceId=0;
00098 prime();
00099 }
00100
00101 bool StreamingHypPackEnumerator::finished()
00102 {
00103 return m_featureDataIters[0]==FeatureDataIterator::end();
00104 }
00105
00106 void StreamingHypPackEnumerator::next()
00107 {
00108 if(!m_primed) {
00109 cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
00110 exit(1);
00111 }
00112 for (size_t i = 0; i < m_num_lists; ++i) {
00113 ++m_featureDataIters[i];
00114 ++m_scoreDataIters[i];
00115 }
00116 m_sentenceId++;
00117 if(m_sentenceId % 100 == 0) cerr << ".";
00118 if(!finished()) prime();
00119 }
00120
00121 size_t StreamingHypPackEnumerator::cur_size()
00122 {
00123 if(!m_primed) {
00124 cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
00125 exit(1);
00126 }
00127 return m_current_indexes.size();
00128 }
00129
00130 const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
00131 {
00132 if(!m_primed) {
00133 cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
00134 exit(1);
00135 }
00136 return m_current_featureVectors[index];
00137 }
00138
00139 const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
00140 {
00141 if(!m_primed) {
00142 cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
00143 exit(1);
00144 }
00145 const pair<size_t,size_t>& pij = m_current_indexes[index];
00146 return m_scoreDataIters[pij.first]->operator[](pij.second);
00147 }
00148
00149 size_t StreamingHypPackEnumerator::cur_id()
00150 {
00151 return m_sentenceId;
00152 }
00153
00154
00155
00156 RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
00157 vector<string> const& scoreFiles,
00158 bool no_shuffle)
00159 {
00160 StreamingHypPackEnumerator train(featureFiles,scoreFiles);
00161 size_t index=0;
00162 for(train.reset(); !train.finished(); train.next()) {
00163 m_features.push_back(vector<MiraFeatureVector>());
00164 m_scores.push_back(vector<ScoreDataItem>());
00165 for(size_t j=0; j<train.cur_size(); j++) {
00166 m_features.back().push_back(train.featuresAt(j));
00167 m_scores.back().push_back(train.scoresAt(j));
00168 }
00169 m_indexes.push_back(index++);
00170 }
00171
00172 m_cur_index = 0;
00173 m_no_shuffle = no_shuffle;
00174 m_num_dense = train.num_dense();
00175 }
00176
00177 size_t RandomAccessHypPackEnumerator::num_dense() const
00178 {
00179 return m_num_dense;
00180 }
00181
00182 void RandomAccessHypPackEnumerator::reset()
00183 {
00184 m_cur_index = 0;
00185 if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
00186 }
00187 bool RandomAccessHypPackEnumerator::finished()
00188 {
00189 return m_cur_index >= m_indexes.size();
00190 }
00191 void RandomAccessHypPackEnumerator::next()
00192 {
00193 m_cur_index++;
00194 }
00195
00196 size_t RandomAccessHypPackEnumerator::cur_size()
00197 {
00198 assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
00199 return m_features[m_indexes[m_cur_index]].size();
00200 }
00201 const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
00202 {
00203 return m_features[m_indexes[m_cur_index]][i];
00204 }
00205 const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
00206 {
00207 return m_scores[m_indexes[m_cur_index]][i];
00208 }
00209
00210 size_t RandomAccessHypPackEnumerator::cur_id()
00211 {
00212 return m_indexes[m_cur_index];
00213 }
00214
00215
00216
00217
00218
00219
00220 }