#include "util/file_piece.hh"

#include "util/double-conversion/double-conversion.h"
#include "util/exception.hh"
#include "util/file.hh"
#include "util/mmap.hh"

#if defined(_WIN32) || defined(_WIN64)
#include <io.h>
#else
#include <unistd.h>
#endif

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <string>

#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
00025
00026 namespace util {
00027
00028 ParseNumberException::ParseNumberException(StringPiece value) throw() {
00029 *this << "Could not parse \"" << value << "\" into a ";
00030 }
00031
00032
// Lookup table indexed by unsigned byte value: true for the six bytes treated
// as whitespace here, i.e. 9 through 13 ('\t', '\n', '\v', '\f', '\r') and
// 32 (' ').  Every other entry is false.  One row per 16 byte values.
const bool kSpaces[256] = {
  /* 0x00 */ 0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20 */ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xC0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xD0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xE0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xF0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
00034
00035 FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
00036 file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
00037 progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
00038 Initialize(name, show_progress, min_buffer);
00039 }
00040
00041 namespace {
00042 std::string NamePossiblyFind(int fd, const char *name) {
00043 if (name) return name;
00044 return NameFromFD(fd);
00045 }
00046 }
00047
00048 FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
00049 file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
00050 progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) {
00051 Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer);
00052 }
00053
00054 FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) :
00055 total_size_(kBadSize), page_(SizePage()) {
00056 InitializeNoRead("istream", min_buffer);
00057
00058 fallback_to_read_ = true;
00059 HugeMalloc(default_map_size_, false, data_);
00060 position_ = data_.begin();
00061 position_end_ = position_;
00062
00063 fell_back_.Reset(stream);
00064 }
00065
00066 FilePiece::~FilePiece() {}
00067
00068 StringPiece FilePiece::ReadLine(char delim, bool strip_cr) {
00069 std::size_t skip = 0;
00070 while (true) {
00071 for (const char *i = position_ + skip; i < position_end_; ++i) {
00072 if (*i == delim) {
00073
00074
00075 const std::size_t subtract_cr = (
00076 (strip_cr && i > position_ && *(i - 1) == '\r') ?
00077 1 : 0);
00078 StringPiece ret(position_, i - position_ - subtract_cr);
00079 position_ = i + 1;
00080 return ret;
00081 }
00082 }
00083 if (at_end_) {
00084 if (position_ == position_end_) {
00085 Shift();
00086 }
00087 return Consume(position_end_);
00088 }
00089 skip = position_end_ - position_;
00090 Shift();
00091 }
00092 }
00093
00094 bool FilePiece::ReadLineOrEOF(StringPiece &to, char delim, bool strip_cr) {
00095 try {
00096 to = ReadLine(delim, strip_cr);
00097 } catch (const util::EndOfFileException &e) { return false; }
00098 return true;
00099 }
00100
00101 float FilePiece::ReadFloat() {
00102 return ReadNumber<float>();
00103 }
00104 double FilePiece::ReadDouble() {
00105 return ReadNumber<double>();
00106 }
00107 long int FilePiece::ReadLong() {
00108 return ReadNumber<long int>();
00109 }
00110 unsigned long int FilePiece::ReadULong() {
00111 return ReadNumber<unsigned long int>();
00112 }
00113
00114
00115 void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) {
00116 file_name_ = name;
00117
00118 default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
00119 position_ = NULL;
00120 position_end_ = NULL;
00121 mapped_offset_ = 0;
00122 at_end_ = false;
00123 }
00124
00125 void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
00126 InitializeNoRead(name, min_buffer);
00127
00128 if (total_size_ == kBadSize) {
00129
00130 fallback_to_read_ = false;
00131 if (show_progress)
00132 *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
00133 TransitionToRead();
00134 } else {
00135 fallback_to_read_ = false;
00136 }
00137 Shift();
00138
00139 if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) {
00140 if (!fallback_to_read_) {
00141 at_end_ = false;
00142 TransitionToRead();
00143 }
00144 }
00145 }
00146
00147 namespace {
00148
00149 static const double_conversion::StringToDoubleConverter kConverter(
00150 double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES,
00151 std::numeric_limits<double>::quiet_NaN(),
00152 std::numeric_limits<double>::quiet_NaN(),
00153 "inf",
00154 "NaN");
00155
00156 StringPiece FirstToken(StringPiece str) {
00157 const char *i;
00158 for (i = str.data(); i != str.data() + str.size(); ++i) {
00159 if (kSpaces[(unsigned char)*i]) break;
00160 }
00161 return StringPiece(str.data(), i - str.data());
00162 }
00163
00164 const char *ParseNumber(StringPiece str, float &out) {
00165 int count;
00166 out = kConverter.StringToFloat(str.data(), str.size(), &count);
00167 UTIL_THROW_IF_ARG(std::isnan(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "float");
00168 return str.data() + count;
00169 }
00170 const char *ParseNumber(StringPiece str, double &out) {
00171 int count;
00172 out = kConverter.StringToDouble(str.data(), str.size(), &count);
00173 UTIL_THROW_IF_ARG(std::isnan(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "double");
00174 return str.data() + count;
00175 }
00176 const char *ParseNumber(StringPiece str, long int &out) {
00177 char *end;
00178 errno = 0;
00179 out = strtol(str.data(), &end, 10);
00180 UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "long int");
00181 return end;
00182 }
00183 const char *ParseNumber(StringPiece str, unsigned long int &out) {
00184 char *end;
00185 errno = 0;
00186 out = strtoul(str.data(), &end, 10);
00187 UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "unsigned long int");
00188 return end;
00189 }
00190 }
00191
00192 template <class T> T FilePiece::ReadNumber() {
00193 SkipSpaces();
00194 while (last_space_ < position_) {
00195 if (UTIL_UNLIKELY(at_end_)) {
00196
00197 std::string buffer(position_, position_end_);
00198 T ret;
00199
00200 const char *begin = buffer.c_str();
00201 const char *end = ParseNumber(StringPiece(begin, buffer.size()), ret);
00202 position_ += end - begin;
00203 return ret;
00204 }
00205 Shift();
00206 }
00207 T ret;
00208 position_ = ParseNumber(StringPiece(position_, last_space_ - position_), ret);
00209 return ret;
00210 }
00211
00212 const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
00213 std::size_t skip = 0;
00214 while (true) {
00215 for (const char *i = position_ + skip; i < position_end_; ++i) {
00216 if (delim[static_cast<unsigned char>(*i)]) return i;
00217 }
00218 if (at_end_) {
00219 if (position_ == position_end_) Shift();
00220 return position_end_;
00221 }
00222 skip = position_end_ - position_;
00223 Shift();
00224 }
00225 }
00226
00227 void FilePiece::Shift() {
00228 if (at_end_) {
00229 progress_.Finished();
00230 throw EndOfFileException();
00231 }
00232 uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
00233
00234 if (!fallback_to_read_) MMapShift(desired_begin);
00235
00236 if (fallback_to_read_) ReadShift();
00237
00238 for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
00239 if (kSpaces[static_cast<unsigned char>(*last_space_)]) break;
00240 }
00241 }
00242
00243 void FilePiece::MMapShift(uint64_t desired_begin) {
00244
00245 uint64_t ignore = desired_begin % page_;
00246
00247 if (position_ == data_.begin() + ignore && position_) {
00248 default_map_size_ *= 2;
00249 }
00250
00251 uint64_t mapped_offset = desired_begin - ignore;
00252
00253 uint64_t mapped_size;
00254 if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) {
00255 at_end_ = true;
00256 mapped_size = total_size_ - mapped_offset;
00257 } else {
00258 mapped_size = default_map_size_;
00259 }
00260
00261
00262 data_.reset();
00263 try {
00264 MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
00265 } catch (const util::ErrnoException &e) {
00266 if (desired_begin) {
00267 SeekOrThrow(*file_, desired_begin);
00268 }
00269
00270 at_end_ = false;
00271 TransitionToRead();
00272 return;
00273 }
00274 mapped_offset_ = mapped_offset;
00275 position_ = data_.begin() + ignore;
00276 position_end_ = data_.begin() + mapped_size;
00277
00278 progress_.Set(desired_begin);
00279 }
00280
00281 void FilePiece::TransitionToRead() {
00282 assert(!fallback_to_read_);
00283 fallback_to_read_ = true;
00284 data_.reset();
00285 HugeMalloc(default_map_size_, false, data_);
00286 position_ = data_.begin();
00287 position_end_ = position_;
00288
00289 try {
00290 fell_back_.Reset(file_.release());
00291 } catch (util::Exception &e) {
00292 e << " in file " << file_name_;
00293 throw;
00294 }
00295 }
00296
00297 void FilePiece::ReadShift() {
00298 assert(fallback_to_read_);
00299
00300
00301
00302
00303 if (position_ == position_end_) {
00304 mapped_offset_ += (position_end_ - data_.begin());
00305 position_ = data_.begin();
00306 position_end_ = position_;
00307 }
00308
00309 std::size_t already_read = position_end_ - data_.begin();
00310
00311 if (already_read == default_map_size_) {
00312 if (position_ == data_.begin()) {
00313
00314 std::size_t valid_length = position_end_ - position_;
00315 default_map_size_ *= 2;
00316 HugeRealloc(default_map_size_, false, data_);
00317 position_ = data_.begin();
00318 position_end_ = position_ + valid_length;
00319 } else {
00320 std::size_t moving = position_end_ - position_;
00321 memmove(data_.get(), position_, moving);
00322 position_ = data_.begin();
00323 position_end_ = position_ + moving;
00324 already_read = moving;
00325 }
00326 }
00327
00328 std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read);
00329 progress_.Set(fell_back_.RawAmount());
00330
00331 if (read_return == 0) {
00332 at_end_ = true;
00333 }
00334 position_end_ += read_return;
00335 }
00336
00337 }