text_reader.h 11.1 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_UTILS_TEXT_READER_H_
#define LIGHTGBM_UTILS_TEXT_READER_H_

8
9
10
11
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/pipeline_reader.h>
#include <LightGBM/utils/random.h>

12
#include <string>
Guolin Ke's avatar
Guolin Ke committed
13
#include <cstdio>
14
#include <functional>
Guolin Ke's avatar
Guolin Ke committed
15
#include <sstream>
Guolin Ke's avatar
Guolin Ke committed
16
17
18
19
#include <vector>

namespace LightGBM {

20
21
/*! \brief Number of bytes in one GB (2^30); divisor for the read-progress log messages */
constexpr size_t kGbs = static_cast<size_t>(1) << 30;

Guolin Ke's avatar
Guolin Ke committed
22
23
24
25
26
/*!
* \brief Read text data from file
*/
template<typename INDEX_T>
class TextReader {
Nikita Titov's avatar
Nikita Titov committed
27
 public:
Guolin Ke's avatar
Guolin Ke committed
28
29
30
  /*!
  * \brief Constructor
  * \param filename Filename of data
Guolin Ke's avatar
Guolin Ke committed
31
  * \param is_skip_first_line True if need to skip header
Guolin Ke's avatar
Guolin Ke committed
32
  */
33
34
  TextReader(const char* filename, bool is_skip_first_line, size_t progress_interval_bytes = SIZE_MAX):
    filename_(filename), is_skip_first_line_(is_skip_first_line), read_progress_interval_bytes_(progress_interval_bytes) {
Guolin Ke's avatar
Guolin Ke committed
35
    if (is_skip_first_line_) {
36
37
      auto reader = VirtualFileReader::Make(filename);
      if (!reader->Init()) {
38
        Log::Fatal("Could not open %s", filename);
Guolin Ke's avatar
Guolin Ke committed
39
      }
Guolin Ke's avatar
Guolin Ke committed
40
      std::stringstream str_buf;
41
42
43
44
      char read_c;
      size_t nread = reader->Read(&read_c, 1);
      while (nread == 1) {
        if (read_c == '\n' || read_c == '\r') {
Guolin Ke's avatar
Guolin Ke committed
45
46
          break;
        }
47
        str_buf << read_c;
Guolin Ke's avatar
Guolin Ke committed
48
        ++skip_bytes_;
49
        nread = reader->Read(&read_c, 1);
Guolin Ke's avatar
Guolin Ke committed
50
      }
51
52
      if (read_c == '\r') {
        reader->Read(&read_c, 1);
Guolin Ke's avatar
Guolin Ke committed
53
54
        ++skip_bytes_;
      }
55
56
      if (read_c == '\n') {
        reader->Read(&read_c, 1);
Guolin Ke's avatar
Guolin Ke committed
57
58
        ++skip_bytes_;
      }
Guolin Ke's avatar
Guolin Ke committed
59
      first_line_ = str_buf.str();
60
      Log::Debug("Skipped header \"%s\" in file %s", first_line_.c_str(), filename_);
Guolin Ke's avatar
Guolin Ke committed
61
    }
Guolin Ke's avatar
Guolin Ke committed
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
  }
  /*!
  * \brief Destructor; releases the cached lines through Clear()
  */
  ~TextReader() {
    Clear();
  }
  /*!
  * \brief Clear cached data
  *
  * Empties the line cache and then asks the vector to give back its capacity.
  */
  inline void Clear() {
    lines_.clear();
    // clear() alone keeps the allocated capacity; request that it be released as well
    lines_.shrink_to_fit();
  }
  /*!
  * \brief Get the header line captured by the constructor
  * \return Copy of the first line without its line terminator; empty when no header was skipped
  */
  inline std::string first_line() {
    return std::string(first_line_);
  }
  /*!
  * \brief Access the cached lines that were read from file
  * \return Mutable reference to the line cache, one std::string per line
  */
  inline std::vector<std::string>& Lines() {
    return lines_;
  }

  /*!
  * \brief Read the file (after the skipped header bytes) and invoke a callback for every line
  *
  * Lines are split on '\n' and '\r'; runs of consecutive terminators are collapsed, so empty
  * lines are never reported. A line cut off at the end of a read buffer is kept in last_line_
  * and completed with the next buffer.
  * \param process_fun Called as (zero-based line index, pointer to line start, line length).
  *        The pointer refers to internal buffers, so copy the data if it must outlive the call.
  * \return Number of lines passed to process_fun
  */
  INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
    size_t bytes_read = 0;
    PipelineReader::Read(filename_, skip_bytes_,
        [&process_fun, &bytes_read, &total_cnt, this]
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      // skip the break between \r and \n (only look at the buffer if it holds at least one byte)
      if (read_cnt > 0 && last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            // finish the line whose beginning was carried over from the previous buffer
            last_line_.append(buffer_process + last_i, i - last_i);
            process_fun(total_cnt, last_line_.c_str(), last_line_.size());
            last_line_ = "";
          } else {
            process_fun(total_cnt, buffer_process + last_i, i - last_i);
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip end of line; the bounds check comes first so the byte one past the end is never read
          while (i < read_cnt && (buffer_process[i] == '\n' || buffer_process[i] == '\r')) { ++i; }
          last_i = i;
        } else {
          ++i;
        }
      }
      // keep the unterminated tail so it can be completed by the next buffer
      if (last_i != read_cnt) {
        last_line_.append(buffer_process + last_i, read_cnt - last_i);
      }

      size_t prev_bytes_read = bytes_read;
      bytes_read += read_cnt;
      // log whenever a multiple of the interval was crossed; an interval of 0 disables logging
      // instead of dividing by zero
      if (read_progress_interval_bytes_ > 0 &&
          prev_bytes_read / read_progress_interval_bytes_ < bytes_read / read_progress_interval_bytes_) {
        Log::Debug("Read %.1f GBs from %s.", 1.0 * bytes_read / kGbs, filename_);
      }

      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
      Log::Info("Warning: last line of %s has no end of line, still using this line", filename_);
      process_fun(total_cnt, last_line_.c_str(), last_line_.size());
      ++total_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }

  /*!
  * \brief Read all text data from file in memory
  *
  * Every line is appended to the line cache; the cache is not emptied first.
  * \return number of lines of text data
  */
  INDEX_T ReadAllLines() {
    // capture `this` explicitly rather than relying on the implicit capture of [=]
    return ReadAllAndProcess(
      [this](INDEX_T, const char* buffer, size_t size) {
      lines_.emplace_back(buffer, size);
    });
  }

155
156
157
  std::vector<char> ReadContent(size_t* out_len) {
    std::vector<char> ret;
    *out_len = 0;
158
159
    auto reader = VirtualFileReader::Make(filename_);
    if (!reader->Init()) {
160
161
162
163
164
165
      return ret;
    }
    const size_t buffer_size = 16 * 1024 * 1024;
    auto buffer_read = std::vector<char>(buffer_size);
    size_t read_cnt = 0;
    do {
166
      read_cnt = reader->Read(buffer_read.data(), buffer_size);
167
168
169
170
171
172
      ret.insert(ret.end(), buffer_read.begin(), buffer_read.begin() + read_cnt);
      *out_len += read_cnt;
    } while (read_cnt > 0);
    return ret;
  }

Guolin Ke's avatar
Guolin Ke committed
173
  /*!
  * \brief Sample lines from the file while reading it once
  *
  * The first sample_cnt lines are appended to out_sampled_data. For every later line a random
  * index is drawn and, if it is below sample_cnt, that sample is overwritten with the line.
  * \param random Random number generator used to draw the replacement index
  * \param sample_cnt Number of lines to keep
  * \param out_sampled_data Receives the sampled lines (appended to; not cleared here)
  * \return Total number of lines in the file
  */
  INDEX_T SampleFromFile(Random* random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
    INDEX_T num_kept = 0;
    auto on_line = [random, sample_cnt, out_sampled_data, &num_kept]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      // still filling up: take the line unconditionally
      if (num_kept < sample_cnt) {
        out_sampled_data->emplace_back(buffer, size);
        ++num_kept;
        return;
      }
      // NOTE(review): presumably NextInt(lo, hi) draws from [lo, hi) - confirm against Random
      const size_t slot = static_cast<size_t>(random->NextInt(0, static_cast<int>(line_idx + 1)));
      if (slot < static_cast<size_t>(sample_cnt)) {
        (*out_sampled_data)[slot] = std::string(buffer, size);
      }
    };
    return ReadAllAndProcess(on_line);
  }
  /*!
  * \brief Read part of text data from file in memory, use filter_fun to filter data
  *
  * Accepted lines are appended to the line cache; the cache is not emptied first.
  * \param filter_fun Predicate on the zero-based line index; true keeps the line
  * \param out_used_data_indices Cleared, then filled with the indices of the kept lines
  * \return The number of total data
  */
  INDEX_T ReadAndFilterLines(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices) {
    out_used_data_indices->clear();
    return ReadAllAndProcess(
        [this, &filter_fun, out_used_data_indices]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      if (!filter_fun(line_idx)) {
        return;
      }
      out_used_data_indices->push_back(line_idx);
      lines_.emplace_back(buffer, size);
    });
  }

  /*!
  * \brief Filter lines with filter_fun and sample from the accepted ones while reading the file once
  *
  * The first sample_cnt accepted lines are appended to out_sampled_data. For every later accepted
  * line a random index is drawn and, if it is below sample_cnt, that sample is overwritten.
  * \param filter_fun Predicate on the zero-based line index; true accepts the line
  * \param out_used_data_indices Cleared, then filled with the indices of all accepted lines
  * \param random Random number generator used to draw the replacement index
  * \param sample_cnt Number of lines to keep
  * \param out_sampled_data Receives the sampled lines (appended to; not cleared here)
  * \return Total number of lines in the file
  */
  INDEX_T SampleAndFilterFromFile(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices,
    Random* random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
    out_used_data_indices->clear();
    INDEX_T num_kept = 0;
    auto on_line = [&filter_fun, &num_kept, out_used_data_indices, random, sample_cnt, out_sampled_data]
    (INDEX_T line_idx, const char* buffer, size_t size) {
      if (!filter_fun(line_idx)) {
        return;
      }
      out_used_data_indices->push_back(line_idx);
      // still filling up: take the accepted line unconditionally
      if (num_kept < sample_cnt) {
        out_sampled_data->emplace_back(buffer, size);
        ++num_kept;
        return;
      }
      // the upper bound is the number of accepted lines so far, including this one
      const size_t slot = static_cast<size_t>(random->NextInt(0, static_cast<int>(out_used_data_indices->size())));
      if (slot < static_cast<size_t>(sample_cnt)) {
        (*out_sampled_data)[slot] = std::string(buffer, size);
      }
    };
    return ReadAllAndProcess(on_line);
  }

  /*!
  * \brief Count the lines of the file without storing them
  * \return Number of lines reported by ReadAllAndProcess
  */
  INDEX_T CountLine() {
    // the callback needs no state, so it captures nothing
    return ReadAllAndProcess(
      [](INDEX_T, const char*, size_t) {
    });
  }

238
  /*!
  * \brief Read the file (after the skipped header bytes) and hand the accepted lines to a callback in batches
  *
  * Lines are split on '\n' and '\r'; runs of consecutive terminators are collapsed, so empty
  * lines are never reported. Accepted lines of one read buffer are collected in the line cache,
  * passed to process_fun, and the cache is cleared afterwards. process_fun is called once per
  * read buffer, even when no line of that buffer was accepted.
  * NOTE(review): anything already stored in the line cache before this call is passed along
  * with the first batch - confirm callers start with an empty cache.
  * \param process_fun Called as (number of lines accepted before this batch, lines of the batch)
  * \param filter_fun Called as (number of lines accepted so far, zero-based line index); true accepts the line
  * \return Total number of lines in the file, accepted or not
  */
  INDEX_T ReadAllAndProcessParallelWithFilter(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun, const std::function<bool(INDEX_T, INDEX_T)>& filter_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
    size_t bytes_read = 0;
    INDEX_T used_cnt = 0;
    PipelineReader::Read(filename_, skip_bytes_,
        [&process_fun, &filter_fun, &total_cnt, &bytes_read, &used_cnt, this]
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      INDEX_T start_idx = used_cnt;
      // skip the break between \r and \n (only look at the buffer if it holds at least one byte)
      if (read_cnt > 0 && last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            // finish the line whose beginning was carried over from the previous buffer
            last_line_.append(buffer_process + last_i, i - last_i);
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.push_back(last_line_);
              ++used_cnt;
            }
            last_line_ = "";
          } else {
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.emplace_back(buffer_process + last_i, i - last_i);
              ++used_cnt;
            }
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip end of line; the bounds check comes first so the byte one past the end is never read
          while (i < read_cnt && (buffer_process[i] == '\n' || buffer_process[i] == '\r')) { ++i; }
          last_i = i;
        } else {
          ++i;
        }
      }
      process_fun(start_idx, lines_);
      lines_.clear();
      // keep the unterminated tail so it can be completed by the next buffer
      if (last_i != read_cnt) {
        last_line_.append(buffer_process + last_i, read_cnt - last_i);
      }

      size_t prev_bytes_read = bytes_read;
      bytes_read += read_cnt;
      // log whenever a multiple of the interval was crossed; an interval of 0 disables logging
      // instead of dividing by zero
      if (read_progress_interval_bytes_ > 0 &&
          prev_bytes_read / read_progress_interval_bytes_ < bytes_read / read_progress_interval_bytes_) {
        Log::Debug("Read %.1f GBs from %s.", 1.0 * bytes_read / kGbs, filename_);
      }

      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
      Log::Info("Warning: last line of %s has no end of line, still using this line", filename_);
      if (filter_fun(used_cnt, total_cnt)) {
        lines_.push_back(last_line_);
        process_fun(used_cnt, lines_);
        // count the line as used only when the filter accepted it, as in the loop above
        ++used_cnt;
      }
      lines_.clear();
      ++total_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }

  /*!
  * \brief Read every line of the file and hand the lines to process_fun in batches
  * \param process_fun Forwarded unchanged to ReadAllAndProcessParallelWithFilter
  * \return Total number of lines in the file
  */
  INDEX_T ReadAllAndProcessParallel(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    // filter that accepts every line
    const auto accept_all = [](INDEX_T, INDEX_T) { return true; };
    return ReadAllAndProcessParallelWithFilter(process_fun, accept_all);
  }

  /*!
  * \brief Read only the lines listed in used_data_indices and hand them to process_fun in batches
  *
  * A line is accepted when its index equals the next not-yet-matched entry of used_data_indices.
  * Matching walks the vector front to back against increasing line indices, so the indices are
  * expected to be sorted ascending.
  * \param used_data_indices Zero-based indices of the lines to keep
  * \param process_fun Forwarded unchanged to ReadAllAndProcessParallelWithFilter
  * \return Total number of lines in the file
  */
  INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    const auto is_wanted = [&used_data_indices](INDEX_T used_cnt, INDEX_T total_cnt) {
      // the size check comes first so the vector is never indexed past its end
      return static_cast<size_t>(used_cnt) < used_data_indices.size() &&
             total_cnt == used_data_indices[used_cnt];
    };
    return ReadAllAndProcessParallelWithFilter(process_fun, is_wanted);
  }

Nikita Titov's avatar
Nikita Titov committed
324
 private:
Guolin Ke's avatar
Guolin Ke committed
325
326
327
328
329
330
  /*! \brief Filename of text data; non-owning pointer to the string passed to the constructor */
  const char* filename_;
  /*! \brief Cache the read text data, one entry per line */
  std::vector<std::string> lines_;
  /*! \brief Buffer for last line: holds the beginning of a line that was cut off at the end of a read buffer */
  std::string last_line_;
Guolin Ke's avatar
Guolin Ke committed
331
  /*! \brief first line */
332
  std::string first_line_ = "";
Guolin Ke's avatar
Guolin Ke committed
333
334
  /*! \brief is skip first line */
  bool is_skip_first_line_ = false;
335
  /*! \brief Byte interval between read-progress debug messages */
  size_t read_progress_interval_bytes_;
Guolin Ke's avatar
Guolin Ke committed
336
337
  /*! \brief Number of leading bytes to skip when reading: the header line plus its line terminator, 0 when no header is skipped */
  int skip_bytes_ = 0;
Guolin Ke's avatar
Guolin Ke committed
338
339
340
341
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
342
#endif   // LIGHTGBM_UTILS_TEXT_READER_H_