/*!
* \file text_reader.h
* \brief Line-oriented reader for text data files
*/
#ifndef LIGHTGBM_UTILS_TEXT_READER_H_
#define LIGHTGBM_UTILS_TEXT_READER_H_

#include <LightGBM/utils/pipeline_reader.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/random.h>

#include <cstdio>
#include <functional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

namespace LightGBM {

/*!
* \brief Read text data from file
*/
template<typename INDEX_T>
class TextReader {
Nikita Titov's avatar
Nikita Titov committed
22
 public:
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  /*!
  * \brief Constructor
  * \param filename Filename of data
Guolin Ke's avatar
Guolin Ke committed
26
  * \param is_skip_first_line True if need to skip header
Guolin Ke's avatar
Guolin Ke committed
27
  */
Guolin Ke's avatar
Guolin Ke committed
28
  TextReader(const char* filename, bool is_skip_first_line):
29
    filename_(filename), is_skip_first_line_(is_skip_first_line) {
Guolin Ke's avatar
Guolin Ke committed
30
    if (is_skip_first_line_) {
31
32
      auto reader = VirtualFileReader::Make(filename);
      if (!reader->Init()) {
33
        Log::Fatal("Could not open %s", filename);
Guolin Ke's avatar
Guolin Ke committed
34
      }
Guolin Ke's avatar
Guolin Ke committed
35
      std::stringstream str_buf;
36
37
38
39
      char read_c;
      size_t nread = reader->Read(&read_c, 1);
      while (nread == 1) {
        if (read_c == '\n' || read_c == '\r') {
Guolin Ke's avatar
Guolin Ke committed
40
41
          break;
        }
42
        str_buf << read_c;
Guolin Ke's avatar
Guolin Ke committed
43
        ++skip_bytes_;
44
        nread = reader->Read(&read_c, 1);
Guolin Ke's avatar
Guolin Ke committed
45
      }
46
47
      if (read_c == '\r') {
        reader->Read(&read_c, 1);
Guolin Ke's avatar
Guolin Ke committed
48
49
        ++skip_bytes_;
      }
50
51
      if (read_c == '\n') {
        reader->Read(&read_c, 1);
Guolin Ke's avatar
Guolin Ke committed
52
53
        ++skip_bytes_;
      }
Guolin Ke's avatar
Guolin Ke committed
54
      first_line_ = str_buf.str();
55
      Log::Debug("Skipped header \"%s\" in file %s", first_line_.c_str(), filename_);
Guolin Ke's avatar
Guolin Ke committed
56
    }
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  }
  /*!
  * \brief Destructor
  */
  ~TextReader() {
    Clear();
  }
  /*!
  * \brief Clear cached data
  */
  inline void Clear() {
    lines_.clear();
    lines_.shrink_to_fit();
  }
  /*!
Guolin Ke's avatar
Guolin Ke committed
72
73
74
75
76
77
  * \brief return first line of data
  */
  inline std::string first_line() {
    return first_line_;
  }
  /*!
Guolin Ke's avatar
Guolin Ke committed
78
79
80
81
82
83
84
85
  * \brief Get text data that read from file
  * \return Text data, store in std::vector by line
  */
  inline std::vector<std::string>& Lines() { return lines_; }

  INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
    last_line_ = "";
    INDEX_T total_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
86
    PipelineReader::Read(filename_, skip_bytes_,
87
      [&]
Guolin Ke's avatar
Guolin Ke committed
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      // skip the break between \r and \n
      if (last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            last_line_.append(buffer_process + last_i, i - last_i);
            process_fun(total_cnt, last_line_.c_str(), last_line_.size());
            last_line_ = "";
103
          } else {
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
            process_fun(total_cnt, buffer_process + last_i, i - last_i);
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip end of line
110
          while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
Guolin Ke's avatar
Guolin Ke committed
111
          last_i = i;
112
        } else {
Guolin Ke's avatar
Guolin Ke committed
113
114
115
116
          ++i;
        }
      }
      if (last_i != read_cnt) {
117
        last_line_.append(buffer_process + last_i, read_cnt - last_i);
Guolin Ke's avatar
Guolin Ke committed
118
119
120
121
122
      }
      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
123
      Log::Info("Warning: last line of %s has no end of line, still using this line", filename_);
Guolin Ke's avatar
Guolin Ke committed
124
125
126
127
128
129
130
131
132
133
134
135
136
      process_fun(total_cnt, last_line_.c_str(), last_line_.size());
      ++total_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }

  /*!
  * \brief Read all text data from file in memory
  * \return number of lines of text data
  */
  INDEX_T ReadAllLines() {
    return ReadAllAndProcess(
137
      [=](INDEX_T, const char* buffer, size_t size) {
Guolin Ke's avatar
Guolin Ke committed
138
139
140
141
      lines_.emplace_back(buffer, size);
    });
  }

142
143
144
  std::vector<char> ReadContent(size_t* out_len) {
    std::vector<char> ret;
    *out_len = 0;
145
146
    auto reader = VirtualFileReader::Make(filename_);
    if (!reader->Init()) {
147
148
149
150
151
152
      return ret;
    }
    const size_t buffer_size = 16 * 1024 * 1024;
    auto buffer_read = std::vector<char>(buffer_size);
    size_t read_cnt = 0;
    do {
153
      read_cnt = reader->Read(buffer_read.data(), buffer_size);
154
155
156
157
158
159
      ret.insert(ret.end(), buffer_read.begin(), buffer_read.begin() + read_cnt);
      *out_len += read_cnt;
    } while (read_cnt > 0);
    return ret;
  }

Guolin Ke's avatar
Guolin Ke committed
160
161
162
  INDEX_T SampleFromFile(Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
    INDEX_T cur_sample_cnt = 0;
    return ReadAllAndProcess(
163
      [&]
Guolin Ke's avatar
Guolin Ke committed
164
165
166
167
    (INDEX_T line_idx, const char* buffer, size_t size) {
      if (cur_sample_cnt < sample_cnt) {
        out_sampled_data->emplace_back(buffer, size);
        ++cur_sample_cnt;
168
      } else {
Guolin Ke's avatar
Guolin Ke committed
169
        const size_t idx = static_cast<size_t>(random.NextInt(0, static_cast<int>(line_idx + 1)));
170
        if (idx < static_cast<size_t>(sample_cnt)) {
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175
176
177
178
179
180
181
182
183
184
          out_sampled_data->operator[](idx) = std::string(buffer, size);
        }
      }
    });
  }
  /*!
  * \brief Read part of text data from file in memory, use filter_fun to filter data
  * \param filter_fun Function that perform data filter
  * \param out_used_data_indices Store line indices that read text data
  * \return The number of total data
  */
  INDEX_T ReadAndFilterLines(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices) {
    out_used_data_indices->clear();
    INDEX_T total_cnt = ReadAllAndProcess(
185
      [&]
Guolin Ke's avatar
Guolin Ke committed
186
187
188
189
190
191
192
193
194
    (INDEX_T line_idx , const char* buffer, size_t size) {
      bool is_used = filter_fun(line_idx);
      if (is_used) { out_used_data_indices->push_back(line_idx); }
      if (is_used) { lines_.emplace_back(buffer, size); }
    });
    return total_cnt;
  }

  INDEX_T SampleAndFilterFromFile(const std::function<bool(INDEX_T)>& filter_fun, std::vector<INDEX_T>* out_used_data_indices,
195
    Random& random, INDEX_T sample_cnt, std::vector<std::string>* out_sampled_data) {
Guolin Ke's avatar
Guolin Ke committed
196
197
198
    INDEX_T cur_sample_cnt = 0;
    out_used_data_indices->clear();
    INDEX_T total_cnt = ReadAllAndProcess(
199
      [&]
Guolin Ke's avatar
Guolin Ke committed
200
201
202
203
204
205
206
    (INDEX_T line_idx, const char* buffer, size_t size) {
      bool is_used = filter_fun(line_idx);
      if (is_used) { out_used_data_indices->push_back(line_idx); }
      if (is_used) {
        if (cur_sample_cnt < sample_cnt) {
          out_sampled_data->emplace_back(buffer, size);
          ++cur_sample_cnt;
207
        } else {
Guolin Ke's avatar
Guolin Ke committed
208
          const size_t idx = static_cast<size_t>(random.NextInt(0, static_cast<int>(out_used_data_indices->size())));
209
          if (idx < static_cast<size_t>(sample_cnt)) {
Guolin Ke's avatar
Guolin Ke committed
210
211
212
213
214
215
216
217
218
219
            out_sampled_data->operator[](idx) = std::string(buffer, size);
          }
        }
      }
    });
    return total_cnt;
  }

  INDEX_T CountLine() {
    return ReadAllAndProcess(
220
      [=](INDEX_T, const char*, size_t) {
Guolin Ke's avatar
Guolin Ke committed
221
222
223
    });
  }

224
  INDEX_T ReadAllAndProcessParallelWithFilter(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun, const std::function<bool(INDEX_T, INDEX_T)>& filter_fun) {
Guolin Ke's avatar
Guolin Ke committed
225
226
227
    last_line_ = "";
    INDEX_T total_cnt = 0;
    INDEX_T used_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
228
    PipelineReader::Read(filename_, skip_bytes_,
229
      [&]
Guolin Ke's avatar
Guolin Ke committed
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
    (const char* buffer_process, size_t read_cnt) {
      size_t cnt = 0;
      size_t i = 0;
      size_t last_i = 0;
      INDEX_T start_idx = used_cnt;
      // skip the break between \r and \n
      if (last_line_.size() == 0 && buffer_process[0] == '\n') {
        i = 1;
        last_i = i;
      }
      while (i < read_cnt) {
        if (buffer_process[i] == '\n' || buffer_process[i] == '\r') {
          if (last_line_.size() > 0) {
            last_line_.append(buffer_process + last_i, i - last_i);
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.push_back(last_line_);
              ++used_cnt;
            }
            last_line_ = "";
249
          } else {
Guolin Ke's avatar
Guolin Ke committed
250
251
252
253
254
255
256
257
258
            if (filter_fun(used_cnt, total_cnt)) {
              lines_.emplace_back(buffer_process + last_i, i - last_i);
              ++used_cnt;
            }
          }
          ++cnt;
          ++i;
          ++total_cnt;
          // skip end of line
259
          while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
Guolin Ke's avatar
Guolin Ke committed
260
          last_i = i;
261
        } else {
Guolin Ke's avatar
Guolin Ke committed
262
263
264
265
266
267
          ++i;
        }
      }
      process_fun(start_idx, lines_);
      lines_.clear();
      if (last_i != read_cnt) {
268
        last_line_.append(buffer_process + last_i, read_cnt - last_i);
Guolin Ke's avatar
Guolin Ke committed
269
270
271
272
273
      }
      return cnt;
    });
    // if last line of file doesn't contain end of line
    if (last_line_.size() > 0) {
274
      Log::Info("Warning: last line of %s has no end of line, still using this line", filename_);
Guolin Ke's avatar
Guolin Ke committed
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
      if (filter_fun(used_cnt, total_cnt)) {
        lines_.push_back(last_line_);
        process_fun(used_cnt, lines_);
      }
      lines_.clear();
      ++total_cnt;
      ++used_cnt;
      last_line_ = "";
    }
    return total_cnt;
  }

  INDEX_T ReadAllAndProcessParallel(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
    return ReadAllAndProcessParallelWithFilter(process_fun, [](INDEX_T, INDEX_T) { return true; });
  }

  INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
292
    return ReadAllAndProcessParallelWithFilter(process_fun,
293
      [&used_data_indices](INDEX_T used_cnt, INDEX_T total_cnt) {
294
      if (static_cast<size_t>(used_cnt) < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
Guolin Ke's avatar
Guolin Ke committed
295
        return true;
296
      } else {
Guolin Ke's avatar
Guolin Ke committed
297
298
299
300
301
        return false;
      }
    });
  }

Nikita Titov's avatar
Nikita Titov committed
302
 private:
Guolin Ke's avatar
Guolin Ke committed
303
304
305
306
307
308
  /*! \brief Filename of text data */
  const char* filename_;
  /*! \brief Cache the read text data */
  std::vector<std::string> lines_;
  /*! \brief Buffer for last line */
  std::string last_line_;
Guolin Ke's avatar
Guolin Ke committed
309
  /*! \brief first line */
310
  std::string first_line_ = "";
Guolin Ke's avatar
Guolin Ke committed
311
312
313
314
  /*! \brief is skip first line */
  bool is_skip_first_line_ = false;
  /*! \brief is skip first line */
  int skip_bytes_ = 0;
Guolin Ke's avatar
Guolin Ke committed
315
316
317
318
};

}  // namespace LightGBM

#endif   // LIGHTGBM_UTILS_TEXT_READER_H_