pipeline_reader.h 1.81 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
#ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_
#define LIGHTGBM_UTILS_PIPELINE_READER_H_

#include <LightGBM/utils/log.h>

#include <cstdio>

#include <functional>
#include <thread>
Guolin Ke's avatar
Guolin Ke committed
10
#include <memory>
Guolin Ke's avatar
Guolin Ke committed
11
#include <algorithm>
12
#include <vector>
13
#include "file_io.h"
Guolin Ke's avatar
Guolin Ke committed
14

15
namespace LightGBM {
Guolin Ke's avatar
Guolin Ke committed
16
17
18
19
20
21
22
23
24
25
26

/*!
* \brief A pipeline file reader, use 2 threads, one read block from file, the other process the block
*/
class PipelineReader {
public:
  /*!
  * \brief Read data from a file, use pipeline methods
  * \param filename Filename of data
  * \process_fun Process function
  */
27
  static size_t Read(const char* filename, int skip_bytes, const std::function<size_t(const char*, size_t)>& process_fun) {
28
29
    auto reader = VirtualFileReader::Make(filename);
    if (!reader->Init()) {
Guolin Ke's avatar
Guolin Ke committed
30
31
32
      return 0;
    }
    size_t cnt = 0;
33
    const size_t buffer_size =  16 * 1024 * 1024;
Guolin Ke's avatar
Guolin Ke committed
34
    // buffer used for the process_fun
Guolin Ke's avatar
Guolin Ke committed
35
    auto buffer_process = std::vector<char>(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
36
    // buffer used for the file reading
Guolin Ke's avatar
Guolin Ke committed
37
    auto buffer_read = std::vector<char>(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
38
    size_t read_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
39
40
    if (skip_bytes > 0) {
      // skip first k bytes
41
      read_cnt = reader->Read(buffer_process.data(), skip_bytes);
Guolin Ke's avatar
Guolin Ke committed
42
    }
Guolin Ke's avatar
Guolin Ke committed
43
    // read first block
44
45
    read_cnt = reader->Read(buffer_process.data(), buffer_size);

Guolin Ke's avatar
Guolin Ke committed
46
47
    size_t last_read_cnt = 0;
    while (read_cnt > 0) {
48
      // start read thread
Guolin Ke's avatar
Guolin Ke committed
49
      std::thread read_worker = std::thread(
50
        [&] {
51
        last_read_cnt = reader->Read(buffer_read.data(), buffer_size);
52
      });
Guolin Ke's avatar
Guolin Ke committed
53
      // start process
Guolin Ke's avatar
Guolin Ke committed
54
      cnt += process_fun(buffer_process.data(), read_cnt);
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
59
60
61
62
63
64
65
66
      // wait for read thread
      read_worker.join();
      // exchange the buffer
      std::swap(buffer_process, buffer_read);
      read_cnt = last_read_cnt;
    }
    return cnt;
  }
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
67
#endif   // LIGHTGBM_UTILS_PIPELINE_READER_H_