pipeline_reader.h 2.01 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_UTILS_PIPELINE_READER_H_
#define LIGHTGBM_UTILS_PIPELINE_READER_H_

8
#include <LightGBM/utils/file_io.h>
Guolin Ke's avatar
Guolin Ke committed
9
10
#include <LightGBM/utils/log.h>

11
#include <algorithm>
Guolin Ke's avatar
Guolin Ke committed
12
13
#include <cstdio>
#include <functional>
Guolin Ke's avatar
Guolin Ke committed
14
#include <memory>
15
16
#include <thread>
#include <utility>
17
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
18

19
namespace LightGBM {
Guolin Ke's avatar
Guolin Ke committed
20
21
22
23
24

/*!
* \brief A pipeline file reader, use 2 threads, one read block from file, the other process the block
*/
class PipelineReader {
Nikita Titov's avatar
Nikita Titov committed
25
 public:
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
  /*!
  * \brief Read data from a file, use pipeline methods
  * \param filename Filename of data
  * \process_fun Process function
  */
31
  static size_t Read(const char* filename, int skip_bytes, const std::function<size_t(const char*, size_t)>& process_fun) {
32
33
    auto reader = VirtualFileReader::Make(filename);
    if (!reader->Init()) {
Guolin Ke's avatar
Guolin Ke committed
34
35
36
      return 0;
    }
    size_t cnt = 0;
37
    const size_t buffer_size =  16 * 1024 * 1024;
Guolin Ke's avatar
Guolin Ke committed
38
    // buffer used for the process_fun
Guolin Ke's avatar
Guolin Ke committed
39
    auto buffer_process = std::vector<char>(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
40
    // buffer used for the file reading
Guolin Ke's avatar
Guolin Ke committed
41
    auto buffer_read = std::vector<char>(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
42
    size_t read_cnt = 0;
Guolin Ke's avatar
Guolin Ke committed
43
44
    if (skip_bytes > 0) {
      // skip first k bytes
45
      read_cnt = reader->Read(buffer_process.data(), skip_bytes);
Guolin Ke's avatar
Guolin Ke committed
46
    }
Guolin Ke's avatar
Guolin Ke committed
47
    // read first block
48
49
    read_cnt = reader->Read(buffer_process.data(), buffer_size);

Guolin Ke's avatar
Guolin Ke committed
50
51
    size_t last_read_cnt = 0;
    while (read_cnt > 0) {
52
      // start read thread
Guolin Ke's avatar
Guolin Ke committed
53
      std::thread read_worker = std::thread(
54
        [&] {
55
        last_read_cnt = reader->Read(buffer_read.data(), buffer_size);
56
      });
Guolin Ke's avatar
Guolin Ke committed
57
      // start process
Guolin Ke's avatar
Guolin Ke committed
58
      cnt += process_fun(buffer_process.data(), read_cnt);
Guolin Ke's avatar
Guolin Ke committed
59
60
61
62
63
64
65
66
67
68
69
70
      // wait for read thread
      read_worker.join();
      // exchange the buffer
      std::swap(buffer_process, buffer_read);
      read_cnt = last_read_cnt;
    }
    return cnt;
  }
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
71
#endif   // LIGHTGBM_UTILS_PIPELINE_READER_H_