parser.hpp 3.13 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
#ifndef LIGHTGBM_IO_PARSER_HPP_
#define LIGHTGBM_IO_PARSER_HPP_

4
#include <LightGBM/dataset.h>
Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
9
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

#include <unordered_map>
#include <utility>
10
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
11
12
13
14

namespace LightGBM {

class CSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
15
 public:
Guolin Ke's avatar
Guolin Ke committed
16
17
  explicit CSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
18
  }
Guolin Ke's avatar
Guolin Ke committed
19
  inline void ParseOneLine(const char* str,
20
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
21
    int idx = 0;
22
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
23
24
    int bias = 0;
    *out_label = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
25
26
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
27
28
29
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
30
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
31
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
32
      }
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
      ++idx;
      if (*str == ',') {
        ++str;
      } else if (*str != '\0') {
37
        Log::Fatal("Input format error when parsing as CSV");
Guolin Ke's avatar
Guolin Ke committed
38
39
40
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
41
42
43
44

  inline int TotalColumns() const override {
    return total_columns_;
  }
45

Nikita Titov's avatar
Nikita Titov committed
46
 private:
Guolin Ke's avatar
Guolin Ke committed
47
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
48
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
49
50
51
};

class TSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
52
 public:
Guolin Ke's avatar
Guolin Ke committed
53
54
  explicit TSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
55
  }
56
  inline void ParseOneLine(const char* str,
57
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
58
    int idx = 0;
59
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
60
    int bias = 0;
Guolin Ke's avatar
Guolin Ke committed
61
62
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
63
64
65
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
Guolin Ke's avatar
Guolin Ke committed
66
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
67
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
68
      }
Guolin Ke's avatar
Guolin Ke committed
69
70
71
72
      ++idx;
      if (*str == '\t') {
        ++str;
      } else if (*str != '\0') {
73
        Log::Fatal("Input format error when parsing as TSV");
Guolin Ke's avatar
Guolin Ke committed
74
75
76
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
77
78
79
80

  inline int TotalColumns() const override {
    return total_columns_;
  }
81

Nikita Titov's avatar
Nikita Titov committed
82
 private:
Guolin Ke's avatar
Guolin Ke committed
83
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
84
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
85
86
87
};

class LibSVMParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
88
 public:
Guolin Ke's avatar
Guolin Ke committed
89
90
91
  explicit LibSVMParser(int label_idx)
    :label_idx_(label_idx) {
    if (label_idx > 0) {
92
      Log::Fatal("Label should be the first column in a LibSVM file");
Guolin Ke's avatar
Guolin Ke committed
93
94
    }
  }
95
  inline void ParseOneLine(const char* str,
96
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
97
    int idx = 0;
98
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
103
    if (label_idx_ == 0) {
      str = Common::Atof(str, &val);
      *out_label = val;
      str = Common::SkipSpaceAndTab(str);
    }
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
110
111
    while (*str != '\0') {
      str = Common::Atoi(str, &idx);
      str = Common::SkipSpaceAndTab(str);
      if (*str == ':') {
        ++str;
        str = Common::Atof(str, &val);
        out_features->emplace_back(idx, val);
      } else {
112
        Log::Fatal("Input format error when parsing as LibSVM");
Guolin Ke's avatar
Guolin Ke committed
113
114
115
116
      }
      str = Common::SkipSpaceAndTab(str);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120

  inline int TotalColumns() const override {
    return -1;
  }
121

Nikita Titov's avatar
Nikita Titov committed
122
 private:
Guolin Ke's avatar
Guolin Ke committed
123
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
124
};
Guolin Ke's avatar
Guolin Ke committed
125

Guolin Ke's avatar
Guolin Ke committed
126
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
127
#endif   // LIGHTGBM_IO_PARSER_HPP_