parser.hpp 3.42 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_IO_PARSER_HPP_
#define LIGHTGBM_IO_PARSER_HPP_

8
#include <LightGBM/dataset.h>
Guolin Ke's avatar
Guolin Ke committed
9
10
11
12
13
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

#include <unordered_map>
#include <utility>
14
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
15
16
17
18

namespace LightGBM {

class CSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
19
 public:
Guolin Ke's avatar
Guolin Ke committed
20
21
  explicit CSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
22
  }
Guolin Ke's avatar
Guolin Ke committed
23
  inline void ParseOneLine(const char* str,
24
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
25
    int idx = 0;
26
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
27
28
    int bias = 0;
    *out_label = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
29
30
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
31
32
33
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
34
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
35
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
36
      }
Guolin Ke's avatar
Guolin Ke committed
37
38
39
40
      ++idx;
      if (*str == ',') {
        ++str;
      } else if (*str != '\0') {
41
        Log::Fatal("Input format error when parsing as CSV");
Guolin Ke's avatar
Guolin Ke committed
42
43
44
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
45

46
47
  inline int NumFeatures() const override {
    return total_columns_ - (label_idx_ >= 0);
Guolin Ke's avatar
Guolin Ke committed
48
  }
49

Nikita Titov's avatar
Nikita Titov committed
50
 private:
Guolin Ke's avatar
Guolin Ke committed
51
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
52
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
53
54
55
};

class TSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
56
 public:
Guolin Ke's avatar
Guolin Ke committed
57
58
  explicit TSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
59
  }
60
  inline void ParseOneLine(const char* str,
61
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
62
    int idx = 0;
63
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
64
    int bias = 0;
Guolin Ke's avatar
Guolin Ke committed
65
66
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
67
68
69
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
Guolin Ke's avatar
Guolin Ke committed
70
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
71
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
72
      }
Guolin Ke's avatar
Guolin Ke committed
73
74
75
76
      ++idx;
      if (*str == '\t') {
        ++str;
      } else if (*str != '\0') {
77
        Log::Fatal("Input format error when parsing as TSV");
Guolin Ke's avatar
Guolin Ke committed
78
79
80
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
81

82
83
  inline int NumFeatures() const override {
    return total_columns_ - (label_idx_ >= 0);
Guolin Ke's avatar
Guolin Ke committed
84
  }
85

Nikita Titov's avatar
Nikita Titov committed
86
 private:
Guolin Ke's avatar
Guolin Ke committed
87
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
88
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
89
90
91
};

class LibSVMParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
92
 public:
93
94
  explicit LibSVMParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
95
    if (label_idx > 0) {
96
      Log::Fatal("Label should be the first column in a LibSVM file");
Guolin Ke's avatar
Guolin Ke committed
97
98
    }
  }
99
  inline void ParseOneLine(const char* str,
100
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
101
    int idx = 0;
102
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
103
104
105
106
107
    if (label_idx_ == 0) {
      str = Common::Atof(str, &val);
      *out_label = val;
      str = Common::SkipSpaceAndTab(str);
    }
Guolin Ke's avatar
Guolin Ke committed
108
109
110
111
112
113
114
115
    while (*str != '\0') {
      str = Common::Atoi(str, &idx);
      str = Common::SkipSpaceAndTab(str);
      if (*str == ':') {
        ++str;
        str = Common::Atof(str, &val);
        out_features->emplace_back(idx, val);
      } else {
116
        Log::Fatal("Input format error when parsing as LibSVM");
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
      }
      str = Common::SkipSpaceAndTab(str);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
121

122
123
  inline int NumFeatures() const override {
    return total_columns_;
Guolin Ke's avatar
Guolin Ke committed
124
  }
125

Nikita Titov's avatar
Nikita Titov committed
126
 private:
Guolin Ke's avatar
Guolin Ke committed
127
  int label_idx_ = 0;
128
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
129
};
Guolin Ke's avatar
Guolin Ke committed
130

Guolin Ke's avatar
Guolin Ke committed
131
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
132
#endif   // LIGHTGBM_IO_PARSER_HPP_