/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_IO_PARSER_HPP_
#define LIGHTGBM_IO_PARSER_HPP_

8
9
10
11
#include <LightGBM/dataset.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

12
13
14
15
#include <unordered_map>
#include <utility>
#include <vector>

Guolin Ke's avatar
Guolin Ke committed
16
17
18
namespace LightGBM {

class CSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
19
 public:
Chen Yufei's avatar
Chen Yufei committed
20
21
  explicit CSVParser(int label_idx, int total_columns, AtofFunc atof)
    :label_idx_(label_idx), total_columns_(total_columns), atof_(atof) {
Guolin Ke's avatar
Guolin Ke committed
22
  }
Guolin Ke's avatar
Guolin Ke committed
23
  inline void ParseOneLine(const char* str,
24
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
25
    int idx = 0;
26
    double val = 0.0f;
27
    int offset = 0;
Guolin Ke's avatar
Guolin Ke committed
28
    *out_label = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
29
    while (*str != '\0') {
Chen Yufei's avatar
Chen Yufei committed
30
      str = atof_(str, &val);
Guolin Ke's avatar
Guolin Ke committed
31
32
      if (idx == label_idx_) {
        *out_label = val;
33
        offset = -1;
34
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
35
        out_features->emplace_back(idx + offset, val);
Guolin Ke's avatar
Guolin Ke committed
36
      }
Guolin Ke's avatar
Guolin Ke committed
37
38
39
40
      ++idx;
      if (*str == ',') {
        ++str;
      } else if (*str != '\0') {
41
        Log::Fatal("Input format error when parsing as CSV");
Guolin Ke's avatar
Guolin Ke committed
42
43
44
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
45

46
47
  inline int NumFeatures() const override {
    return total_columns_ - (label_idx_ >= 0);
Guolin Ke's avatar
Guolin Ke committed
48
  }
49

Nikita Titov's avatar
Nikita Titov committed
50
 private:
Guolin Ke's avatar
Guolin Ke committed
51
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
52
  int total_columns_ = -1;
Chen Yufei's avatar
Chen Yufei committed
53
  AtofFunc atof_;
Guolin Ke's avatar
Guolin Ke committed
54
55
56
};

class TSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
57
 public:
Chen Yufei's avatar
Chen Yufei committed
58
59
  explicit TSVParser(int label_idx, int total_columns, AtofFunc atof)
    :label_idx_(label_idx), total_columns_(total_columns), atof_(atof) {
Guolin Ke's avatar
Guolin Ke committed
60
  }
61
  inline void ParseOneLine(const char* str,
62
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
63
    int idx = 0;
64
    double val = 0.0f;
65
    int offset = 0;
Guolin Ke's avatar
Guolin Ke committed
66
    while (*str != '\0') {
Chen Yufei's avatar
Chen Yufei committed
67
      str = atof_(str, &val);
Guolin Ke's avatar
Guolin Ke committed
68
69
      if (idx == label_idx_) {
        *out_label = val;
70
        offset = -1;
Guolin Ke's avatar
Guolin Ke committed
71
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
72
        out_features->emplace_back(idx + offset, val);
Guolin Ke's avatar
Guolin Ke committed
73
      }
Guolin Ke's avatar
Guolin Ke committed
74
75
76
77
      ++idx;
      if (*str == '\t') {
        ++str;
      } else if (*str != '\0') {
78
        Log::Fatal("Input format error when parsing as TSV");
Guolin Ke's avatar
Guolin Ke committed
79
80
81
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
82

83
84
  inline int NumFeatures() const override {
    return total_columns_ - (label_idx_ >= 0);
Guolin Ke's avatar
Guolin Ke committed
85
  }
86

Nikita Titov's avatar
Nikita Titov committed
87
 private:
Guolin Ke's avatar
Guolin Ke committed
88
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
89
  int total_columns_ = -1;
Chen Yufei's avatar
Chen Yufei committed
90
  AtofFunc atof_;
Guolin Ke's avatar
Guolin Ke committed
91
92
93
};

class LibSVMParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
94
 public:
Chen Yufei's avatar
Chen Yufei committed
95
96
  explicit LibSVMParser(int label_idx, int total_columns, AtofFunc atof)
    :label_idx_(label_idx), total_columns_(total_columns), atof_(atof) {
Guolin Ke's avatar
Guolin Ke committed
97
    if (label_idx > 0) {
98
      Log::Fatal("Label should be the first column in a LibSVM file");
Guolin Ke's avatar
Guolin Ke committed
99
100
    }
  }
101
  inline void ParseOneLine(const char* str,
102
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
103
    int idx = 0;
104
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
105
    if (label_idx_ == 0) {
Chen Yufei's avatar
Chen Yufei committed
106
      str = atof_(str, &val);
Guolin Ke's avatar
Guolin Ke committed
107
108
109
      *out_label = val;
      str = Common::SkipSpaceAndTab(str);
    }
Guolin Ke's avatar
Guolin Ke committed
110
111
112
113
114
115
116
117
    while (*str != '\0') {
      str = Common::Atoi(str, &idx);
      str = Common::SkipSpaceAndTab(str);
      if (*str == ':') {
        ++str;
        str = Common::Atof(str, &val);
        out_features->emplace_back(idx, val);
      } else {
118
        Log::Fatal("Input format error when parsing as LibSVM");
Guolin Ke's avatar
Guolin Ke committed
119
120
121
122
      }
      str = Common::SkipSpaceAndTab(str);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
123

124
125
  inline int NumFeatures() const override {
    return total_columns_;
Guolin Ke's avatar
Guolin Ke committed
126
  }
127

Nikita Titov's avatar
Nikita Titov committed
128
 private:
Guolin Ke's avatar
Guolin Ke committed
129
  int label_idx_ = 0;
130
  int total_columns_ = -1;
Chen Yufei's avatar
Chen Yufei committed
131
  AtofFunc atof_;
Guolin Ke's avatar
Guolin Ke committed
132
};
Guolin Ke's avatar
Guolin Ke committed
133

Guolin Ke's avatar
Guolin Ke committed
134
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
135
#endif   // LIGHTGBM_IO_PARSER_HPP_