parser.cpp 3.29 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
#include "parser.hpp"

#include <iostream>
#include <fstream>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
8
// Counts how many commas, tabs and colons occur in a NUL-terminated string.
//   str       - text to scan; read up to (not including) the terminating '\0'
//   comma_cnt - receives the number of ',' characters
//   tab_cnt   - receives the number of '\t' characters
//   colon_cnt - receives the number of ':' characters
// All three outputs are reset to zero before scanning.
void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) {
  *comma_cnt = 0;
  *tab_cnt = 0;
  *colon_cnt = 0;
  for (const char* cursor = str; *cursor != '\0'; ++cursor) {
    switch (*cursor) {
      case ',':
        ++(*comma_cnt);
        break;
      case '\t':
        ++(*tab_cnt);
        break;
      case ':':
        ++(*colon_cnt);
        break;
      default:
        // Any other character is irrelevant for format detection.
        break;
    }
  }
}

23
24
25
26
27
28
29
30
31
32
33
34
35
36
// Decides whether a LibSVM-style line is reported as carrying a label.
// The line is first replaced by the result of Common::Trim (so the caller's
// string is modified). Returns true when the line contains no ':' at all, or
// when its first ':' appears after its first whitespace character; returns
// false otherwise (including when a ':' exists but there is no whitespace).
bool CheckHasLabelForLibsvm(std::string& str) {
  str = Common::Trim(str);
  const auto first_blank = str.find_first_of(" \f\n\r\t\v");
  const auto first_colon = str.find_first_of(":");
  // When there is no whitespace first_blank is npos (the largest value), so
  // the second comparison can never be true in that case.
  return first_colon == std::string::npos || first_colon > first_blank;
}

// Decides whether a tab-separated line is reported as carrying a label.
// The line is first replaced by the result of Common::Trim (so the caller's
// string is modified) and then split on '\t'. Returns false when the number
// of fields equals num_features, true for any other field count.
bool CheckHasLabelForTSV(std::string& str, int num_features) {
  str = Common::Trim(str);
  const auto fields = Common::Split(str.c_str(), '\t');
  const int field_count = static_cast<int>(fields.size());
  return field_count != num_features;
}

// Decides whether a comma-separated line is reported as carrying a label.
// The line is first replaced by the result of Common::Trim (so the caller's
// string is modified) and then split on ','. Returns false when the number
// of fields equals num_features, true for any other field count.
bool CheckHasLabelForCSV(std::string& str, int num_features) {
  str = Common::Trim(str);
  const auto fields = Common::Split(str.c_str(), ',');
  const int field_count = static_cast<int>(fields.size());
  return field_count != num_features;
}

// Guesses the text format of a data file from its first two lines and creates
// the matching parser.
//
// The commas, tabs and colons on each of the two lines are counted and the
// format is chosen in this order:
//   1. LibSVM - at least one ':' on either line;
//   2. TSV    - at least one tab on the first line and, if a second line
//               exists, the same number of tabs on it;
//   3. CSV    - at least one comma on the first line and, if a second line
//               exists, the same number of commas on it.
//
//   filename     - path of the data file to inspect
//   num_features - expected number of feature columns; label detection is
//                  only attempted when this is positive
//   has_label    - optional output (may be nullptr); when label detection is
//                  attempted it receives the result of the matching
//                  CheckHasLabelFor* helper applied to the first line
//
// Returns a parser allocated with new, or nullptr when none of the formats is
// recognized. NOTE(review): the caller presumably owns the returned pointer -
// confirm against call sites.
Parser* Parser::CreateParser(const char* filename, int num_features, bool* has_label) {
  std::ifstream tmp_file;
  tmp_file.open(filename);
  if (!tmp_file.is_open()) {
    Log::Fatal("Data file: %s doesn't exist", filename);
  }
  std::string line1, line2;
  // Test the read itself rather than eof() beforehand: eof() is only set once
  // a read has actually hit end-of-file, so it cannot detect an empty file.
  if (!std::getline(tmp_file, line1)) {
    Log::Fatal("Data file: %s at least should have one line", filename);
  }
  // Same reasoning: this reports a one-line file whether or not its single
  // line ends with a newline.
  if (!std::getline(tmp_file, line2)) {
    Log::Error("Data file: %s only have one line", filename);
  }
  tmp_file.close();

  int comma_cnt = 0, comma_cnt2 = 0;
  int tab_cnt = 0, tab_cnt2 = 0;
  int colon_cnt = 0, colon_cnt2 = 0;
  // Get some statistic from 2 line
  GetStatistic(line1.c_str(), &comma_cnt, &tab_cnt, &colon_cnt);
  GetStatistic(line2.c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2);

  // With no (or an empty) second line there is nothing to compare against, so
  // the delimiter counts of the first line alone decide the format.
  const bool single_line = line2.empty();
  const bool detect_label = num_features > 0 && has_label != nullptr;

  Parser* ret = nullptr;
  if (colon_cnt > 0 || colon_cnt2 > 0) {
    // colon_cnt2 is 0 for an empty second line, so this also covers the
    // single-line case.
    ret = new LibSVMParser();
    if (detect_label) {
      *has_label = CheckHasLabelForLibsvm(line1);
    }
  } else if (tab_cnt > 0 && (single_line || tab_cnt == tab_cnt2)) {
    ret = new TSVParser();
    if (detect_label) {
      *has_label = CheckHasLabelForTSV(line1, num_features);
    }
  } else if (comma_cnt > 0 && (single_line || comma_cnt == comma_cnt2)) {
    ret = new CSVParser();
    if (detect_label) {
      *has_label = CheckHasLabelForCSV(line1, num_features);
    }
  }
  return ret;
}

}  // namespace LightGBM