Unverified Commit dc584070 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

fix bug in parser

parent fdb39237
...@@ -112,7 +112,10 @@ std::vector<std::string> ReadKLineFromFile(const char* filename, bool header, in ...@@ -112,7 +112,10 @@ std::vector<std::string> ReadKLineFromFile(const char* filename, bool header, in
for (int i = 0; i < k; ++i) { for (int i = 0; i < k; ++i) {
if (!tmp_file.eof()) { if (!tmp_file.eof()) {
GetLine(&tmp_file, &cur_line, reader.get(), &buffer, buffer_size); GetLine(&tmp_file, &cur_line, reader.get(), &buffer, buffer_size);
ret.push_back(cur_line); cur_line = Common::Trim(cur_line);
if (!cur_line.empty()) {
ret.push_back(cur_line);
}
} else { } else {
break; break;
} }
...@@ -142,32 +145,32 @@ DataType GetDataType(const std::vector<std::string>& lines, int* num_col) { ...@@ -142,32 +145,32 @@ DataType GetDataType(const std::vector<std::string>& lines, int* num_col) {
} else if (comma_cnt > 0) { } else if (comma_cnt > 0) {
type = DataType::CSV; type = DataType::CSV;
} }
} } else if (lines.size() > 1) {
int comma_cnt2 = 0; int comma_cnt2 = 0;
int tab_cnt2 = 0; int tab_cnt2 = 0;
int colon_cnt2 = 0; int colon_cnt2 = 0;
GetStatistic(lines[1].c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2); GetStatistic(lines[1].c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2);
if (colon_cnt > 0 || colon_cnt2 > 0) { if (colon_cnt > 0 || colon_cnt2 > 0) {
type = DataType::LIBSVM; type = DataType::LIBSVM;
} else if (tab_cnt == tab_cnt2 && tab_cnt > 0) { } else if (tab_cnt == tab_cnt2 && tab_cnt > 0) {
type = DataType::TSV; type = DataType::TSV;
} else if (comma_cnt == comma_cnt2 && comma_cnt > 0) { } else if (comma_cnt == comma_cnt2 && comma_cnt > 0) {
type = DataType::CSV; type = DataType::CSV;
} }
if (type == DataType::TSV || type == DataType::CSV) { if (type == DataType::TSV || type == DataType::CSV) {
// valid the type // valid the type
for (size_t i = 2; i < lines.size(); ++i) { for (size_t i = 2; i < lines.size(); ++i) {
GetStatistic(lines[i].c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2); GetStatistic(lines[i].c_str(), &comma_cnt2, &tab_cnt2, &colon_cnt2);
if (type == DataType::TSV && tab_cnt2 != tab_cnt) { if (type == DataType::TSV && tab_cnt2 != tab_cnt) {
type = DataType::INVALID; type = DataType::INVALID;
break; break;
} else if (type == DataType::CSV && comma_cnt != comma_cnt2) { } else if (type == DataType::CSV && comma_cnt != comma_cnt2) {
type = DataType::INVALID; type = DataType::INVALID;
break; break;
}
} }
} }
} }
if (type == DataType::LIBSVM) { if (type == DataType::LIBSVM) {
int max_col_idx = 0; int max_col_idx = 0;
for (size_t i = 0; i < lines.size(); ++i) { for (size_t i = 0; i < lines.size(); ++i) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment