Commit 308e6451 authored by Guolin Ke
Browse files

Support NaN and inf in the parser.

parent f3e37b9e
......@@ -80,6 +80,7 @@ inline static const char* Atoi(const char* p, int* out) {
inline static const char* Atof(const char* p, double* out) {
int frac;
double sign, value, scale;
// Skip leading white space, if any.
while (*p == ' ') {
++p;
......@@ -95,51 +96,85 @@ inline static const char* Atof(const char* p, double* out) {
++p;
}
// Get digits before decimal point or exponent, if any.
for (value = 0.0; *p >= '0' && *p <= '9'; ++p) {
value = value * 10.0 + (*p - '0');
}
// is a number
if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') {
// Get digits before decimal point or exponent, if any.
for (value = 0.0; *p >= '0' && *p <= '9'; ++p) {
value = value * 10.0 + (*p - '0');
}
// Get digits after decimal point, if any.
if (*p == '.') {
double pow10 = 10.0;
++p;
while (*p >= '0' && *p <= '9') {
value += (*p - '0') / pow10;
pow10 *= 10.0;
// Get digits after decimal point, if any.
if (*p == '.') {
double pow10 = 10.0;
++p;
while (*p >= '0' && *p <= '9') {
value += (*p - '0') / pow10;
pow10 *= 10.0;
++p;
}
}
}
// Handle exponent, if any.
frac = 0;
scale = 1.0;
if ((*p == 'e') || (*p == 'E')) {
unsigned int expon;
// Get sign of exponent, if any.
++p;
if (*p == '-') {
frac = 1;
// Handle exponent, if any.
frac = 0;
scale = 1.0;
if ((*p == 'e') || (*p == 'E')) {
unsigned int expon;
// Get sign of exponent, if any.
++p;
if (*p == '-') {
frac = 1;
++p;
} else if (*p == '+') {
++p;
}
// Get digits of exponent, if any.
for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
expon = expon * 10 + (*p - '0');
}
if (expon > 308) expon = 308;
// Calculate scaling factor.
while (expon >= 50) { scale *= 1E50; expon -= 50; }
while (expon >= 8) { scale *= 1E8; expon -= 8; }
while (expon > 0) { scale *= 10.0; expon -= 1; }
}
else if (*p == '+') {
// Return signed and scaled floating point result.
*out = sign * (frac ? (value / scale) : (value * scale));
} else {
if (*p == 'n' || *p == 'N') {
++p;
if (!(*p == 'a' || *p == 'A')) {
Log::Stderr("meet error while parsing string to float, expect a nan here");
}
++p;
if (!(*p == 'n' || *p == 'N')) {
Log::Stderr("meet error while parsing string to float, expect a nan here");
}
++p;
// default convert nan to 0
*out = 0;
} else if (*p == 'i' || *p == 'I') {
++p;
if (!(*p == 'n' || *p == 'N')) {
Log::Stderr("meet error while parsing string to float, expect a inf here");
}
++p;
if (!(*p == 'f' || *p == 'F')) {
Log::Stderr("meet error while parsing string to float, expect a inf here");
}
++p;
// default inf
*out = sign * 1e308;
} else {
if (*p != '\0') {
Log::Stderr("Meet unknow characters while parsing string to float");
}
}
// Get digits of exponent, if any.
for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
expon = expon * 10 + (*p - '0');
}
if (expon > 308) expon = 308;
// Calculate scaling factor.
while (expon >= 50) { scale *= 1E50; expon -= 50; }
while (expon >= 8) { scale *= 1E8; expon -= 8; }
while (expon > 0) { scale *= 10.0; expon -= 1; }
}
// Return signed and scaled floating point result.
*out = sign * (frac ? (value / scale) : (value * scale));
while (*p == ' ') {
++p;
}
return p;
}
......
......@@ -5,7 +5,7 @@
namespace LightGBM {
void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int *colon_cnt) {
void GetStatistic(const char* str, int* comma_cnt, int* tab_cnt, int* colon_cnt) {
*comma_cnt = 0;
*tab_cnt = 0;
*colon_cnt = 0;
......
......@@ -15,7 +15,7 @@ namespace LightGBM {
class CSVParser: public Parser {
public:
inline void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const override {
std::vector<std::pair<int, double>>* out_features) const override {
int idx = 0;
double val = 0.0;
while (*str != '\0') {
......@@ -30,16 +30,14 @@ public:
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override {
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
if (*str == ',') {
++str;
} else if (*str != '\0') {
Log::Stderr("input format error, should be CSV");
}
if (*str == ',') {
++str;
} else if (*str != '\0') {
Log::Stderr("input format error, should be CSV");
}
return ParseOneLine(str, out_features);
}
};
......@@ -61,16 +59,14 @@ public:
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override{
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
if (*str == '\t') {
++str;
} else if (*str != '\0') {
Log::Stderr("input format error, should be TSV");
}
if (*str == '\t') {
++str;
} else if (*str != '\0') {
Log::Stderr("input format error, should be TSV");
}
return ParseOneLine(str, out_features);
}
};
......@@ -94,7 +90,7 @@ public:
}
}
inline void ParseOneLine(const char* str, std::vector<std::pair<int, double>>* out_features,
double* out_label) const override{
double* out_label) const override {
// first column is label
str = Common::Atof(str, out_label);
str = Common::SkipSpaceAndTab(str);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment