Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
f30e0bb3
Unverified
Commit
f30e0bb3
authored
Jun 11, 2020
by
Nikita Titov
Committed by
GitHub
Jun 11, 2020
Browse files
refactor LGBM_DatasetGetFeatureNames (#3022)
parent
b3a84df5
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
72 additions
and
12 deletions
+72
-12
R-package/src/lightgbm_R.cpp
R-package/src/lightgbm_R.cpp
+10
-3
include/LightGBM/c_api.h
include/LightGBM/c_api.h
+11
-3
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+32
-0
src/c_api.cpp
src/c_api.cpp
+13
-5
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+6
-1
No files found.
R-package/src/lightgbm_R.cpp
View file @
f30e0bb3
...
@@ -158,16 +158,23 @@ LGBM_SE LGBM_DatasetGetFeatureNames_R(LGBM_SE handle,
...
@@ -158,16 +158,23 @@ LGBM_SE LGBM_DatasetGetFeatureNames_R(LGBM_SE handle,
R_API_BEGIN
();
R_API_BEGIN
();
int
len
=
0
;
int
len
=
0
;
CHECK_CALL
(
LGBM_DatasetGetNumFeature
(
R_GET_PTR
(
handle
),
&
len
));
CHECK_CALL
(
LGBM_DatasetGetNumFeature
(
R_GET_PTR
(
handle
),
&
len
));
const
size_t
reserved_string_size
=
256
;
std
::
vector
<
std
::
vector
<
char
>>
names
(
len
);
std
::
vector
<
std
::
vector
<
char
>>
names
(
len
);
std
::
vector
<
char
*>
ptr_names
(
len
);
std
::
vector
<
char
*>
ptr_names
(
len
);
for
(
int
i
=
0
;
i
<
len
;
++
i
)
{
for
(
int
i
=
0
;
i
<
len
;
++
i
)
{
names
[
i
].
resize
(
256
);
names
[
i
].
resize
(
reserved_string_size
);
ptr_names
[
i
]
=
names
[
i
].
data
();
ptr_names
[
i
]
=
names
[
i
].
data
();
}
}
int
out_len
;
int
out_len
;
CHECK_CALL
(
LGBM_DatasetGetFeatureNames
(
R_GET_PTR
(
handle
),
size_t
required_string_size
;
ptr_names
.
data
(),
&
out_len
));
CHECK_CALL
(
LGBM_DatasetGetFeatureNames
(
R_GET_PTR
(
handle
),
len
,
&
out_len
,
reserved_string_size
,
&
required_string_size
,
ptr_names
.
data
()));
CHECK_EQ
(
len
,
out_len
);
CHECK_EQ
(
len
,
out_len
);
CHECK_GE
(
reserved_string_size
,
required_string_size
);
auto
merge_str
=
Join
<
char
*>
(
ptr_names
,
"
\t
"
);
auto
merge_str
=
Join
<
char
*>
(
ptr_names
,
"
\t
"
);
EncodeChar
(
feature_names
,
merge_str
.
c_str
(),
buf_len
,
actual_len
,
merge_str
.
size
()
+
1
);
EncodeChar
(
feature_names
,
merge_str
.
c_str
(),
buf_len
,
actual_len
,
merge_str
.
size
()
+
1
);
R_API_END
();
R_API_END
();
...
...
include/LightGBM/c_api.h
View file @
f30e0bb3
...
@@ -280,13 +280,21 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetFeatureNames(DatasetHandle handle,
...
@@ -280,13 +280,21 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetFeatureNames(DatasetHandle handle,
/*!
/*!
* \brief Get feature names of dataset.
* \brief Get feature names of dataset.
* \param handle Handle of dataset
* \param handle Handle of dataset
* \param[out] feature_names Feature names, should pre-allocate memory
* \param len Number of ``char*`` pointers stored at ``feature_names``.
* If smaller than the number of feature names, only this many strings are copied
* \param[out] num_feature_names Number of feature names
* \param[out] num_feature_names Number of feature names
* \param buffer_len Size of pre-allocated strings.
* Content is copied up to ``buffer_len - 1`` and null-terminated
* \param[out] out_buffer_len Buffer size (including the null terminator) required to copy the longest feature name in full
* \param[out] feature_names Feature names, should pre-allocate memory
* \return 0 when succeed, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
*/
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetFeatureNames
(
DatasetHandle
handle
,
LIGHTGBM_C_EXPORT
int
LGBM_DatasetGetFeatureNames
(
DatasetHandle
handle
,
char
**
feature_names
,
const
int
len
,
int
*
num_feature_names
);
int
*
num_feature_names
,
const
size_t
buffer_len
,
size_t
*
out_buffer_len
,
char
**
feature_names
);
/*!
/*!
* \brief Free space for dataset.
* \brief Free space for dataset.
...
...
python-package/lightgbm/basic.py
View file @
f30e0bb3
...
@@ -1553,6 +1553,38 @@ class Dataset(object):
...
@@ -1553,6 +1553,38 @@ class Dataset(object):
self
.
set_field
(
'group'
,
group
)
self
.
set_field
(
'group'
,
group
)
return
self
return
self
def
get_feature_name
(
self
):
"""Get the names of columns (features) in the Dataset.
Returns
-------
feature_names : list
The names of columns (features) in the Dataset.
"""
if
self
.
handle
is
None
:
raise
LightGBMError
(
"Cannot get feature_name before construct dataset"
)
num_feature
=
self
.
num_feature
()
tmp_out_len
=
ctypes
.
c_int
(
0
)
reserved_string_buffer_size
=
255
required_string_buffer_size
=
ctypes
.
c_size_t
(
0
)
string_buffers
=
[
ctypes
.
create_string_buffer
(
reserved_string_buffer_size
)
for
i
in
range_
(
num_feature
)]
ptr_string_buffers
=
(
ctypes
.
c_char_p
*
num_feature
)(
*
map
(
ctypes
.
addressof
,
string_buffers
))
_safe_call
(
_LIB
.
LGBM_DatasetGetFeatureNames
(
self
.
handle
,
num_feature
,
ctypes
.
byref
(
tmp_out_len
),
reserved_string_buffer_size
,
ctypes
.
byref
(
required_string_buffer_size
),
ptr_string_buffers
))
if
num_feature
!=
tmp_out_len
.
value
:
raise
ValueError
(
"Length of feature names doesn't equal with num_feature"
)
if
reserved_string_buffer_size
<
required_string_buffer_size
.
value
:
raise
BufferError
(
"Allocated feature name buffer size ({}) was inferior to the needed size ({})."
.
format
(
reserved_string_buffer_size
,
required_string_buffer_size
.
value
)
)
return
[
string_buffers
[
i
].
value
.
decode
(
'utf-8'
)
for
i
in
range_
(
num_feature
)]
def
get_label
(
self
):
def
get_label
(
self
):
"""Get the label of the Dataset.
"""Get the label of the Dataset.
...
...
src/c_api.cpp
View file @
f30e0bb3
...
@@ -1110,15 +1110,23 @@ int LGBM_DatasetSetFeatureNames(
...
@@ -1110,15 +1110,23 @@ int LGBM_DatasetSetFeatureNames(
}
}
int
LGBM_DatasetGetFeatureNames
(
int
LGBM_DatasetGetFeatureNames
(
DatasetHandle
handle
,
DatasetHandle
handle
,
char
**
feature_names
,
const
int
len
,
int
*
num_feature_names
)
{
int
*
num_feature_names
,
API_BEGIN
();
const
size_t
buffer_len
,
size_t
*
out_buffer_len
,
char
**
feature_names
)
{
API_BEGIN
();
*
out_buffer_len
=
0
;
auto
dataset
=
reinterpret_cast
<
Dataset
*>
(
handle
);
auto
dataset
=
reinterpret_cast
<
Dataset
*>
(
handle
);
auto
inside_feature_name
=
dataset
->
feature_names
();
auto
inside_feature_name
=
dataset
->
feature_names
();
*
num_feature_names
=
static_cast
<
int
>
(
inside_feature_name
.
size
());
*
num_feature_names
=
static_cast
<
int
>
(
inside_feature_name
.
size
());
for
(
int
i
=
0
;
i
<
*
num_feature_names
;
++
i
)
{
for
(
int
i
=
0
;
i
<
*
num_feature_names
;
++
i
)
{
std
::
memcpy
(
feature_names
[
i
],
inside_feature_name
[
i
].
c_str
(),
inside_feature_name
[
i
].
size
()
+
1
);
if
(
i
<
len
)
{
std
::
memcpy
(
feature_names
[
i
],
inside_feature_name
[
i
].
c_str
(),
std
::
min
(
inside_feature_name
[
i
].
size
()
+
1
,
buffer_len
));
feature_names
[
i
][
buffer_len
-
1
]
=
'\0'
;
}
*
out_buffer_len
=
std
::
max
(
inside_feature_name
[
i
].
size
()
+
1
,
*
out_buffer_len
);
}
}
API_END
();
API_END
();
}
}
...
...
tests/python_package_test/test_basic.py
View file @
f30e0bb3
...
@@ -271,15 +271,20 @@ class TestBasic(unittest.TestCase):
...
@@ -271,15 +271,20 @@ class TestBasic(unittest.TestCase):
self
.
assertTrue
(
np
.
all
(
np
.
isclose
([
data
.
label
[
0
],
data
.
weight
[
0
],
data
.
init_score
[
0
]],
self
.
assertTrue
(
np
.
all
(
np
.
isclose
([
data
.
label
[
0
],
data
.
weight
[
0
],
data
.
init_score
[
0
]],
data
.
label
[
0
])))
data
.
label
[
0
])))
self
.
assertAlmostEqual
(
data
.
label
[
1
],
data
.
weight
[
1
])
self
.
assertAlmostEqual
(
data
.
label
[
1
],
data
.
weight
[
1
])
self
.
assertListEqual
(
data
.
feature_name
,
data
.
get_feature_name
())
X
,
y
=
load_breast_cancer
(
True
)
X
,
y
=
load_breast_cancer
(
True
)
sequence
=
np
.
ones
(
y
.
shape
[
0
])
sequence
=
np
.
ones
(
y
.
shape
[
0
])
sequence
[
0
]
=
np
.
nan
sequence
[
0
]
=
np
.
nan
sequence
[
1
]
=
np
.
inf
sequence
[
1
]
=
np
.
inf
lgb_data
=
lgb
.
Dataset
(
X
,
sequence
,
weight
=
sequence
,
init_score
=
sequence
).
construct
()
feature_names
=
[
'f{0}'
.
format
(
i
)
for
i
in
range
(
X
.
shape
[
1
])]
lgb_data
=
lgb
.
Dataset
(
X
,
sequence
,
weight
=
sequence
,
init_score
=
sequence
,
feature_name
=
feature_names
).
construct
()
check_asserts
(
lgb_data
)
check_asserts
(
lgb_data
)
lgb_data
=
lgb
.
Dataset
(
X
,
y
).
construct
()
lgb_data
=
lgb
.
Dataset
(
X
,
y
).
construct
()
lgb_data
.
set_label
(
sequence
)
lgb_data
.
set_label
(
sequence
)
lgb_data
.
set_weight
(
sequence
)
lgb_data
.
set_weight
(
sequence
)
lgb_data
.
set_init_score
(
sequence
)
lgb_data
.
set_init_score
(
sequence
)
lgb_data
.
set_feature_name
(
feature_names
)
check_asserts
(
lgb_data
)
check_asserts
(
lgb_data
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment