Remove set_group_id. Fix bug in setting num_pred_iterations.

19512d82 · Guolin Ke · 83007b1c · 19512d82 · 19512d82
Commit 19512d82 authored Nov 27, 2016 by Guolin Ke
Show whitespace changes
Inline Side-by-side

Showing with 45 additions and 55 deletions

python-package/lightgbm/basic.py python-package/lightgbm/basic.py +43 -55

src/boosting/gbdt.h src/boosting/gbdt.h +2 -0

No files found.
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -123,12 +123,17 @@ def c_array(ctype, values):
    """Convert a python array to c array."""
    return (ctype * len(values))(*values)

-def dict_to_str(data):
+def param_dict_to_str(data):  # serialize a params dict as space-separated "key=value" pairs
    if data is None or len(data) == 0:
        return ""
    pairs = []
-    for key in data:
-        pairs.append(str(key)+'='+str(data[key]))
+    for key, val in data.items():
+        if isinstance(val, list):  # list values become one comma-separated value
+            pairs.append(str(key)+'='+','.join(map(str, val)))  # str() each item: join() raises TypeError on non-str items such as ints
+        elif isinstance(val, (int, float, str, bool)):  # scalars are written via str()
+            pairs.append(str(key)+'='+str(val))
+        else:  # any other value type is rejected rather than silently stringified
+            raise TypeError('unknow type of parameter:%s , got:%s' %(key, type(val).__name__))
    return ' '.join(pairs)
 """marco definition of data type in c_api of LightGBM"""
 C_API_DTYPE_FLOAT32 =0
@@ -145,7 +150,6 @@ C_API_PREDICT_LEAF_INDEX =2
 FIELD_TYPE_MAPPER = {"label":C_API_DTYPE_FLOAT32, 
 "wegiht":C_API_DTYPE_FLOAT32, 
 "init_score":C_API_DTYPE_FLOAT32,
-"group_id":C_API_DTYPE_INT32,
 "group":C_API_DTYPE_INT32,
 }

@@ -208,7 +212,7 @@ class Predictor(object):
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
-            self.__num_total_model = out_num_iterations.value * self.num_class
+            self.__num_total_iteration = out_num_iterations.value 
        elif booster_handle is not None:
            self.__is_manage_handle = is_manage_handle
            self.handle = booster_handle
@@ -221,7 +225,7 @@ class Predictor(object):
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
-            self.__num_total_model = out_num_iterations.value * self.num_class
+            self.__num_total_iteration = out_num_iterations.value 
        else:
            raise TypeError('Need Model file to create a booster')

@@ -261,9 +265,9 @@ class Predictor(object):
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
-        int_data_has_header = 0
-        if data_has_header:
-            int_data_has_header = 1
+        int_data_has_header = 1 if data_has_header else 0
+        if num_iteration > self.__num_total_iteration:
+            num_iteration = self.__num_total_iteration
        if is_str(data):
            tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
@@ -303,6 +307,15 @@ class Predictor(object):
                raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' %(preds.size, nrow) )
        return preds

+    def __get_num_preds(self, num_iteration, nrow, predict_type):
+        # Length of the flat prediction buffer: num_class * nrow, times the iterations used when predicting leaf indices.
+        per_iteration = self.num_class * nrow
+        if predict_type != C_API_PREDICT_LEAF_INDEX:
+            return per_iteration
+        total = self.__num_total_iteration
+        used = min(num_iteration, total) if num_iteration > 0 else total  # non-positive num_iteration means "use all"
+        return per_iteration * used
+
    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """
        Predict for a 2-D numpy matrix.
@@ -316,13 +329,7 @@ class Predictor(object):
            """change non-float data to float data, need to copy"""
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
-        n_preds = self.num_class * mat.shape[0]
-        if predict_type == C_API_PREDICT_LEAF_INDEX:
-            if num_iteration > 0:
-                n_preds *= num_iteration
-            else:
-                used_iteration = self.__num_total_model / self.num_class
-                n_preds *= used_iteration
+        n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterPredictForMat(
@@ -346,13 +353,7 @@ class Predictor(object):
        Predict for a csr data
        """
        nrow = len(csr.indptr) - 1
-        n_preds = self.num_class * nrow
-        if predict_type == C_API_PREDICT_LEAF_INDEX:
-            if num_iteration > 0:
-                n_preds *= num_iteration
-            else:
-                used_iteration = self.__num_total_model / self.num_class
-                n_preds *= used_iteration
+        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)

@@ -386,7 +387,7 @@ class Dataset(object):
    """

    def __init__(self, data, label=None, max_bin=255, reference=None,
-        weight=None, group_id=None, predictor=None,
+        weight=None, group=None, predictor=None,
        silent=False, params=None):
        """
        Dataset used in LightGBM.
@@ -404,8 +405,8 @@ class Dataset(object):
            If this dataset validation, need to use training data as reference
        weight : list or numpy 1-D array , optional
            Weight for each instance.
-        group_id : list or numpy 1-D array , optional
-            group/query id for each instance. Note: if having group/query id, data should group by this id
+        group : list or numpy 1-D array , optional
+            group/query size for dataset
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
@@ -420,8 +421,7 @@ class Dataset(object):
            return
        self.data_has_header = False
        """process for args"""
-        if params is None:
-            params = {}
+        params = {} if params is None else params
        self.max_bin = max_bin
        self.predictor = predictor
        params["max_bin"] = max_bin
@@ -429,7 +429,7 @@ class Dataset(object):
            params["verbose"] = 0
        elif "verbose" not in params:
            params["verbose"] = 1
-        params_str = dict_to_str(params)
+        params_str = param_dict_to_str(params)
        """process for reference dataset"""
        ref_dataset = None
        if isinstance(reference, Dataset):
@@ -464,8 +464,8 @@ class Dataset(object):
            raise ValueError("label should not be None")
        if weight is not None:
            self.set_weight(weight)
-        if group_id is not None:
-            self.set_group_id(group_id)
+        if group is not None:
+            self.set_group(group)
        # load init score
        if self.predictor is not None and isinstance(self.predictor, Predictor):
            init_score = self.predictor.predict(data, 
@@ -482,7 +482,7 @@ class Dataset(object):
                init_score = new_init_score
            self.set_init_score(init_score)

-    def create_valid(self, data, label=None, weight=None, group_id=None, 
+    def create_valid(self, data, label=None, weight=None, group=None, 
        silent=False, params=None):
        """
        Create validation data align with current dataset
@@ -496,15 +496,15 @@ class Dataset(object):
            Label of the training data.
        weight : list or numpy 1-D array , optional
            Weight for each instance.
-        group_id : list or numpy 1-D array , optional
-            group/query id for each instance. Note: if having group/query id, data should group by this id
+        group : list or numpy 1-D array , optional
+            group/query size for dataset
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
            other parameters
        """
        return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
-            weight=weight, group_id=group_id, predictor=self.predictor, 
+            weight=weight, group=group, predictor=self.predictor, 
            silent=silent, params=params)

    def subset(self, used_indices, params=None):
@@ -514,7 +514,7 @@ class Dataset(object):
        used_indices = list_to_1d_numpy(used_indices, np.int32)
        ret = Dataset(None)
        ret.handle = ctypes.c_void_p()
-        params_str = dict_to_str(params)
+        params_str = param_dict_to_str(params)
        _safe_call(_LIB.LGBM_DatasetGetSubset(
            ctypes.byref(self.handle), 
            used_indices.data_as(ctypes.POINTER(ctypes.c_int32)),
@@ -624,6 +624,7 @@ class Dataset(object):
            The array ofdata to be set
        """
        if data is None:
+            """set to None"""
            _safe_call(_LIB.LGBM_DatasetSetField(
                self.handle,
                c_str(field_name),
@@ -713,18 +714,6 @@ class Dataset(object):
        self.__group = group
        self.set_field('group', group)

-    def set_group_id(self, group_id):
-
-        """Set group_id of Dataset (used for ranking).
-
-        Parameters
-        ----------
-        group : array like
-            group_id of Dataset (used for ranking).
-        """
-        if group_id is not None:
-            group_id = list_to_1d_numpy(group_id, np.int32)
-        self.set_field('group_id', group_id)

    def get_label(self):
        """Get the label of the Dataset.
@@ -817,8 +806,7 @@ class Booster(object):
        self.handle = ctypes.c_void_p()
        self.__need_reload_eval_info = True
        self.__is_manage_handle = True
-        if params is None:
-            params = {}
+        params = {} if params is None else params
        if silent:
            params["verbose"] = 0
        elif "verbose" not in params:
@@ -827,7 +815,7 @@ class Booster(object):
            """Training task"""
            if not isinstance(train_set, Dataset):
                raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))
-            params_str = dict_to_str(params)
+            params_str = param_dict_to_str(params)
            """construct booster object"""
            _safe_call(_LIB.LGBM_BoosterCreate(
                train_set.handle, 
@@ -907,7 +895,7 @@ class Booster(object):
            params["verbose"] = 0
        elif "verbose" not in params:
            params["verbose"] = 1
-        params_str = dict_to_str(params)
+        params_str = param_dict_to_str(params)
        _safe_call(_LIB.LGBM_BoosterResetParameter(
            self.handle,
            c_str(params_str)))
@@ -1162,11 +1150,11 @@ class Booster(object):
            raise ValueError("data_idx should be smaller than number of dataset")
        if self.__inner_predict_buffer[data_idx] is None:
            if data_idx == 0:
-                num_data = self.train_set.num_data() * self.__num_class
+                n_preds = self.train_set.num_data() * self.__num_class
            else:
-                num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class
+                n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = \
-                np.array([0.0 for _ in range(num_data)], dtype=np.float32, copy=False)
+                np.array([0.0 for _ in range(n_preds)], dtype=np.float32, copy=False)
        """avoid to predict many time in one iteration"""
        if not self.__is_predicted_cur_iter[data_idx]:
            tmp_out_len = ctypes.c_int64(0)

--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -181,6 +181,8 @@ public:
    } else {
      num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
    }
+    num_iteration_for_pred_ = std::min(num_iteration_for_pred_, 
+      static_cast<int>(models_.size()) / num_class_);
  }

  /*!