Commit 71660f1c authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

refine prediction logic. (#395)

* refine prediction logic.

* fix test.

* fix out_len in training score of Dart.

* improve predict speed for high dimension data.
parent f1ffc10d
......@@ -110,27 +110,29 @@ public:
*/
virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;
virtual int NumPredictOneRow(int num_iteration, int is_pred_leaf) const = 0;
/*!
* \brief Prediction for one record, without sigmoid transformation
* \param feature_values Feature value on this record
* \return Prediction result for this record
* \param output Prediction result for this record
*/
virtual std::vector<double> PredictRaw(const double* feature_values) const = 0;
virtual void PredictRaw(const double* feature_values, double* output) const = 0;
/*!
* \brief Prediction for one record, sigmoid transformation will be used if needed
* \param feature_values Feature value on this record
* \return Prediction result for this record
* \param output Prediction result for this record
*/
virtual std::vector<double> Predict(const double* feature_values) const = 0;
virtual void Predict(const double* feature_values, double* output) const = 0;
/*!
* \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record
* \return Predicted leaf index for this record
* \param output Prediction result for this record
*/
virtual std::vector<int> PredictLeafIndex(
const double* feature_values) const = 0;
virtual void PredictLeafIndex(
const double* feature_values, double* output) const = 0;
/*!
* \brief Dump model to json format string
......@@ -185,6 +187,12 @@ public:
*/
virtual int NumberOfTotalModel() const = 0;
/*!
* \brief Get number of trees per iteration
* \return Number of trees per iteration
*/
virtual int NumTreePerIteration() const = 0;
/*!
* \brief Get number of classes
* \return Number of classes
......@@ -192,9 +200,11 @@ public:
virtual int NumberOfClasses() const = 0;
/*!
* \brief Set number of used model for prediction
* \brief Initial work for the prediction
* \param num_iteration number of used iteration
* \return the feature indices mapper
*/
virtual void SetNumIterationForPred(int num_iteration) = 0;
virtual std::vector<int> InitPredict(int num_iteration) = 0;
/*!
* \brief Name of submodel
......
......@@ -22,7 +22,7 @@ const score_t kEpsilon = 1e-15f;
using ReduceFunction = std::function<void(const char*, char*, int)>;
using PredictFunction =
std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;
std::function<void(const std::vector<std::pair<int, double>>&, double* output)>;
#define NO_SPECIFIC (-1)
......
......@@ -34,8 +34,7 @@ public:
* \brief Calculating and printing metric result
* \param score Current prediction score
*/
virtual std::vector<double> Eval(const double* score, const ObjectiveFunction* objective,
int num_tree_per_iteration) const = 0;
virtual std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const = 0;
Metric() = default;
/*! \brief Disable copy */
......
......@@ -39,14 +39,12 @@ public:
virtual bool SkipEmptyClass() const { return false; }
virtual int numTreePerIteration() const { return 1; }
virtual int NumTreePerIteration() const { return 1; }
virtual std::vector<double> ConvertOutput(std::vector<double>& input) const {
return input;
}
virtual int NumPredictOneRow() const { return 1; }
virtual double ConvertOutput(double input) const {
return input;
virtual void ConvertOutput(const double* input, double* output) const {
output[0] = input[0];
}
virtual std::string ToString() const = 0;
......
......@@ -111,6 +111,13 @@ public:
shrinkage_ *= rate;
}
/*!
* \brief Rebuild the mapped split-feature indices from a feature index mapper.
*        Copies split_feature_ into mapped_feature_, then overwrites each of the
*        num_leaves_ - 1 internal-node entries with its remapped index, so that
*        prediction can index into a compacted feature buffer.
* \param feature_mapper Mapping from original feature index to the compacted
*        (used-feature) index; assumed to cover every index present in
*        split_feature_ — TODO confirm caller guarantees this.
*/
inline void ReMapFeature(const std::vector<int>& feature_mapper) {
// start from a copy so mapped_feature_ has the right size/layout
mapped_feature_ = split_feature_;
// one entry per internal node (a tree with num_leaves_ leaves has num_leaves_ - 1 splits)
for (int i = 0; i < num_leaves_ - 1; ++i) {
mapped_feature_[i] = feature_mapper[split_feature_[i]];
}
}
/*! \brief Serialize this object to string*/
std::string ToString();
......@@ -194,9 +201,10 @@ private:
std::vector<int> leaf_depth_;
double shrinkage_;
bool has_categorical_;
/*! \brief buffer of mapped split_feature_ */
std::vector<int> mapped_feature_;
};
inline double Tree::Predict(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values);
......@@ -217,13 +225,25 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0;
while (node >= 0) {
if (decision_funs[decision_type_[node]](
feature_values[split_feature_[node]],
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
if (has_categorical_) {
while (node >= 0) {
if (decision_funs[decision_type_[node]](
feature_values[mapped_feature_[node]],
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
} else {
while (node >= 0) {
if (NumericalDecision<double>(
feature_values[mapped_feature_[node]],
threshold_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
}
return ~node;
......
......@@ -377,18 +377,18 @@ inline void Softmax(std::vector<double>* p_rec) {
}
}
inline void Softmax(double* rec, int len) {
double wmax = rec[0];
inline void Softmax(const double* input, double* output, int len) {
double wmax = input[0];
for (int i = 1; i < len; ++i) {
wmax = std::max(rec[i], wmax);
wmax = std::max(input[i], wmax);
}
double wsum = 0.0f;
for (int i = 0; i < len; ++i) {
rec[i] = std::exp(rec[i] - wmax);
wsum += rec[i];
output[i] = std::exp(input[i] - wmax);
wsum += output[i];
}
for (int i = 0; i < len; ++i) {
rec[i] /= static_cast<double>(wsum);
output[i] /= static_cast<double>(wsum);
}
}
......
......@@ -54,8 +54,7 @@ void Application::LoadParameters(int argc, char** argv) {
continue;
}
params[key] = value;
}
else {
} else {
Log::Warning("Unknown parameter in command line: %s", argv[i]);
}
}
......@@ -86,14 +85,13 @@ void Application::LoadParameters(int argc, char** argv) {
if (params.count(key) == 0) {
params[key] = value;
}
}
else {
} else {
Log::Warning("Unknown parameter in config file: %s", line.c_str());
}
}
} else {
Log::Warning("Config file %s doesn't exist, will ignore",
params["config_file"].c_str());
params["config_file"].c_str());
}
}
// check for alias again
......@@ -110,23 +108,23 @@ void Application::LoadData() {
PredictFunction predict_fun = nullptr;
// need to continue training
if (boosting_->NumberOfTotalModel() > 0) {
predictor.reset(new Predictor(boosting_.get(), true, false));
predictor.reset(new Predictor(boosting_.get(), -1, true, false));
predict_fun = predictor->GetPredictFunction();
}
// sync up random seed for data partition
if (config_.is_parallel_find_bin) {
config_.io_config.data_random_seed =
GlobalSyncUpByMin<int>(config_.io_config.data_random_seed);
GlobalSyncUpByMin<int>(config_.io_config.data_random_seed);
}
DatasetLoader dataset_loader(config_.io_config, predict_fun,
boosting_->NumberOfClasses(), config_.io_config.data_filename.c_str());
config_.boosting_config.num_class, config_.io_config.data_filename.c_str());
// load Training data
if (config_.is_parallel_find_bin) {
// load data for parallel training
train_data_.reset(dataset_loader.LoadFromFile(config_.io_config.data_filename.c_str(),
Network::rank(), Network::num_machines()));
Network::rank(), Network::num_machines()));
} else {
// load data for single machine
train_data_.reset(dataset_loader.LoadFromFile(config_.io_config.data_filename.c_str(), 0, 1));
......@@ -170,7 +168,7 @@ void Application::LoadData() {
auto metric = std::unique_ptr<Metric>(Metric::CreateMetric(metric_type, config_.metric_config));
if (metric == nullptr) { continue; }
metric->Init(valid_datas_.back()->metadata(),
valid_datas_.back()->num_data());
valid_datas_.back()->num_data());
valid_metrics_.back().push_back(std::move(metric));
}
valid_metrics_.back().shrink_to_fit();
......@@ -181,7 +179,7 @@ void Application::LoadData() {
auto end_time = std::chrono::high_resolution_clock::now();
// output used time on each iteration
Log::Info("Finished loading data in %f seconds",
std::chrono::duration<double, std::milli>(end_time - start_time) * 1e-3);
std::chrono::duration<double, std::milli>(end_time - start_time) * 1e-3);
}
void Application::InitTrain() {
......@@ -201,22 +199,22 @@ void Application::InitTrain() {
// create boosting
boosting_.reset(
Boosting::CreateBoosting(config_.boosting_type,
config_.io_config.input_model.c_str()));
config_.io_config.input_model.c_str()));
// create objective function
objective_fun_.reset(
ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
config_.objective_config));
config_.objective_config));
// load training data
LoadData();
// initialize the objective function
objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
// initialize the boosting
boosting_->Init(&config_.boosting_config, train_data_.get(), objective_fun_.get(),
Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
// add validation data into boosting
for (size_t i = 0; i < valid_datas_.size(); ++i) {
boosting_->AddValidDataset(valid_datas_[i].get(),
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i]));
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i]));
}
Log::Info("Finished initializing training");
}
......@@ -232,7 +230,7 @@ void Application::Train() {
auto end_time = std::chrono::steady_clock::now();
// output used time per iteration
Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
std::milli>(end_time - start_time) * 1e-3, iter + 1);
std::milli>(end_time - start_time) * 1e-3, iter + 1);
}
// save model to file
boosting_->SaveModelToFile(-1, config_.io_config.output_model.c_str());
......@@ -241,12 +239,11 @@ void Application::Train() {
void Application::Predict() {
boosting_->SetNumIterationForPred(config_.io_config.num_iteration_predict);
// create predictor
Predictor predictor(boosting_.get(), config_.io_config.is_predict_raw_score,
config_.io_config.is_predict_leaf_index);
Predictor predictor(boosting_.get(), config_.io_config.num_iteration_predict, config_.io_config.is_predict_raw_score,
config_.io_config.is_predict_leaf_index);
predictor.Predict(config_.io_config.data_filename.c_str(),
config_.io_config.output_result.c_str(), config_.io_config.has_header);
config_.io_config.output_result.c_str(), config_.io_config.has_header);
Log::Info("Finished prediction");
}
......@@ -264,9 +261,9 @@ T Application::GlobalSyncUpByMin(T& local) {
return global;
}
Network::Allreduce(reinterpret_cast<char*>(&local),
sizeof(local), sizeof(local),
sizeof(local), sizeof(local),
reinterpret_cast<char*>(&global),
[](const char* src, char* dst, int len) {
[](const char* src, char* dst, int len) {
int used_size = 0;
const int type_size = sizeof(T);
const T *p1;
......
......@@ -26,39 +26,42 @@ public:
/*!
* \brief Constructor
* \param boosting Input boosting model
* \param num_iteration Number of boosting rounds used for prediction
* \param is_raw_score True if need to predict result with raw score
* \param is_predict_leaf_index True if output leaf index instead of prediction score
*/
Predictor(const Boosting* boosting, bool is_raw_score, bool is_predict_leaf_index) {
Predictor(Boosting* boosting, int num_iteration,
bool is_raw_score, bool is_predict_leaf_index) {
feature_mapper_ = boosting->InitPredict(num_iteration);
boosting_ = boosting;
num_features_ = boosting_->MaxFeatureIdx() + 1;
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
for (int i = 0; i < num_threads_; ++i) {
features_.push_back(std::vector<double>(num_features_));
num_pred_one_row_ = boosting_->NumPredictOneRow(num_iteration, is_predict_leaf_index);
num_total_features_ = static_cast<int>(feature_mapper_.size());
num_used_features_ = 1;
for (auto fidx : feature_mapper_) {
num_used_features_ = std::max(num_used_features_, fidx + 1);
}
features_.shrink_to_fit();
features_ = std::vector<double>(num_used_features_);
if (is_predict_leaf_index) {
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
PutFeatureValuesToBuffer(features);
// get result for leaf index
auto result = boosting_->PredictLeafIndex(features_[tid].data());
return std::vector<double>(result.begin(), result.end());
boosting_->PredictLeafIndex(features_.data(), output);
};
} else {
if (is_raw_score) {
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
PutFeatureValuesToBuffer(features);
// get result without sigmoid transformation
return boosting_->PredictRaw(features_[tid].data());
boosting_->PredictRaw(features_.data(), output);
};
} else {
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
return boosting_->Predict(features_[tid].data());
predict_fun_ = [this](const std::vector<std::pair<int, double>>& features, double* output) {
PutFeatureValuesToBuffer(features);
boosting_->Predict(features_.data(), output);
};
}
}
......@@ -81,16 +84,16 @@ public:
void Predict(const char* data_filename, const char* result_filename, bool has_header) {
FILE* result_file;
#ifdef _MSC_VER
#ifdef _MSC_VER
fopen_s(&result_file, result_filename, "w");
#else
#else
result_file = fopen(result_filename, "w");
#endif
#endif
if (result_file == NULL) {
Log::Fatal("Prediction results file %s doesn't exist", data_filename);
}
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, has_header, num_features_, boosting_->LabelIdx()));
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, has_header, num_used_features_, boosting_->LabelIdx()));
if (parser == nullptr) {
Log::Fatal("Could not recognize the data format of data file %s", data_filename);
......@@ -108,52 +111,47 @@ public:
[this, &parser_fun, &result_file]
(data_size_t, const std::vector<std::string>& lines) {
std::vector<std::pair<int, double>> oneline_features;
std::vector<std::string> pred_result(lines.size(), "");
OMP_INIT_EX();
#pragma omp parallel for schedule(static) private(oneline_features)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
OMP_LOOP_EX_BEGIN();
oneline_features.clear();
// parser
parser_fun(lines[i].c_str(), &oneline_features);
// predict
pred_result[i] = Common::Join<double>(predict_fun_(oneline_features), "\t");
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
for (size_t i = 0; i < pred_result.size(); ++i) {
fprintf(result_file, "%s\n", pred_result[i].c_str());
std::vector<double> result(num_pred_one_row_);
predict_fun_(oneline_features, result.data());
auto str_result = Common::Join<double>(result, "\t");
fprintf(result_file, "%s\n", str_result.c_str());
}
};
TextReader<data_size_t> predict_data_reader(data_filename, has_header);
predict_data_reader.ReadAllAndProcessParallel(process_fun);
fclose(result_file);
}
private:
int PutFeatureValuesToBuffer(const std::vector<std::pair<int, double>>& features) {
int tid = omp_get_thread_num();
// init feature value
std::memset(features_[tid].data(), 0, sizeof(double)*num_features_);
void PutFeatureValuesToBuffer(const std::vector<std::pair<int, double>>& features) {
std::memset(features_.data(), 0, sizeof(double)*num_used_features_);
// put feature value
for (const auto& p : features) {
if (p.first < num_features_) {
features_[tid][p.first] = p.second;
int loop_size = static_cast<int>(features.size());
#pragma omp parallel for schedule(static, 512) if(loop_size >= 1024)
for (int i = 0; i < loop_size; ++i) {
if (features[i].first >= num_total_features_) continue;
auto fidx = feature_mapper_[features[i].first];
if (fidx >= 0) {
features_[fidx] = features[i].second;
}
}
return tid;
}
/*! \brief Boosting model */
const Boosting* boosting_;
/*! \brief Buffer for feature values */
std::vector<std::vector<double>> features_;
std::vector<double> features_;
/*! \brief Number of features */
int num_features_;
/*! \brief Number of threads */
int num_threads_;
int num_used_features_;
/*! \brief function for prediction */
PredictFunction predict_fun_;
int num_pred_one_row_;
std::vector<int> feature_mapper_;
int num_total_features_;
};
} // namespace LightGBM
......
......@@ -85,7 +85,7 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
num_tree_per_iteration_ = num_class_;
if (objective_function_ != nullptr) {
is_constant_hessian_ = objective_function_->IsConstantHessian();
num_tree_per_iteration_ = objective_function_->numTreePerIteration();
num_tree_per_iteration_ = objective_function_->NumTreePerIteration();
} else {
is_constant_hessian_ = false;
}
......@@ -525,7 +525,7 @@ std::string GBDT::OutputMetric(int iter) {
if (need_output) {
for (auto& sub_metric : training_metrics_) {
auto name = sub_metric->GetName();
auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_, num_tree_per_iteration_);
auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_);
for (size_t k = 0; k < name.size(); ++k) {
std::stringstream tmp_buf;
tmp_buf << "Iteration:" << iter
......@@ -543,8 +543,7 @@ std::string GBDT::OutputMetric(int iter) {
for (size_t i = 0; i < valid_metrics_.size(); ++i) {
for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
auto test_scores = valid_metrics_[i][j]->Eval(valid_score_updater_[i]->score(),
objective_function_,
num_tree_per_iteration_);
objective_function_);
auto name = valid_metrics_[i][j]->GetName();
for (size_t k = 0; k < name.size(); ++k) {
std::stringstream tmp_buf;
......@@ -583,8 +582,7 @@ std::vector<double> GBDT::GetEvalAt(int data_idx) const {
std::vector<double> ret;
if (data_idx == 0) {
for (auto& sub_metric : training_metrics_) {
auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_,
num_tree_per_iteration_);
auto scores = sub_metric->Eval(train_score_updater_->score(), objective_function_);
for (auto score : scores) {
ret.push_back(score);
}
......@@ -593,8 +591,7 @@ std::vector<double> GBDT::GetEvalAt(int data_idx) const {
auto used_idx = data_idx - 1;
for (size_t j = 0; j < valid_metrics_[used_idx].size(); ++j) {
auto test_scores = valid_metrics_[used_idx][j]->Eval(valid_score_updater_[used_idx]->score(),
objective_function_,
num_tree_per_iteration_);
objective_function_);
for (auto score : test_scores) {
ret.push_back(score);
}
......@@ -626,11 +623,12 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
if (objective_function_ != nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
std::vector<double> tmp_result(num_class_);
std::vector<double> tree_pred(num_tree_per_iteration_);
for (int j = 0; j < num_tree_per_iteration_; ++j) {
tmp_result[j] = raw_scores[j * num_data + i];
tree_pred[j] = raw_scores[j * num_data + i];
}
tmp_result = objective_function_->ConvertOutput(tmp_result);
std::vector<double> tmp_result(num_class_);
objective_function_->ConvertOutput(tree_pred.data(), tmp_result.data());
for (int j = 0; j < num_class_; ++j) {
out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
}
......@@ -638,12 +636,9 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
std::vector<double> tmp_result(num_class_);
std::vector<double> tmp_result(num_tree_per_iteration_);
for (int j = 0; j < num_tree_per_iteration_; ++j) {
tmp_result[j] = raw_scores[j * num_data + i];
}
for (int j = 0; j < num_class_; ++j) {
out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
out_result[j * num_data + i] = static_cast<double>(raw_scores[j * num_data + i]);
}
}
}
......@@ -875,38 +870,57 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
return pairs;
}
std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> ret(num_tree_per_iteration_, 0.0f);
for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_tree_per_iteration_; ++j) {
ret[j] += models_[i * num_tree_per_iteration_ + j]->Predict(value);
void GBDT::PredictRaw(const double* value, double* output) const {
if (num_threads_ <= num_tree_per_iteration_) {
#pragma omp parallel for schedule(static)
for (int k = 0; k < num_tree_per_iteration_; ++k) {
for (int i = 0; i < num_iteration_for_pred_; ++i) {
output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(value);
}
}
} else {
for (int k = 0; k < num_tree_per_iteration_; ++k) {
double t = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:t)
for (int i = 0; i < num_iteration_for_pred_; ++i) {
t += models_[i * num_tree_per_iteration_ + k]->Predict(value);
}
output[k] = t;
}
}
return ret;
}
std::vector<double> GBDT::Predict(const double* value) const {
std::vector<double> ret(num_tree_per_iteration_, 0.0f);
for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_tree_per_iteration_; ++j) {
ret[j] += models_[i * num_tree_per_iteration_ + j]->Predict(value);
void GBDT::Predict(const double* value, double* output) const {
if (num_threads_ <= num_tree_per_iteration_) {
#pragma omp parallel for schedule(static)
for (int k = 0; k < num_tree_per_iteration_; ++k) {
for (int i = 0; i < num_iteration_for_pred_; ++i) {
output[k] += models_[i * num_tree_per_iteration_ + k]->Predict(value);
}
}
} else {
for (int k = 0; k < num_tree_per_iteration_; ++k) {
double t = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:t)
for (int i = 0; i < num_iteration_for_pred_; ++i) {
t += models_[i * num_tree_per_iteration_ + k]->Predict(value);
}
output[k] = t;
}
}
if (objective_function_ != nullptr) {
return objective_function_->ConvertOutput(ret);
} else {
return ret;
objective_function_->ConvertOutput(output, output);
}
}
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
std::vector<int> ret;
for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_tree_per_iteration_; ++j) {
ret.push_back(models_[i * num_tree_per_iteration_ + j]->PredictLeafIndex(value));
}
void GBDT::PredictLeafIndex(const double* value, double* output) const {
int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
#pragma omp parallel for schedule(static)
for (int i = 0; i < total_tree; ++i) {
output[i] = models_[i]->PredictLeafIndex(value);
}
return ret;
}
} // namespace LightGBM
......@@ -50,13 +50,13 @@ public:
auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
models_.push_back(std::move(new_tree));
}
num_init_iteration_ = static_cast<int>(models_.size()) / num_class_;
num_init_iteration_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
// push model in current object
for (const auto& tree : original_models) {
auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
models_.push_back(std::move(new_tree));
}
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
}
/*!
......@@ -88,7 +88,7 @@ public:
*/
void RollbackOneIter() override;
int GetCurrentIteration() const override { return static_cast<int>(models_.size()) / num_class_; }
int GetCurrentIteration() const override { return static_cast<int>(models_.size()) / num_tree_per_iteration_; }
bool EvalAndCheckEarlyStopping() override;
......@@ -122,26 +122,24 @@ public:
*/
void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
/*!
* \brief Prediction for one record without sigmoid transformation
* \param feature_values Feature value on this record
* \return Prediction result for this record
*/
std::vector<double> PredictRaw(const double* feature_values) const override;
inline int NumPredictOneRow(int num_iteration, int is_pred_leaf) const override {
int num_preb_in_one_row = num_class_;
if (is_pred_leaf) {
int max_iteration = GetCurrentIteration();
if (num_iteration > 0) {
num_preb_in_one_row *= static_cast<int>(std::min(max_iteration, num_iteration));
} else {
num_preb_in_one_row *= max_iteration;
}
}
return num_preb_in_one_row;
}
/*!
* \brief Prediction for one record with sigmoid transformation if enabled
* \param feature_values Feature value on this record
* \return Prediction result for this record
*/
std::vector<double> Predict(const double* feature_values) const override;
void PredictRaw(const double* feature_values, double* output) const override;
/*!
* \brief Prediction for one record with leaf index
* \param feature_values Feature value on this record
* \return Predicted leaf index for this record
*/
std::vector<int> PredictLeafIndex(const double* value) const override;
void Predict(const double* feature_values, double* output) const override;
void PredictLeafIndex(const double* value, double* output) const override;
/*!
* \brief Dump model to json format string
......@@ -193,20 +191,51 @@ public:
*/
inline int NumberOfTotalModel() const override { return static_cast<int>(models_.size()); }
/*!
* \brief Get number of trees per iteration
* \return Number of trees per iteration
*/
inline int NumTreePerIteration() const override { return num_tree_per_iteration_; }
/*!
* \brief Get number of classes
* \return Number of classes
*/
inline int NumberOfClasses() const override { return num_class_; }
/*!
* \brief Set number of iterations for prediction
*/
inline void SetNumIterationForPred(int num_iteration) override {
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
/*!
* \brief Initialization work before prediction: fixes the number of iterations
*        used for prediction and builds a compact mapping from original feature
*        indices to the subset of features actually used by the selected trees.
*        Each tree is then remapped (ReMapFeature) to use the compact indices.
* \param num_iteration Number of iterations to use for prediction; <= 0 means
*        use all available iterations.
* \return feature_mapper of size max_feature_idx_ + 1, where entry f is the
*         compact index of feature f, or -1 if feature f is unused by any
*         selected tree.
*/
inline std::vector<int> InitPredict(int num_iteration) override {
// default: use every stored iteration
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_tree_per_iteration_;
if (num_iteration > 0) {
// +1 accounts for the extra "boost from average" pseudo-iteration when enabled
num_iteration_for_pred_ = std::min(num_iteration + (boost_from_average_ ? 1 : 0), num_iteration_for_pred_);
}
// next compact index to hand out
int used_fidx = 0;
// Construct used feature mapper
std::vector<int> feature_mapper(max_feature_idx_ + 1, -1);
int total_tree = num_iteration_for_pred_ * num_tree_per_iteration_;
#pragma omp parallel for schedule(static, 64) if (total_tree >= 128)
for (int i = 0; i < total_tree; ++i) {
int num_leaves = models_[i]->num_leaves();
// iterate internal nodes only (num_leaves - 1 splits per tree)
for (int j = 0; j < num_leaves - 1; ++j) {
int fidx = models_[i]->split_feature(j);
// NOTE(review): double-checked pattern — the first read of
// feature_mapper[fidx] happens outside the critical section, which is
// formally a data race on a plain int; writes are serialized and the
// value is re-checked under the lock, so assignments stay consistent,
// but confirm this is acceptable for the supported compilers.
if (feature_mapper[fidx] == -1) {
#pragma omp critical
{
// re-check under the lock before assigning a compact index
if (feature_mapper[fidx] == -1) {
feature_mapper[fidx] = used_fidx;
++used_fidx;
}
}
}
}
}
// rewrite each selected tree's split features to the compact indices
#pragma omp parallel for schedule(static, 64) if (total_tree >= 128)
for (int i = 0; i < total_tree; ++i) {
models_[i]->ReMapFeature(feature_mapper);
}
return feature_mapper;
}
inline double GetLeafValue(int tree_idx, int leaf_idx) const {
......
This diff is collapsed.
......@@ -888,7 +888,8 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
// parser
parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
// set initial score
std::vector<double> oneline_init_score = predict_fun_(oneline_features);
std::vector<double> oneline_init_score(num_class_);
predict_fun_(oneline_features, oneline_init_score.data());
for (int k = 0; k < num_class_; ++k) {
init_score[k * dataset->num_data_ + i] = static_cast<double>(oneline_init_score[k]);
}
......@@ -947,7 +948,8 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
parser->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
// set initial score
if (!init_score.empty()) {
std::vector<double> oneline_init_score = predict_fun_(oneline_features);
std::vector<double> oneline_init_score(num_class_);
predict_fun_(oneline_features, oneline_init_score.data());
for (int k = 0; k < num_class_; ++k) {
init_score[k * dataset->num_data_ + start_idx + i] = static_cast<double>(oneline_init_score[k]);
}
......
......@@ -318,6 +318,7 @@ std::string Tree::ToString() {
str_buf << "internal_count="
<< Common::ArrayToString<data_size_t>(internal_count_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "shrinkage=" << shrinkage_ << std::endl;
str_buf << "has_categorical=" << (has_categorical_ ? 1 : 0) << std::endl;
str_buf << std::endl;
return str_buf.str();
}
......@@ -327,6 +328,7 @@ std::string Tree::ToJSON() {
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
str_buf << "\"shrinkage\":" << shrinkage_ << "," << std::endl;
str_buf << "\"has_categorical\":" << (has_categorical_ ? 1 : 0) << "," << std::endl;
str_buf << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
return str_buf.str();
......@@ -454,6 +456,15 @@ Tree::Tree(const std::string& str) {
} else {
shrinkage_ = 1.0f;
}
if (key_vals.count("has_categorical")) {
int t = 0;
Common::Atoi(key_vals["has_categorical"].c_str(), &t);
has_categorical_ = t > 0;
} else {
has_categorical_ = false;
}
}
} // namespace LightGBM
......@@ -54,8 +54,7 @@ public:
return -1.0f;
}
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective,
int) const override {
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const override {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
......@@ -75,15 +74,16 @@ public:
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
double prob = objective->ConvertOutput(score[i]);
double prob = 0;
objective->ConvertOutput(&score[i], &prob);
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// sigmoid transform
double prob = objective->ConvertOutput(score[i]);
double prob = 0;
objective->ConvertOutput(&score[i], &prob);
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob) * weights_[i];
}
......@@ -189,8 +189,7 @@ public:
}
}
std::vector<double> Eval(const double* score, const ObjectiveFunction*,
int) const override {
std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
// get indices sorted by score, descent order
std::vector<data_size_t> sorted_idx;
for (data_size_t i = 0; i < num_data_; ++i) {
......
......@@ -93,8 +93,7 @@ public:
cur_left = cur_k;
}
}
std::vector<double> Eval(const double* score, const ObjectiveFunction*,
int) const override {
std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
// some buffers for multi-threading sum up
std::vector<std::vector<double>> result_buffer_;
for (int i = 0; i < num_threads_; ++i) {
......
......@@ -15,8 +15,8 @@ namespace LightGBM {
template<typename PointWiseLossCalculator>
class MulticlassMetric: public Metric {
public:
explicit MulticlassMetric(const MetricConfig&) {
explicit MulticlassMetric(const MetricConfig& config) {
num_class_ = config.num_class;
}
virtual ~MulticlassMetric() {
......@@ -49,31 +49,38 @@ public:
return -1.0f;
}
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective,
int num_tree_per_iteration) const override {
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const override {
double sum_loss = 0.0;
int num_tree_per_iteration = num_class_;
int num_pred_per_row = num_class_;
if (objective != nullptr) {
num_tree_per_iteration = objective->NumTreePerIteration();
num_pred_per_row = objective->NumPredictOneRow();
}
if (objective != nullptr) {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration);
std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
size_t idx = static_cast<size_t>(num_data_) * k + i;
rec[k] = static_cast<double>(score[idx]);
raw_score[k] = static_cast<double>(score[idx]);
}
rec = objective->ConvertOutput(rec);
std::vector<double> rec(num_pred_per_row);
objective->ConvertOutput(raw_score.data(), rec.data());
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], rec);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
std::vector<double> rec(num_tree_per_iteration);
std::vector<double> raw_score(num_tree_per_iteration);
for (int k = 0; k < num_tree_per_iteration; ++k) {
size_t idx = static_cast<size_t>(num_data_) * k + i;
rec[k] = static_cast<double>(score[idx]);
raw_score[k] = static_cast<double>(score[idx]);
}
rec = objective->ConvertOutput(rec);
std::vector<double> rec(num_pred_per_row);
objective->ConvertOutput(raw_score.data(), rec.data());
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], rec) * weights_[i];
}
......@@ -118,6 +125,7 @@ private:
double sum_weights_;
/*! \brief Name of this test set */
std::vector<std::string> name_;
int num_class_;
};
/*! \brief L2 loss for multiclass task */
......
......@@ -82,7 +82,7 @@ public:
return 1.0f;
}
std::vector<double> Eval(const double* score, const ObjectiveFunction*, int) const override {
std::vector<double> Eval(const double* score, const ObjectiveFunction*) const override {
// some buffers for multi-threading sum up
std::vector<std::vector<double>> result_buffer_;
for (int i = 0; i < num_threads_; ++i) {
......
......@@ -48,7 +48,7 @@ public:
}
}
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective, int) const override {
std::vector<double> Eval(const double* score, const ObjectiveFunction* objective) const override {
double sum_loss = 0.0f;
if (objective == nullptr) {
if (weights_ == nullptr) {
......@@ -69,13 +69,17 @@ public:
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], objective->ConvertOutput(score[i]), huber_delta_, fair_c_);
double t = 0;
objective->ConvertOutput(&score[i], &t);
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, huber_delta_, fair_c_);
}
} else {
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], objective->ConvertOutput(score[i]), huber_delta_, fair_c_) * weights_[i];
double t = 0;
objective->ConvertOutput(&score[i], &t);
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, huber_delta_, fair_c_) * weights_[i];
}
}
}
......
......@@ -116,13 +116,8 @@ public:
return "binary";
}
std::vector<double> ConvertOutput(std::vector<double>& input) const override {
input[0] = 1.0f / (1.0f + std::exp(-sigmoid_ * input[0]));
return input;
}
double ConvertOutput(double input) const override {
return 1.0f / (1.0f + std::exp(-sigmoid_ * input));
void ConvertOutput(const double* input, double* output) const override {
output[0] = 1.0f / (1.0f + std::exp(-sigmoid_ * input[0]));
}
std::string ToString() const override {
......
......@@ -113,9 +113,8 @@ public:
}
}
std::vector<double> ConvertOutput(std::vector<double>& input) const override {
Common::Softmax(input.data(), num_class_);
return input;
void ConvertOutput(const double* input, double* output) const override {
Common::Softmax(input, output, num_class_);
}
const char* GetName() const override {
......@@ -131,7 +130,9 @@ public:
bool SkipEmptyClass() const override { return true; }
int numTreePerIteration() const override { return num_class_; }
int NumTreePerIteration() const override { return num_class_; }
int NumPredictOneRow() const override { return num_class_; }
private:
/*! \brief Number of data */
......@@ -206,11 +207,10 @@ public:
return "multiclassova";
}
std::vector<double> ConvertOutput(std::vector<double>& input) const override {
void ConvertOutput(const double* input, double* output) const override {
for (int i = 0; i < num_class_; ++i) {
input[i] = 1.0f / (1.0f + std::exp(-sigmoid_ * input[i]));
output[i] = 1.0f / (1.0f + std::exp(-sigmoid_ * input[i]));
}
return input;
}
std::string ToString() const override {
......@@ -223,7 +223,9 @@ public:
bool SkipEmptyClass() const override { return true; }
int numTreePerIteration() const override { return num_class_; }
int NumTreePerIteration() const override { return num_class_; }
int NumPredictOneRow() const override { return num_class_; }
private:
/*! \brief Number of data */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment