Commit facefa02 authored by Davis King's avatar Davis King
Browse files

Fix random forest regression not doing quite the right thing.

parent fe803b56
......@@ -376,8 +376,6 @@ namespace dlib
std::vector<std::vector<internal_tree_node<feature_extractor>>> all_trees(num_trees);
std::vector<std::vector<float>> all_leaves(num_trees);
const double sumy = sum(mat(y));
const size_t feats_per_node = std::max(1.0,std::round(fe.max_num_feats()*feature_subsampling_frac));
// Each tree couldn't have more than this many interior nodes. It might
......@@ -412,15 +410,18 @@ namespace dlib
// don't make any tree. Just average the things and be done.
if (y.size() <= min_samples_per_leaf)
{
leaves.push_back(sumy/y.size());
leaves.push_back(mean(mat(y)));
return;
}
double sumy = 0;
// pick a random bootstrap of the data.
std::vector<std::pair<float,uint32_t>> idxs(y.size());
for (auto& idx : idxs)
for (auto& idx : idxs) {
idx = std::make_pair(0.0f, static_cast<uint32_t>(rnd.get_integer(y.size())));
sumy += y[idx.second];
}
// We are going to use ranges_to_process as a stack that tracks which
// range of samples we are going to split next.
......@@ -702,7 +703,7 @@ namespace dlib
for (auto i = range.begin; i < range.end; ++i)
idxs[i].first = fe.extract_feature_value(x[idxs[i].second], feat);
std::sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first);
std::stable_sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first);
auto split = find_best_split(range, y, idxs);
......@@ -716,7 +717,7 @@ namespace dlib
// resort idxs based on winning feat
for (auto i = range.begin; i < range.end; ++i)
idxs[i].first = fe.extract_feature_value(x[idxs[i].second], best.split_feature);
std::sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first);
std::stable_sort(idxs.begin()+range.begin, idxs.begin()+range.end, compare_first);
return best;
}
......
......@@ -62,15 +62,23 @@ namespace
DLIB_TEST(df.get_num_trees() == 1000);
auto result = test_regression_function(df, samples, labels);
// train: 2.239 0.987173 0.970669 1.1399
// train: 1.95064 0.990374 0.92738 1.04536
dlog << LINFO << "train: " << result;
DLIB_TEST_MSG(result(0) < 2.3, result(0));
DLIB_TEST_MSG(result(0) < 2.0, result(0));
// By construction, output values should be in the span of the training labels.
const double min_label = min(mat(labels));
const double max_label = max(mat(labels));
for (auto&& x : samples) {
double y = df(x);
DLIB_TEST(min_label <= y && y <= max_label);
}
running_stats<double> rs;
for (size_t i = 0; i < oobs.size(); ++i)
rs.add(std::pow(oobs[i]-labels[i],2.0));
dlog << LINFO << "OOB MSE: "<< rs.mean();
DLIB_TEST_MSG(rs.mean() < 10.2, rs.mean());
DLIB_TEST_MSG(rs.mean() < 10.0, rs.mean());
print_spinner();
......@@ -80,9 +88,9 @@ namespace
deserialize(df2, ss);
DLIB_TEST(df2.get_num_trees() == 1000);
result = test_regression_function(df2, samples, labels);
// train: 2.239 0.987173 0.970669 1.1399
// train: 1.95064 0.990374 0.92738 1.04536
dlog << LINFO << "serialized train results: " << result;
DLIB_TEST_MSG(result(0) < 2.3, result(0));
DLIB_TEST_MSG(result(0) < 2.0, result(0));
}
} a;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment