OpenDAS / dlib

Commit b1627bc5, authored Jun 28, 2016 by Fm

    Merge branch 'master' of https://github.com/davisking/dlib

parents 3b3a9939 5e550a26

Changes: 26
Showing 6 changed files with 397 additions and 21 deletions (+397 -21)
Files shown:

    examples/dnn_imagenet_ex.cpp                 +37  -16
    examples/dnn_imagenet_train_ex.cpp           +355 -0
    examples/dnn_inception_ex.cpp                +2   -2
    examples/dnn_introduction2_ex.cpp            +1   -1
    examples/dnn_introduction_ex.cpp             +0   -0
    python_examples/face_landmark_detection.py   +2   -2
examples/dnn_imagenet_ex.cpp — View file @ b1627bc5 (new version of the shown hunks)

// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This example shows how to classify an image into one of the 1000 imagenet
    categories using the deep learning tools from the dlib C++ Library.  We will
    use the pretrained ResNet34 model available on the dlib website.

    The ResNet34 architecture is from the paper Deep Residual Learning for Image
    Recognition by He, Zhang, Ren, and Sun.  The model file that comes with dlib
    was trained using the dnn_imagenet_train_ex.cpp program on a Titan X for
    about 2 weeks.  This pretrained model has a top5 error of 7.572% on the 2012
    imagenet validation dataset.

    For an introduction to dlib's DNN module read the dnn_introduction_ex.cpp and
    dnn_introduction2_ex.cpp example programs.

    Finally, these tools will use CUDA and cuDNN to drastically accelerate
    network training and testing.  CMake should automatically find them if they
    are installed and configure things appropriately.  If not, the program will
    still run but will be much slower to execute.
*/

...
@@ -27,6 +33,7 @@ using namespace dlib;

// ----------------------------------------------------------------------------------------

// This block of statements defines the resnet-34 network

template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

...
@@ -41,14 +48,14 @@ template <int N, typename SUBNET> using ares = relu<residual<block,N,affine

template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;

using anet_type = loss_multiclass_log<fc<1000,avg_pool_everything<
                            ares<512,ares<512,ares_down<512,
                            ares<256,ares<256,ares<256,ares<256,ares<256,ares_down<256,
                            ares<128,ares<128,ares<128,ares_down<128,
                            ares<64,ares<64,ares<64,
                            max_pool<3,3,2,2,relu<affine<con<64,7,7,2,2,
                            input_rgb_image_sized<227>
                            >>>>>>>>>>>>>>>>>>>>>>>;

// ----------------------------------------------------------------------------------------

...
@@ -101,14 +108,24 @@ void randomly_crop_images (

int main(int argc, char** argv) try
{
    if (argc == 1)
    {
        cout << "Give this program image files as command line arguments.\n" << endl;
        cout << "You will also need a copy of the file resnet34_1000_imagenet_classifier.dnn " << endl;
        cout << "available at http://dlib.net/files/resnet34_1000_imagenet_classifier.dnn.bz2" << endl;
        cout << endl;
        return 1;
    }

    std::vector<string> labels;
    anet_type net;
    // Get this file from http://dlib.net/files/resnet34_1000_imagenet_classifier.dnn.bz2
    // This pretrained model has a top5 error of 7.572% on the 2012 imagenet validation
    // dataset.
    deserialize("resnet34_1000_imagenet_classifier.dnn") >> net >> labels;

    // Make a network with softmax as the final layer.  We don't have to do this
    // if we just want to output the single best prediction, since the anet_type
    // already does this.  But if we instead want to get the probability of each
    // class as output we need to replace the last layer of the network with a
    // softmax layer, which we do as follows:
    softmax<anet_type::subnet_type> snet;
    snet.subnet() = net.subnet();

...
@@ -118,16 +135,19 @@ int main(int argc, char** argv) try

    dlib::rand rnd;
    image_window win;

    // Read images from the command prompt and print the top 5 best labels for each.
    for (int i = 1; i < argc; ++i)
    {
        load_image(img, argv[i]);
        const int num_crops = 16;
        // Grab 16 random crops from the image.  We will run all of them through the
        // network and average the results.
        randomly_crop_images(img, images, rnd, num_crops);
        // p(i) == the probability the image contains object of class i.
        matrix<float,1,1000> p = sum_rows(mat(snet(images.begin(), images.end())))/num_crops;

        win.set_image(img);
        // Print the 5 most probable labels
        for (int k = 0; k < 5; ++k)
        {
            unsigned long predicted_label = index_of_max(p);

...
@@ -135,6 +155,7 @@ int main(int argc, char** argv) try

            p(predicted_label) = 0;
        }

        cout << "Hit enter to process the next image";
        cin.get();
    }
}
...
examples/dnn_imagenet_train_ex.cpp — new file (0 → 100644) — View file @ b1627bc5

// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This program was used to train the resnet34_1000_imagenet_classifier.dnn
    network used by the dnn_imagenet_ex.cpp example program.

    You should be familiar with dlib's DNN module before reading this example
    program.  So read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp first.
*/

#include <dlib/dnn.h>
#include <iostream>
#include <dlib/data_io.h>
#include <dlib/image_transforms.h>
#include <dlib/dir_nav.h>
#include <iterator>
#include <thread>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;

template <int N, template <typename> class BN, int stride, typename SUBNET>
using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

template <int N, typename SUBNET> using res       = relu<residual<block,N,bn_con,SUBNET>>;
template <int N, typename SUBNET> using ares      = relu<residual<block,N,affine,SUBNET>>;
template <int N, typename SUBNET> using res_down  = relu<residual_down<block,N,bn_con,SUBNET>>;
template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;

// ----------------------------------------------------------------------------------------

// training network type
using net_type = loss_multiclass_log<fc<1000,avg_pool_everything<
                            res<512,res<512,res_down<512,
                            res<256,res<256,res<256,res<256,res<256,res_down<256,
                            res<128,res<128,res<128,res_down<128,
                            res<64,res<64,res<64,
                            max_pool<3,3,2,2,relu<bn_con<con<64,7,7,2,2,
                            input_rgb_image_sized<227>
                            >>>>>>>>>>>>>>>>>>>>>>>;

// testing network type (replaced batch normalization with fixed affine transforms)
using anet_type = loss_multiclass_log<fc<1000,avg_pool_everything<
                            ares<512,ares<512,ares_down<512,
                            ares<256,ares<256,ares<256,ares<256,ares<256,ares_down<256,
                            ares<128,ares<128,ares<128,ares_down<128,
                            ares<64,ares<64,ares<64,
                            max_pool<3,3,2,2,relu<affine<con<64,7,7,2,2,
                            input_rgb_image_sized<227>
                            >>>>>>>>>>>>>>>>>>>>>>>;
// ----------------------------------------------------------------------------------------

rectangle make_random_cropping_rect_resnet(
    const matrix<rgb_pixel>& img,
    dlib::rand& rnd
)
{
    // figure out what rectangle we want to crop from the image
    double mins = 0.466666666, maxs = 0.875;
    auto scale = mins + rnd.get_random_double()*(maxs-mins);
    auto size = scale*std::min(img.nr(), img.nc());
    rectangle rect(size, size);
    // randomly shift the box around
    point offset(rnd.get_random_32bit_number()%(img.nc()-rect.width()),
                 rnd.get_random_32bit_number()%(img.nr()-rect.height()));
    return move_rect(rect, offset);
}
// ----------------------------------------------------------------------------------------

void randomly_crop_image (
    const matrix<rgb_pixel>& img,
    matrix<rgb_pixel>& crop,
    dlib::rand& rnd
)
{
    auto rect = make_random_cropping_rect_resnet(img, rnd);

    // now crop it out as a 227x227 image.
    extract_image_chip(img, chip_details(rect, chip_dims(227,227)), crop);

    // Also randomly flip the image
    if (rnd.get_random_double() > 0.5)
        crop = fliplr(crop);

    // And then randomly adjust the colors.
    apply_random_color_offset(crop, rnd);
}

void randomly_crop_images (
    const matrix<rgb_pixel>& img,
    dlib::array<matrix<rgb_pixel>>& crops,
    dlib::rand& rnd,
    long num_crops
)
{
    std::vector<chip_details> dets;
    for (long i = 0; i < num_crops; ++i)
    {
        auto rect = make_random_cropping_rect_resnet(img, rnd);
        dets.push_back(chip_details(rect, chip_dims(227,227)));
    }

    extract_image_chips(img, dets, crops);

    for (auto&& img : crops)
    {
        // Also randomly flip the image
        if (rnd.get_random_double() > 0.5)
            img = fliplr(img);

        // And then randomly adjust the colors.
        apply_random_color_offset(img, rnd);
    }
}
// ----------------------------------------------------------------------------------------

struct image_info
{
    string filename;
    string label;
    long numeric_label;
};

std::vector<image_info> get_imagenet_train_listing(
    const std::string& images_folder
)
{
    std::vector<image_info> results;
    image_info temp;
    temp.numeric_label = 0;
    // We will loop over all the label types in the dataset, each is contained in a subfolder.
    auto subdirs = directory(images_folder).get_dirs();
    // But first, sort the sub directories so the numeric labels will be assigned in sorted order.
    std::sort(subdirs.begin(), subdirs.end());
    for (auto subdir : subdirs)
    {
        // Now get all the images in this label type
        temp.label = subdir.name();
        for (auto image_file : subdir.get_files())
        {
            temp.filename = image_file;
            results.push_back(temp);
        }
        ++temp.numeric_label;
    }
    return results;
}

std::vector<image_info> get_imagenet_val_listing(
    const std::string& imagenet_root_dir,
    const std::string& validation_images_file
)
{
    ifstream fin(validation_images_file);
    string label, filename;
    std::vector<image_info> results;
    image_info temp;
    temp.numeric_label = -1;
    while (fin >> label >> filename)
    {
        temp.filename = imagenet_root_dir + "/" + filename;
        if (!file_exists(temp.filename))
        {
            cerr << "file doesn't exist! " << temp.filename << endl;
            exit(1);
        }
        if (label != temp.label)
            ++temp.numeric_label;
        temp.label = label;
        results.push_back(temp);
    }
    return results;
}
// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 3)
    {
        cout << "To run this program you need a copy of the imagenet ILSVRC2015 dataset and" << endl;
        cout << "also the file http://dlib.net/files/imagenet2015_validation_images.txt.bz2" << endl;
        cout << endl;
        cout << "With those things, you call this program like this: " << endl;
        cout << "./dnn_imagenet_train_ex /path/to/ILSVRC2015 imagenet2015_validation_images.txt" << endl;
        return 1;
    }

    cout << "\nSCANNING IMAGENET DATASET\n" << endl;

    auto listing = get_imagenet_train_listing(string(argv[1])+"/Data/CLS-LOC/train/");
    cout << "images in dataset: " << listing.size() << endl;
    const auto number_of_classes = listing.back().numeric_label+1;
    if (listing.size() == 0 || number_of_classes != 1000)
    {
        cout << "Didn't find the imagenet dataset. " << endl;
        return 1;
    }

    set_dnn_prefer_smallest_algorithms();

    const double initial_learning_rate = 0.1;
    const double weight_decay = 0.0001;
    const double momentum = 0.9;

    net_type net;
    dnn_trainer<net_type> trainer(net, sgd(weight_decay, momentum));
    trainer.be_verbose();
    trainer.set_learning_rate(initial_learning_rate);
    trainer.set_synchronization_file("imagenet_trainer_state_file.dat", std::chrono::minutes(10));
    // This threshold is probably excessively large.  You could likely get good results
    // with a smaller value but if you aren't in a hurry this value will surely work well.
    trainer.set_iterations_without_progress_threshold(20000);

    std::vector<matrix<rgb_pixel>> samples;
    std::vector<unsigned long> labels;

    // Start a bunch of threads that read images from disk and pull out random crops.  It's
    // important to be sure to feed the GPU fast enough to keep it busy.  Using multiple
    // threads for this kind of data preparation helps us do that.  Each thread puts the
    // crops into the data queue.
    dlib::pipe<std::pair<image_info, matrix<rgb_pixel>>> data(200);
    auto f = [&data, &listing](time_t seed)
    {
        dlib::rand rnd(time(0)+seed);
        matrix<rgb_pixel> img;
        std::pair<image_info, matrix<rgb_pixel>> temp;
        while (data.is_enabled())
        {
            temp.first = listing[rnd.get_random_32bit_number()%listing.size()];
            load_image(img, temp.first.filename);
            randomly_crop_image(img, temp.second, rnd);
            data.enqueue(temp);
        }
    };
    std::thread data_loader1([f](){ f(1); });
    std::thread data_loader2([f](){ f(2); });
    std::thread data_loader3([f](){ f(3); });
    std::thread data_loader4([f](){ f(4); });
    // The main training loop.  Keep making mini-batches and giving them to the trainer.
    // We will run until the learning rate has dropped by a factor of 1e-3.
    while (trainer.get_learning_rate() >= initial_learning_rate*1e-3)
    {
        samples.clear();
        labels.clear();

        // make a 160 image mini-batch
        std::pair<image_info, matrix<rgb_pixel>> img;
        while (samples.size() < 160)
        {
            data.dequeue(img);

            samples.push_back(std::move(img.second));
            labels.push_back(img.first.numeric_label);
        }

        trainer.train_one_step(samples, labels);
    }

    // Training done, tell threads to stop and make sure to wait for them to finish before
    // moving on.
    data.disable();
    data_loader1.join();
    data_loader2.join();
    data_loader3.join();
    data_loader4.join();

    // also wait for threaded processing to stop in the trainer.
    trainer.get_net();

    net.clean();
    cout << "saving network" << endl;
    serialize("resnet34.dnn") << net;
    // Now test the network on the imagenet validation dataset.  First, make a testing
    // network with softmax as the final layer.  We don't have to do this if we just wanted
    // to test the "top1 accuracy" since the normal network outputs the class prediction.
    // But this snet object will make getting the top5 predictions easy as it directly
    // outputs the probability of each class as its final output.
    softmax<anet_type::subnet_type> snet;
    snet.subnet() = net.subnet();

    cout << "Testing network on imagenet validation dataset..." << endl;
    int num_right = 0;
    int num_wrong = 0;
    int num_right_top1 = 0;
    int num_wrong_top1 = 0;
    dlib::rand rnd(time(0));
    // loop over all the imagenet validation images
    for (auto l : get_imagenet_val_listing(argv[1], argv[2]))
    {
        dlib::array<matrix<rgb_pixel>> images;
        matrix<rgb_pixel> img;
        load_image(img, l.filename);
        // Grab 16 random crops from the image.  We will run all of them through the
        // network and average the results.
        const int num_crops = 16;
        randomly_crop_images(img, images, rnd, num_crops);
        // p(i) == the probability the image contains object of class i.
        matrix<float,1,1000> p = sum_rows(mat(snet(images.begin(), images.end())))/num_crops;

        // check top 1 accuracy
        if (index_of_max(p) == l.numeric_label)
            ++num_right_top1;
        else
            ++num_wrong_top1;

        // check top 5 accuracy
        bool found_match = false;
        for (int k = 0; k < 5; ++k)
        {
            long predicted_label = index_of_max(p);
            p(predicted_label) = 0;
            if (predicted_label == l.numeric_label)
            {
                found_match = true;
                break;
            }
        }
        if (found_match)
            ++num_right;
        else
            ++num_wrong;
    }
    cout << "val top5 accuracy: " << num_right/(double)(num_right+num_wrong) << endl;
    cout << "val top1 accuracy: " << num_right_top1/(double)(num_right_top1+num_wrong_top1) << endl;
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}
examples/dnn_inception_ex.cpp — View file @ b1627bc5

...
@@ -2,8 +2,8 @@

 /*
     This is an example illustrating the use of the deep learning tools from the
     dlib C++ Library.  I'm assuming you have already read the introductory
-    dnn_mnist_ex.cpp and dnn_mnist_advanced_ex.cpp examples.  In this example we
-    are going to show how to create inception networks.
+    dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp examples.  In this
+    example we are going to show how to create inception networks.

     An inception network is composed of inception blocks of the form:
...
examples/dnn_mnist_advanced_ex.cpp → examples/dnn_introduction2_ex.cpp — View file @ b1627bc5

 // The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
 /*
     This is an example illustrating the use of the deep learning tools from the
-    dlib C++ Library.  I'm assuming you have already read the dnn_mnist_ex.cpp
+    dlib C++ Library.  I'm assuming you have already read the dnn_introduction_ex.cpp
     example.  So in this example program I'm going to go over a number of more
     advanced parts of the API, including:

     - Using multiple GPUs
...

examples/dnn_mnist_ex.cpp → examples/dnn_introduction_ex.cpp — View file @ b1627bc5

File moved
python_examples/face_landmark_detection.py — View file @ b1627bc5

...
@@ -18,7 +18,7 @@

 #   tools. See train_shape_predictor.py to see an example.
 #
 #   You can get the shape_predictor_68_face_landmarks.dat file from:
-#   http://sourceforge.net/projects/dclib/files/dlib/v18.10/shape_predictor_68_face_landmarks.dat.bz2
+#   http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
 #
 # COMPILING/INSTALLING THE DLIB PYTHON INTERFACE
 #   You can install dlib using the command:
...
@@ -56,7 +56,7 @@ if len(sys.argv) != 3:

         "execute this program by running:\n"
         "    ./face_landmark_detection.py shape_predictor_68_face_landmarks.dat ../examples/faces\n"
         "You can download a trained facial shape predictor from:\n"
-        "    http://sourceforge.net/projects/dclib/files/dlib/v18.10/shape_predictor_68_face_landmarks.dat.bz2")
+        "    http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2")
     exit()

 predictor_path = sys.argv[1]
...