// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the deep learning tools from the
    dlib C++ Library.  In it, we will show how to use the loss_metric layer to do
    metric learning on images.  

    The main reason you might want to use this kind of algorithm is because you
    would like to use a k-nearest neighbor classifier or similar algorithm, but
    you don't know a good way to calculate the distance between two things.  A
    popular example would be face recognition.  There are a whole lot of papers
    that train some kind of deep metric learning algorithm that embeds face
    images in some vector space where images of the same person are close to each
    other and images of different people are far apart.  Then in that vector
    space it's very easy to do face recognition with some kind of k-nearest
    neighbor classifier.  

    In this example we will use a version of the ResNet network from the
    dnn_imagenet_ex.cpp example to learn to map images into some vector space where
    pictures of the same person are close and pictures of different people are far
    apart.  

    You might want to read the simpler introduction to the deep metric learning
    API, dnn_metric_learning_ex.cpp, before reading this example.  You should
    also have read the examples that introduce the dlib DNN API before
    continuing.  These are dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp.

*/

#include <dlib/dnn.h>
#include <dlib/image_io.h>
#include <dlib/misc_api.h>

using namespace dlib;
using namespace std;

// ----------------------------------------------------------------------------------------
// We will need to create some functions for loading data.  This program will
// expect to be given a directory structured as follows:
//    top_level_directory/
//        person1/
//            image1.jpg
//            image2.jpg
//            image3.jpg
//        person2/
//            image4.jpg
//            image5.jpg
//            image6.jpg
//        person3/
//            image7.jpg
//            image8.jpg
//            image9.jpg
//
// The specific folder and image names don't matter, nor does the number of folders or
// images.  What does matter is that there is a top level folder, which contains
// subfolders, and each subfolder contains images of a single person.

// This function spiders the top level directory and obtains a list of all the
// image files.
std::vector<std::vector<string>> load_objects_list (
    const string& dir 
)
{
    std::vector<std::vector<string>> objects;
    for (auto subdir : directory(dir).get_dirs())
    {
        std::vector<string> imgs;
        for (auto img : subdir.get_files())
            imgs.push_back(img);

Davis King's avatar
Davis King committed
71
72
        if (imgs.size() != 0)
            objects.push_back(imgs);
73
74
75
76
    }
    return objects;
}

// This function takes the output of load_objects_list() as input and randomly
// selects images for training.  It should also be pointed out that it's really
// important that each mini-batch contain multiple images of each person.  This
// is because the metric learning algorithm needs to consider pairs of images
// that should be close (i.e. images of the same person) as well as pairs of
// images that should be far apart (i.e. images of different people) during each
// training step.
void load_mini_batch (
85
86
    const size_t num_people,     // how many different people to include
    const size_t samples_per_id, // how many images per person to select.
87
88
89
90
91
92
93
94
    dlib::rand& rnd,
    const std::vector<std::vector<string>>& objs,
    std::vector<matrix<rgb_pixel>>& images,
    std::vector<unsigned long>& labels
)
{
    images.clear();
    labels.clear();
95
    DLIB_CASSERT(num_people <= objs.size(), "The dataset doesn't have that many people in it.");
96

97
    std::vector<bool> already_selected(objs.size(), false);
98
    matrix<rgb_pixel> image; 
99
    for (size_t i = 0; i < num_people; ++i)
100
    {
101
102
103
104
105
106
        size_t id = rnd.get_random_32bit_number()%objs.size();
        // don't pick a person we already added to the mini-batch
        while(already_selected[id])
            id = rnd.get_random_32bit_number()%objs.size();
        already_selected[id] = true;

107
108
109
110
111
112
113
114
115
        for (size_t j = 0; j < samples_per_id; ++j)
        {
            const auto& obj = objs[id][rnd.get_random_32bit_number()%objs[id].size()];
            load_image(image, obj);
            images.push_back(std::move(image));
            labels.push_back(id);
        }
    }

Davis King's avatar
Davis King committed
116
    // You might want to do some data augmentation at this point.  Here we do some simple
117
118
    // color augmentation.
    for (auto&& crop : images)
119
    {
120
        disturb_colors(crop,rnd);
121
122
123
124
        // Jitter most crops
        if (rnd.get_random_double() > 0.1)
            crop = jitter_image(crop,rnd);
    }
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139


    // All the images going into a mini-batch have to be the same size.  And really, all
    // the images in your entire training dataset should be the same size for what we are
    // doing to make the most sense.  
    DLIB_CASSERT(images.size() > 0);
    for (auto&& img : images)
    {
        DLIB_CASSERT(img.nr() == images[0].nr() && img.nc() == images[0].nc(), 
            "All the images in a single mini-batch must be the same size.");
    }
}

// ----------------------------------------------------------------------------------------

// The next page of code defines a ResNet network.  It's basically copied
// and pasted from the dnn_imagenet_ex.cpp example, except we replaced the loss
// layer with loss_metric and make the network somewhat smaller.
// Basic residual unit: runs SUBNET through `block` and adds the block's input
// back onto its output via the tag1/add_prev1 pair (an identity skip connection).
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

// Downsampling residual unit: the block uses stride 2, and the skip path is
// average-pooled 2x so both branches have matching dimensions before the add.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;

// The conv->BN->relu->conv->BN stack used inside each residual unit.  BN is a
// template parameter so the same definition works for bn_con (training) and
// affine (inference).
template <int N, template <typename> class BN, int stride, typename SUBNET> 
using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;


// "res*" aliases use bn_con (trainable batch normalization); the "ares*"
// aliases are the same layers with affine (fixed transforms) for testing.
template <int N, typename SUBNET> using res       = relu<residual<block,N,bn_con,SUBNET>>;
template <int N, typename SUBNET> using ares      = relu<residual<block,N,affine,SUBNET>>;
template <int N, typename SUBNET> using res_down  = relu<residual_down<block,N,bn_con,SUBNET>>;
template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;

// ----------------------------------------------------------------------------------------
// The network's stages, from deepest (level0, 256 feature maps) to shallowest
// (level4, 32 feature maps).  These use bn_con and form the training network.
template <typename SUBNET> using level0 = res_down<256,SUBNET>;
template <typename SUBNET> using level1 = res<256,res<256,res_down<256,SUBNET>>>;
template <typename SUBNET> using level2 = res<128,res<128,res_down<128,SUBNET>>>;
template <typename SUBNET> using level3 = res<64,res<64,res<64,res_down<64,SUBNET>>>>;
template <typename SUBNET> using level4 = res<32,res<32,res<32,SUBNET>>>;
// The same stages built from the affine ("a") residual blocks.  These mirror
// level0..level4 exactly and are used by the testing network type below.
template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>;
template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>;
template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>;
template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>;
template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>;
// training network type
Davis King's avatar
Davis King committed
175
using net_type = loss_metric<fc_no_bias<128,avg_pool_everything<
Davis King's avatar
Davis King committed
176
                            level0<
177
178
179
180
                            level1<
                            level2<
                            level3<
                            level4<
Davis King's avatar
Davis King committed
181
                            max_pool<3,3,2,2,relu<bn_con<con<32,7,7,2,2,
Davis King's avatar
Davis King committed
182
                            input_rgb_image
Davis King's avatar
Davis King committed
183
                            >>>>>>>>>>>>;
// testing network type (replaced batch normalization with fixed affine transforms)
Davis King's avatar
Davis King committed
186
using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything<
Davis King's avatar
Davis King committed
187
                            alevel0<
188
189
190
191
                            alevel1<
                            alevel2<
                            alevel3<
                            alevel4<
Davis King's avatar
Davis King committed
192
                            max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2,
Davis King's avatar
Davis King committed
193
                            input_rgb_image
Davis King's avatar
Davis King committed
194
                            >>>>>>>>>>>>;

// ----------------------------------------------------------------------------------------

int main(int argc, char** argv)
{
    if (argc != 2)
    {
202
203
204
205
206
        cout << "Give a folder as input.  It should contain sub-folders of images and we will " << endl;
        cout << "learn to distinguish between these sub-folders with metric learning.  " << endl;
        cout << "For example, you can run this program on the very small examples/johns dataset" << endl;
        cout << "that comes with dlib by running this command:" << endl;
        cout << "   ./dnn_metric_learning_on_images_ex johns" << endl;
207
208
209
210
211
212
213
214
215
216
217
218
219
        return 1;
    }

    auto objs = load_objects_list(argv[1]);

    cout << "objs.size(): "<< objs.size() << endl;

    std::vector<matrix<rgb_pixel>> images;
    std::vector<unsigned long> labels;


    net_type net;

Davis King's avatar
Davis King committed
220
    dnn_trainer<net_type> trainer(net, sgd(0.0001, 0.9));
221
222
223
    trainer.set_learning_rate(0.1);
    trainer.be_verbose();
    trainer.set_synchronization_file("face_metric_sync", std::chrono::minutes(5));
224
225
    // I've set this to something really small to make the example terminate
    // sooner.  But when you really want to train a good model you should set
Davis King's avatar
Davis King committed
226
    // this to something like 10000 so training doesn't terminate too early.
227
228
    trainer.set_iterations_without_progress_threshold(300);

229
230
231
232
233
    // If you have a lot of data then it might not be reasonable to load it all
    // into RAM.  So you will need to be sure you are decompressing your images
    // and loading them fast enough to keep the GPU occupied.  I like to do this
    // using the following coding pattern: create a bunch of threads that dump
    // mini-batches into dlib::pipes.  
234
235
236
237
238
239
240
241
242
243
244
    dlib::pipe<std::vector<matrix<rgb_pixel>>> qimages(4);
    dlib::pipe<std::vector<unsigned long>> qlabels(4);
    auto data_loader = [&qimages, &qlabels, &objs](time_t seed)
    {
        dlib::rand rnd(time(0)+seed);
        std::vector<matrix<rgb_pixel>> images;
        std::vector<unsigned long> labels;
        while(qimages.is_enabled())
        {
            try
            {
245
                load_mini_batch(5, 5, rnd, objs, images, labels);
246
247
248
249
250
251
252
253
254
255
                qimages.enqueue(images);
                qlabels.enqueue(labels);
            }
            catch(std::exception& e)
            {
                cout << "EXCEPTION IN LOADING DATA" << endl;
                cout << e.what() << endl;
            }
        }
    };
256
257
    // Run the data_loader from 5 threads.  You should set the number of threads
    // relative to the number of CPU cores you have.
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
    std::thread data_loader1([data_loader](){ data_loader(1); });
    std::thread data_loader2([data_loader](){ data_loader(2); });
    std::thread data_loader3([data_loader](){ data_loader(3); });
    std::thread data_loader4([data_loader](){ data_loader(4); });
    std::thread data_loader5([data_loader](){ data_loader(5); });


    // Here we do the training.  We keep passing mini-batches to the trainer until the
    // learning rate has dropped low enough.
    while(trainer.get_learning_rate() >= 1e-4)
    {
        qimages.dequeue(images);
        qlabels.dequeue(labels);
        trainer.train_one_step(images, labels);
    }

274
    // Wait for training threads to stop
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
    trainer.get_net();
    cout << "done training" << endl;

    // Save the network to disk
    net.clean();
    serialize("metric_network_renset.dat") << net;

    // stop all the data loading threads and wait for them to terminate.
    qimages.disable();
    qlabels.disable();
    data_loader1.join();
    data_loader2.join();
    data_loader3.join();
    data_loader4.join();
    data_loader5.join();





295
    // Now, just to show an example of how you would use the network, let's check how well
296
297
    // it performs on the training data.
    dlib::rand rnd(time(0));
298
    load_mini_batch(5, 5, rnd, objs, images, labels);
299

Davis King's avatar
Davis King committed
300
301
302
303
    // Normally you would use the non-batch-normalized version of the network to do
    // testing, which is what we do here.
    anet_type testing_net = net;

304
    // Run all the images through the network to get their vector embeddings.
Davis King's avatar
Davis King committed
305
    std::vector<matrix<float,0,1>> embedded = testing_net(images);
306

307
308
    // Now, check if the embedding puts images with the same labels near each other and
    // images with different labels far apart.
309
310
311
312
313
314
315
316
    int num_right = 0;
    int num_wrong = 0;
    for (size_t i = 0; i < embedded.size(); ++i)
    {
        for (size_t j = i+1; j < embedded.size(); ++j)
        {
            if (labels[i] == labels[j])
            {
317
                // The loss_metric layer will cause images with the same label to be less
318
319
                // than net.loss_details().get_distance_threshold() distance from each
                // other.  So we can use that distance value as our testing threshold.
Davis King's avatar
Davis King committed
320
                if (length(embedded[i]-embedded[j]) < testing_net.loss_details().get_distance_threshold())
321
322
323
324
325
326
                    ++num_right;
                else
                    ++num_wrong;
            }
            else
            {
Davis King's avatar
Davis King committed
327
                if (length(embedded[i]-embedded[j]) >= testing_net.loss_details().get_distance_threshold())
328
329
330
331
332
333
334
335
336
337
338
339
340
                    ++num_right;
                else
                    ++num_wrong;
            }
        }
    }

    cout << "num_right: "<< num_right << endl;
    cout << "num_wrong: "<< num_wrong << endl;

}