// dnn_mit67_ex.cpp



#include <dlib/dnn.h>
#include <dlib/svm.h>
#include <dlib/data_io.h>
#include <dlib/gui_widgets.h>
#include <dlib/image_transforms.h>
#include <dlib/dir_nav.h>

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

using namespace std;
using namespace dlib;
 
// ----------------------------------------------------------------------------------------

// Residual block variant that has an affine layer at each position where res (below)
// has a bn layer; the layer order is otherwise identical.  main() builds its
// evaluation network (anet_type) out of these.
// NOTE(review): presumably affine is the inference-time replacement for batch
// normalization -- confirm against the dlib layer documentation.
template <typename T> using ares = relu<affine<add_prev1<con<relu<affine<con<tag1<T>>>>>>>>;

// Residual block used by the training network.  Reading from the input outwards:
// tag1 marks the block input, followed by con -> bn -> relu -> con, then add_prev1
// adds the tagged input back in, followed by bn and a final relu.
template <typename T> using res = relu<bn<add_prev1<con<relu<bn<con<tag1<T>>>>>>>>;
std::tuple<relu_,bn_,add_prev1_,con_,relu_,bn_,con_> res_ (
    unsigned long outputs,
    unsigned long stride = 1
) 
{
    return std::make_tuple(relu_(),
                           bn_(CONV_MODE),
                           add_prev1_(),
                           con_(outputs,3,3,stride,stride),
                           relu_(),
                           bn_(CONV_MODE),
                           con_(outputs,3,3,stride,stride));
}

// ----------------------------------------------------------------------------------------

void randomly_crop_image (
    const matrix<rgb_pixel>& img,
    matrix<rgb_pixel>& crop,
    dlib::rand& rnd
)
{
    // figure out what rectangle we want to crop from the image
Davis King's avatar
Davis King committed
44
45
46
    //auto scale = 1-rnd.get_random_double()*0.2;
    double mins = 0.466666666, maxs = 0.875;
    auto scale = mins + rnd.get_random_double()*(maxs-mins);
Davis King's avatar
Davis King committed
47
48
49
50
51
52
53
    auto size = scale*std::min(img.nr(), img.nc());
    rectangle rect(size, size);
    // randomly shift the box around
    point offset(rnd.get_random_32bit_number()%(img.nc()-rect.width()),
                 rnd.get_random_32bit_number()%(img.nr()-rect.height()));
    rect = move_rect(rect, offset);

Davis King's avatar
Davis King committed
54
55
    // now crop it out as a 224x224 image.
    extract_image_chip(img, chip_details(rect, chip_dims(224,224)), crop);
Davis King's avatar
Davis King committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

    // Also randomly flip the image
    if (rnd.get_random_double() > 0.5)
        crop = fliplr(crop);

    // And then randomly adjust the color balance and gamma.
    disturb_colors(crop, rnd);
}

void randomly_crop_images (
    const matrix<rgb_pixel>& img,
    dlib::array<matrix<rgb_pixel>>& crops,
    dlib::rand& rnd,
    long num_crops
)
{
    std::vector<chip_details> dets;
    for (long i = 0; i < num_crops; ++i)
    {
        // figure out what rectangle we want to crop from the image
Davis King's avatar
Davis King committed
76
77
78
        //auto scale = 1-rnd.get_random_double()*0.2;
        double mins = 0.466666666, maxs = 0.875;
        auto scale = mins + rnd.get_random_double()*(maxs-mins);
Davis King's avatar
Davis King committed
79
80
81
82
83
84
85
        auto size = scale*std::min(img.nr(), img.nc());
        rectangle rect(size, size);
        // randomly shift the box around
        point offset(rnd.get_random_32bit_number()%(img.nc()-rect.width()),
            rnd.get_random_32bit_number()%(img.nr()-rect.height()));
        rect = move_rect(rect, offset);

Davis King's avatar
Davis King committed
86
        dets.push_back(chip_details(rect, chip_dims(224,224)));
Davis King's avatar
Davis King committed
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
    }

    extract_image_chips(img, dets, crops);

    for (auto&& img : crops)
    {
        // Also randomly flip the image
        if (rnd.get_random_double() > 0.5)
            img = fliplr(img);

        // And then randomly adjust the color balance and gamma.
        disturb_colors(img, rnd);
    }
}

// ----------------------------------------------------------------------------------------

// Describes one image file of the dataset.
struct image_info
{
    // Path of the image file as it is later handed to load_image().
    std::string filename;
    // Name of the sub folder the image was found in.
    std::string label;
    // Index of that sub folder in sorted order.  Defaults to 0 so that a default
    // constructed image_info never carries an indeterminate value.
    unsigned long numeric_label = 0;
};

// ----------------------------------------------------------------------------------------
std::vector<image_info> get_imagenet_listing(
Davis King's avatar
Davis King committed
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
    const std::string& images_folder
)
{
    std::vector<image_info> results;
    image_info temp;
    temp.numeric_label = 0;
    // loop over all the scene types in the dataset, each is contained in a subfolder.
    auto subdirs = directory(images_folder).get_dirs();
    // sort the sub directories so the numeric labels will be assigned in sorted order.
    std::sort(subdirs.begin(), subdirs.end());
    for (auto subdir : subdirs)
    {
        // Now get all the images in this scene type
        temp.label = subdir.name();
        for (auto image_file : subdir.get_files())
        {
            temp.filename = image_file;
            results.push_back(temp);
        }
        ++temp.numeric_label;
    }
    return results;
}

// Returns the label that occurs most often in votes.
//   votes - one predicted label per voter
// If several labels are tied for the highest count the smallest of them wins.
// An empty votes vector yields 0 instead of invoking undefined behavior.
unsigned long vote (
    const std::vector<unsigned long>& votes
)
{
    if (votes.empty())
        return 0;

    // One counter per possible label value, 0 up to the largest label seen.
    const unsigned long largest_label = *std::max_element(votes.begin(), votes.end());
    std::vector<unsigned long> counts(largest_label+1);
    for (const auto label : votes)
        ++counts[label];

    // std::max_element returns the first maximum, i.e. the lowest label among ties.
    const auto winner = std::max_element(counts.begin(), counts.end());
    return static_cast<unsigned long>(std::distance(counts.begin(), winner));
}

int main(int argc, char** argv) try
{
    if (argc != 3)
    {
        cout << "give MIT 67 scene folder as input and a weight decay value!" << endl;
        return 1;
    }

Davis King's avatar
Davis King committed
154
    auto listing = get_imagenet_listing(argv[1]);
Davis King's avatar
Davis King committed
155
    cout << "images in dataset: " << listing.size() << endl;
Davis King's avatar
Davis King committed
156
157
    const auto number_of_classes = listing.back().numeric_label+1;
    if (listing.size() == 0 || number_of_classes != 1000)
Davis King's avatar
Davis King committed
158
159
160
161
162
163
164
165
166
167
168
    {
        cout << "Didn't find the MIT 67 scene dataset.  Are you sure you gave the correct folder?" << endl;
        cout << "Give the Images folder as an argument to this program." << endl;
        return 1;
    }
        

    const double initial_step_size = 0.1;
    const double weight_decay = sa = argv[2];

    typedef loss_multiclass_log<fc<avg_pool<
Davis King's avatar
Davis King committed
169
170
171
172
                                res<res<res<
                                res<res<res<res<res<res<
                                res<res<res<res<
                                res<res<res<
Davis King's avatar
Davis King committed
173
174
                                max_pool<relu<bn<con<
                                input<matrix<rgb_pixel>
Davis King's avatar
Davis King committed
175
                                >>>>>>>>>>>>>>>>>>>>>>>> net_type;
Davis King's avatar
Davis King committed
176
177


Davis King's avatar
Davis King committed
178
    net_type net(fc_(number_of_classes),
Davis King's avatar
Davis King committed
179
                 avg_pool_(1000,1000,1000,1000),
Davis King's avatar
Davis King committed
180
181
182
183
                 res_(512),res_(512),res_(512,2),
                 res_(256),res_(256),res_(256),res_(256),res_(256),res_(256,2),
                 res_(128),res_(128),res_(128),res_(128,2),
                 res_(64), res_(64), res_(64),
Davis King's avatar
Davis King committed
184
185
186
187
188
189
190
191
192
                 max_pool_(3,3,2,2), relu_(), bn_(CONV_MODE), con_(64,7,7,2,2)
                );


    cout << "initial step size: "<< initial_step_size << endl;
    cout << "weight decay: " << weight_decay << endl;

    dnn_trainer<net_type> trainer(net,sgd(initial_step_size, weight_decay));
    trainer.be_verbose();
Davis King's avatar
Davis King committed
193
194
    trainer.set_synchronization_file("sync_imagenet_full_training_set_40000_minstep_"+cast_to_string(weight_decay), std::chrono::minutes(5));
    trainer.set_iterations_between_step_size_adjust(40000);
Davis King's avatar
Davis King committed
195
196
197
198
    std::vector<matrix<rgb_pixel>> samples;
    std::vector<unsigned long> labels;

    randomize_samples(listing);
Davis King's avatar
Davis King committed
199
    const size_t training_part = listing.size()*1.0;
Davis King's avatar
Davis King committed
200
201
202
203
204
205
206

    dlib::rand rnd;


    const bool do_training = true;
    if (do_training)
    {
Davis King's avatar
Davis King committed
207
        while(trainer.get_step_size() >= 1e-3)
Davis King's avatar
Davis King committed
208
209
210
211
        {
            samples.clear();
            labels.clear();

Davis King's avatar
Davis King committed
212
            // make a 128 image mini-batch
Davis King's avatar
Davis King committed
213
            matrix<rgb_pixel> img, crop;
Davis King's avatar
Davis King committed
214
            while(samples.size() < 128)
Davis King's avatar
Davis King committed
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
            {
                auto l = listing[rnd.get_random_32bit_number()%training_part];
                load_image(img, l.filename);
                randomly_crop_image(img, crop, rnd);
                samples.push_back(crop);
                labels.push_back(l.numeric_label);
            }

            trainer.train_one_step(samples, labels);
        }

        // wait for threaded processing to stop.
        trainer.get_net();

        net.clean();
        cout << "saving network" << endl;
Davis King's avatar
Davis King committed
231
        serialize("imagenet_full_training_set_40000_minstep_"+cast_to_string(weight_decay)+".dat") << net;
Davis King's avatar
Davis King committed
232
233
234
    }


Davis King's avatar
Davis King committed
235
    const bool test_network = false;
Davis King's avatar
Davis King committed
236
237
238
239
    if (test_network)
    {

        typedef loss_multiclass_log<fc<avg_pool<
Davis King's avatar
Davis King committed
240
241
242
243
            ares<ares<ares<
            ares<ares<ares<ares<ares<ares<
            ares<ares<ares<ares<
            ares<ares<ares<
Davis King's avatar
Davis King committed
244
245
            max_pool<relu<affine<con<
            input<matrix<rgb_pixel>
Davis King's avatar
Davis King committed
246
            >>>>>>>>>>>>>>>>>>>>>>>> anet_type;
Davis King's avatar
Davis King committed
247
248
    
        anet_type net;
Davis King's avatar
Davis King committed
249
        deserialize("imagenet_network3_"+cast_to_string(weight_decay)+".dat") >> net;
Davis King's avatar
Davis King committed
250
251
252
253
254
255
256
257

        dlib::array<matrix<rgb_pixel>> images;
        std::vector<unsigned long> labels;
        matrix<rgb_pixel> img, crop;
        cout << "loading images..." << endl;
        int num_right = 0;
        int num_wrong = 0;
        console_progress_indicator pbar(training_part);
Davis King's avatar
Davis King committed
258
        /*
Davis King's avatar
Davis King committed
259
260
261
262
263
264
265
266
267
268
269
270
        for (size_t i = 0; i < training_part; ++i)
        {
            pbar.print_status(i);
            load_image(img, listing[i].filename);

            randomly_crop_images(img, images, rnd, 16);
            unsigned long predicted_label = vote(net(images, 32));
            if (predicted_label == listing[i].numeric_label)
                ++num_right;
            else
                ++num_wrong;
        }
Davis King's avatar
Davis King committed
271
        */
Davis King's avatar
Davis King committed
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302

        cout << "\ntraining num_right: " << num_right << endl;
        cout << "training num_wrong: " << num_wrong << endl;
        cout << "training accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;

        pbar.reset(listing.size()-training_part);
        num_right = 0;
        num_wrong = 0;
        for (size_t i = training_part; i < listing.size(); ++i)
        {
            pbar.print_status(i-training_part);
            load_image(img, listing[i].filename);

            randomly_crop_images(img, images, rnd, 16);
            unsigned long predicted_label = vote(net(images, 32));
            if (predicted_label == listing[i].numeric_label)
                ++num_right;
            else
                ++num_wrong;
        }
        cout << "\ntesting num_right: " << num_right << endl;
        cout << "testing num_wrong: " << num_wrong << endl;
        cout << "testing accuracy:  " << num_right/(double)(num_right+num_wrong) << endl;
        return 0;
    }
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}