#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <stack>
#include <set>
#include <map>
#include <dlib/matrix.h>
#include <dlib/xml_parser.h>
#include <dlib/string.h>

using namespace std;
using namespace dlib;

// ----------------------------------------------------------------------------------------

// Only these computational layers have parameters
const std::set<string> comp_tags_with_params = {"fc", "fc_no_bias", "con", "affine_con", "affine_fc", "affine", "prelu"};

struct layer
{
    string type; // comp, loss, or input
    int idx;

    matrix<long,4,1> output_tensor_shape; // (N,K,NR,NC)

    string detail_name; // The name of the tag inside the layer tag.  e.g. fc, con, max_pool, input_rgb_image.
    std::map<string,double> attributes;
    matrix<double> params;
    long tag_id = -1;  // If this isn't -1 then it means this layer was tagged, e.g. wrapped with tag2<> giving tag_id==2.
    long skip_id = -1; // If this isn't -1 then it means this layer draws its inputs from
                       // the most recent layer with tag_id==skip_id rather than its immediate predecessor.

    double attribute (const string& key) const
    {
        auto i = attributes.find(key);
        if (i != attributes.end())
            return i->second;
        else
            throw dlib::error("Layer doesn't have the requested attribute '" + key + "'.");
    }

    string caffe_layer_name() const
    {
        if (type == "input")
            return "data";
        else
            return detail_name + to_string(idx);
    }
};

// ----------------------------------------------------------------------------------------

std::vector<layer> parse_dlib_xml(
    const matrix<long,4,1>& input_tensor_shape,
    const string& xml_filename
);

// ----------------------------------------------------------------------------------------

template <typename iterator>
const layer& find_layer (
    iterator i,
    long tag_id
)
/*!
    requires
        - i is a reverse iterator pointing to a layer in the list of layers produced by parse_dlib_xml().
        - i is not an input layer.
    ensures
        - if (tag_id == -1) then
            - returns the previous layer (i.e. closer to the input) to layer i.
        - else
            - returns the previous layer (i.e. closer to the input) to layer i with the given tag_id.
!*/
{
    if (tag_id == -1)
    {
        return *(i-1);
    }
    else
    {
        while(true)
        {
            i--;
            if (i->tag_id == tag_id)
                return *i;
            // if we hit the end of the network before we found what we were looking for
            if (i->type == "input")
                throw dlib::error("Network definition is bad, a layer wanted to skip back to a non-existing layer.");
        }
    }
}

template <typename iterator>
const layer& find_input_layer (iterator i) { return find_layer(i, i->skip_id); }

template <typename iterator>
string find_layer_caffe_name (
    iterator i,
    long tag_id
)
{
    return find_layer(i,tag_id).caffe_layer_name();
}

template <typename iterator>
string find_input_layer_caffe_name (iterator i) { return find_input_layer(i).caffe_layer_name(); }

// ----------------------------------------------------------------------------------------

template <typename EXP>
void print_as_np_array(std::ostream& out, const matrix_exp<EXP>& m)
{
    out << "np.array([";
    for (auto x : m)
        out << x << ",";
    out << "], dtype='float32')";
}

// ----------------------------------------------------------------------------------------

void convert_dlib_xml_to_caffe_python_code(
    const string& xml_filename,
    const long N,
    const long K,
    const long NR,
    const long NC
)
{
    const string out_filename = left_substr(xml_filename,".") + "_dlib_to_caffe_model.py";
    cout << "Writing model to " << out_filename << endl;
    ofstream fout(out_filename);
    fout.precision(9);
    const auto layers = parse_dlib_xml({N,K,NR,NC}, xml_filename);

    fout << "#\n";
    fout << "# !!! This file was automatically generated by dlib's tools/convert_dlib_nets_to_caffe utility. !!!\n";
    fout << "# !!! It contains all the information from a dlib DNN network and lets you save it as a caffe model. !!!\n";
    fout << "#\n";
    fout << "import caffe " << endl;
    fout << "from caffe import layers as L, params as P" << endl;
    fout << "import numpy as np" << endl;

    // dlib nets don't commit to a batch size, so we just emit whatever batch size the
    // caller asked for (1 is a reasonable default).
    fout << "\n# Input tensor dimensions" << endl;
    fout << "input_batch_size = " << N << ";" << endl;
    if (layers.back().detail_name == "input_rgb_image")
    {
        fout << "input_num_channels = 3;" << endl;
        fout << "input_num_rows = " << NR << ";" << endl;
        fout << "input_num_cols = " << NC << ";" << endl;
    }
    else
    {
        throw dlib::error("No known transformation from dlib's " + layers.back().detail_name + " input layer to caffe.");
    }
    fout << endl;

    fout << "def make_netspec():" << endl;
    fout << "    n = caffe.NetSpec()" << endl;
    fout << "    n.data, n.label = L.MemoryData(batch_size=input_batch_size, channels=input_num_channels, height=input_num_rows, width=input_num_cols, ntop=2)" << endl;

    // iterate the layers starting with the input layer
    for (auto i = layers.rbegin(); i != layers.rend(); ++i)
    {
        // skip input and loss layers
        if (i->type == "loss" || i->type == "input")
            continue;

        if (i->detail_name == "con")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Convolution(n." << find_input_layer_caffe_name(i);
            fout << ", num_output=" << i->attribute("num_filters");
            fout << ", kernel_w=" << i->attribute("nc");
            fout << ", kernel_h=" << i->attribute("nr");
            fout << ", stride_w=" << i->attribute("stride_x");
            fout << ", stride_h=" << i->attribute("stride_y");
            fout << ", pad_w=" << i->attribute("padding_x");
            fout << ", pad_h=" << i->attribute("padding_y");
            fout << ");\n";
        }
        else if (i->detail_name == "relu")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.ReLU(n." << find_input_layer_caffe_name(i);
            fout << ");\n";
        }
        else if (i->detail_name == "sig")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Sigmoid(n." << find_input_layer_caffe_name(i);
            fout << ");\n";
        }
        else if (i->detail_name == "prelu")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.PReLU(n." << find_input_layer_caffe_name(i);
            fout << ", channel_shared=True";
            fout << ");\n";
        }
        else if (i->detail_name == "max_pool")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Pooling(n." << find_input_layer_caffe_name(i);
            fout << ", pool=P.Pooling.MAX";
            if (i->attribute("nc")==0)
            {
                fout << ", global_pooling=True";
            }
            else
            {
                fout << ", kernel_w=" << i->attribute("nc");
                fout << ", kernel_h=" << i->attribute("nr");
            }
            fout << ", stride_w=" << i->attribute("stride_x");
            fout << ", stride_h=" << i->attribute("stride_y");
            fout << ", pad_w=" << i->attribute("padding_x");
            fout << ", pad_h=" << i->attribute("padding_y");
            fout << ");\n";
        }
        else if (i->detail_name == "avg_pool")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Pooling(n." << find_input_layer_caffe_name(i);
            fout << ", pool=P.Pooling.AVE";
            if (i->attribute("nc")==0)
            {
                fout << ", global_pooling=True";
            }
            else
            {
                fout << ", kernel_w=" << i->attribute("nc");
                fout << ", kernel_h=" << i->attribute("nr");
            }
            if (i->attribute("padding_x") != 0 || i->attribute("padding_y") != 0)
            {
                throw dlib::error("dlib and caffe implement pooling with non-zero padding differently, so you can't convert a "
                                  "network with such pooling layers.");
            }
            fout << ", stride_w=" << i->attribute("stride_x");
            fout << ", stride_h=" << i->attribute("stride_y");
            fout << ", pad_w=" << i->attribute("padding_x");
            fout << ", pad_h=" << i->attribute("padding_y");
            fout << ");\n";
        }
        else if (i->detail_name == "fc")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.InnerProduct(n." << find_input_layer_caffe_name(i);
            fout << ", num_output=" << i->attribute("num_outputs");
            fout << ", bias_term=True";
            fout << ");\n";
        }
        else if (i->detail_name == "fc_no_bias")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.InnerProduct(n." << find_input_layer_caffe_name(i);
            fout << ", num_output=" << i->attribute("num_outputs");
            fout << ", bias_term=False";
            fout << ");\n";
        }
        else if (i->detail_name == "bn_con" || i->detail_name == "bn_fc")
        {
            throw dlib::error("Conversion from dlib's batch norm layers to caffe's isn't supported.  Instead, "
                              "you should put your dlib network into 'test mode' by switching batch norm layers to affine layers.  "
                              "Then you can convert that 'test mode' network to caffe.");
        }
        else if (i->detail_name == "affine_con")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Scale(n." << find_input_layer_caffe_name(i);
            fout << ", bias_term=True";
            fout << ");\n";
        }
        else if (i->detail_name == "affine_fc")
        {
            fout << "    n." << i->caffe_layer_name() << " = L.Scale(n." << find_input_layer_caffe_name(i);
            fout << ", bias_term=True";
            fout << ");\n";
        }
        else if (i->detail_name == "add_prev")
        {
            auto in_shape1 = find_input_layer(i).output_tensor_shape;
            auto in_shape2 = find_layer(i,i->attribute("tag")).output_tensor_shape;
            if (in_shape1 != in_shape2)
            {
                // if only the number of channels differs then we will use a dummy layer to
                // pad with zeros.  But otherwise we will throw an error.
                if (in_shape1(0) == in_shape2(0) && in_shape1(2) == in_shape2(2) && in_shape1(3) == in_shape2(3))
                {
                    fout << "    n." << i->caffe_layer_name() << "_zeropad = L.DummyData(num=" << in_shape1(0);
                    fout << ", channels=" << std::abs(in_shape1(1)-in_shape2(1));
                    fout << ", height=" << in_shape1(2);
                    fout << ", width=" << in_shape1(3);
                    fout << ");\n";

                    string smaller_layer = find_input_layer_caffe_name(i);
                    string bigger_layer  = find_layer_caffe_name(i, i->attribute("tag"));
                    if (in_shape1(1) > in_shape2(1))
                        swap(smaller_layer, bigger_layer);

                    fout << "    n." << i->caffe_layer_name() << "_concat = L.Concat(n." << smaller_layer;
                    fout << ", n." << i->caffe_layer_name() << "_zeropad";
                    fout << ");\n";

                    fout << "    n." << i->caffe_layer_name() << " = L.Eltwise(n." << i->caffe_layer_name() << "_concat";
                    fout << ", n." << bigger_layer;
                    fout << ", operation=P.Eltwise.SUM";
                    fout << ");\n";
                }
                else
                {
                    std::ostringstream sout;
                    sout << "The dlib network contained an add_prev layer (layer idx " << i->idx << ") that adds two previous ";
                    sout << "layers with different output tensor dimensions.  Caffe's equivalent layer, Eltwise, doesn't support ";
                    sout << "adding layers together with different dimensions.  In the special case where the only difference is ";
                    sout << "in the number of channels, this converter program will add a dummy layer that outputs a tensor full of zeros ";
                    sout << "and concat it appropriately so this will work.  However, the network you are converting has tensor dimensions ";
                    sout << "that differ in values other than the number of channels.  In particular, here are the two tensor shapes (batch size, channels, rows, cols): ";
                    std::ostringstream sout2;
                    sout2 << wrap_string(sout.str()) << endl;
                    sout2 << trans(in_shape1);
                    sout2 << trans(in_shape2);
                    throw dlib::error(sout2.str());
                }
            }
            else
            {
                fout << "    n." << i->caffe_layer_name() << " = L.Eltwise(n." << find_input_layer_caffe_name(i);
                fout << ", n." << find_layer_caffe_name(i, i->attribute("tag"));
                fout << ", operation=P.Eltwise.SUM";
                fout << ");\n";
            }
        }
        else
        {
            throw dlib::error("No known transformation from dlib's " + i->detail_name + " layer to caffe.");
        }
    }
    fout << "    return n.to_proto();\n\n" << endl;

    // -----------------------------------------------------------------------------------
    // The next block of code outputs python code that populates all the filter weights.
    // -----------------------------------------------------------------------------------

    fout << "def set_network_weights(net):\n";
    fout << "    # populate network parameters\n";
    // iterate the layers starting with the input layer
    for (auto i = layers.rbegin(); i != layers.rend(); ++i)
    {
        // skip input and loss layers
        if (i->type == "loss" || i->type == "input")
            continue;

        if (i->detail_name == "con")
        {
            const long num_filters = i->attribute("num_filters");
            matrix<double> weights = trans(rowm(i->params, range(0, i->params.size()-num_filters-1)));
            matrix<double> biases  = trans(rowm(i->params, range(i->params.size()-num_filters, i->params.size()-1)));

            // main filter weights
            fout << "    p = "; print_as_np_array(fout,weights); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";

            // biases
            fout << "    p = "; print_as_np_array(fout,biases); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][1].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][1].data[:] = p;\n";
        }
        else if (i->detail_name == "fc")
        {
            matrix<double> weights = trans(rowm(i->params, range(0, i->params.nr()-2)));
            matrix<double> biases  = rowm(i->params, i->params.nr()-1);

            // main filter weights
            fout << "    p = "; print_as_np_array(fout,weights); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";

            // biases
            fout << "    p = "; print_as_np_array(fout,biases); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][1].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][1].data[:] = p;\n";
        }
        else if (i->detail_name == "fc_no_bias")
        {
            matrix<double> weights = trans(i->params);

            // main filter weights
            fout << "    p = "; print_as_np_array(fout,weights); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";
        }
        else if (i->detail_name == "affine_con" || i->detail_name == "affine_fc")
        {
            const long dims = i->params.size()/2;
            matrix<double> gamma = trans(rowm(i->params, range(0, dims-1)));
            matrix<double> beta  = trans(rowm(i->params, range(dims, 2*dims-1)));

            // set gamma weights
            fout << "    p = "; print_as_np_array(fout,gamma); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][0].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][0].data[:] = p;\n";

            // set beta weights
            fout << "    p = "; print_as_np_array(fout,beta); fout << ";\n";
            fout << "    p.shape = net.params['" << i->caffe_layer_name() << "'][1].data.shape;\n";
            fout << "    net.params['" << i->caffe_layer_name() << "'][1].data[:] = p;\n";
        }
        else if (i->detail_name == "prelu")
        {
            const double param = i->params(0);

            // main filter weights
            fout << "    tmp = net.params['" << i->caffe_layer_name() << "'][0].data.view();\n";
            fout << "    tmp.shape = 1;\n";
            fout << "    tmp[0] = " << param << ";\n";
        }
    }
}

// ----------------------------------------------------------------------------------------

class doc_handler : public document_handler
{
public:
    std::vector<layer> layers;
    bool seen_first_tag = false;

    layer next_layer;
    std::stack<string> current_tag;
    long tag_id = -1;

    virtual void start_document (
    )
    {
        layers.clear();
        seen_first_tag = false;
        tag_id = -1;
    }

    virtual void end_document (
    ) { }

    virtual void start_element (
        const unsigned long line_number,
        const std::string& name,
        const dlib::attribute_list& atts
    )
    {
        if (!seen_first_tag)
        {
            if (name != "net")
                throw dlib::error("The top level XML tag must be a 'net' tag.");
            seen_first_tag = true;
        }

        if (name == "layer")
        {
            next_layer = layer();
            if (atts["type"] == "skip")
            {
                // Don't make a new layer, just apply the tag id to the previous layer
                if (layers.size() == 0)
                    throw dlib::error("A skip layer was found as the first layer, but the first layer should be an input layer.");
                layers.back().skip_id = sa = atts["id"];
                // We intentionally leave next_layer empty so the end_element() callback
                // won't add it as another layer when called.
            }
            else if (atts["type"] == "tag")
            {
                // Don't make a new layer, just remember the tag id so we can apply it on
                // the next layer.
                tag_id = sa = atts["id"];
                // We intentionally leave next_layer empty so the end_element() callback
                // won't add it as another layer when called.
            }
            else
            {
                next_layer.idx = sa = atts["idx"];
                next_layer.type = atts["type"];
                if (tag_id != -1)
                {
                    next_layer.tag_id = tag_id;
                    tag_id = -1;
                }
            }
        }
        else if (current_tag.size() != 0 && current_tag.top() == "layer")
        {
            next_layer.detail_name = name;
            // copy all the XML tag's attributes into the layer struct
            atts.reset();
            while (atts.move_next())
                next_layer.attributes[atts.element().key()] = sa = atts.element().value();
        }

        current_tag.push(name);
    }

    virtual void end_element (
        const unsigned long line_number,
        const std::string& name
    )
    {
        current_tag.pop();
        if (name == "layer" && next_layer.type.size() != 0)
            layers.push_back(next_layer);
    }

    virtual void characters (
        const std::string& data
    )
    {
        if (current_tag.size() == 0)
            return;
        if (comp_tags_with_params.count(current_tag.top()) != 0)
        {
            istringstream sin(data);
            sin >> next_layer.params;
        }
    }

    virtual void processing_instruction (
        const unsigned long line_number,
        const std::string& target,
        const std::string& data
    ) { }
};

// ----------------------------------------------------------------------------------------

void compute_output_tensor_shapes(const matrix<long,4,1>& input_tensor_shape, std::vector<layer>& layers)
{
    DLIB_CASSERT(layers.back().type == "input");
    layers.back().output_tensor_shape = input_tensor_shape;
    for (auto i = ++layers.rbegin(); i != layers.rend(); ++i)
    {
        const auto input_shape = find_input_layer(i).output_tensor_shape;
        if (i->type == "comp")
        {
            if (i->detail_name == "fc" || i->detail_name == "fc_no_bias")
            {
                long num_outputs = i->attribute("num_outputs");
                i->output_tensor_shape = {input_shape(0), num_outputs, 1, 1};
            }
            else if (i->detail_name == "con")
            {
                long num_filters = i->attribute("num_filters");
                long filter_nc = i->attribute("nc");
                long filter_nr = i->attribute("nr");
                long stride_x = i->attribute("stride_x");
                long stride_y = i->attribute("stride_y");
                long padding_x = i->attribute("padding_x");
                long padding_y = i->attribute("padding_y");
                long nr = 1 + (input_shape(2) + 2*padding_y - filter_nr)/stride_y;
                long nc = 1 + (input_shape(3) + 2*padding_x - filter_nc)/stride_x;
                i->output_tensor_shape = {input_shape(0), num_filters, nr, nc};
            }
            else if (i->detail_name == "max_pool" || i->detail_name == "avg_pool")
            {
                long filter_nc = i->attribute("nc");
                long filter_nr = i->attribute("nr");
                long stride_x = i->attribute("stride_x");
                long stride_y = i->attribute("stride_y");
                long padding_x = i->attribute("padding_x");
                long padding_y = i->attribute("padding_y");
                long nr = 1 + (input_shape(2) + 2*padding_y - filter_nr)/stride_y;
                long nc = 1 + (input_shape(3) + 2*padding_x - filter_nc)/stride_x;
                i->output_tensor_shape = {input_shape(0), input_shape(1), nr, nc};
            }
            else if (i->detail_name == "add_prev")
            {
                auto aux_shape = find_layer(i, i->attribute("tag")).output_tensor_shape;
                for (long j = 0; j < input_shape.size(); ++j)
                    i->output_tensor_shape(j) = std::max(input_shape(j), aux_shape(j));
            }
            else
            {
                i->output_tensor_shape = input_shape;
            }
        }
        else
        {
            i->output_tensor_shape = input_shape;
        }
    }
}

// ----------------------------------------------------------------------------------------
std::vector<layer> parse_dlib_xml(
    const matrix<long,4,1>& input_tensor_shape,
    const string& xml_filename
)
{
    doc_handler dh;
    parse_xml(xml_filename, dh);
    if (dh.layers.size() == 0)
        throw dlib::error("No layers found in XML file!");
    if (dh.layers.back().type != "input")
        throw dlib::error("The network in the XML file is missing an input layer!");

    compute_output_tensor_shapes(input_tensor_shape, dh.layers);

    return dh.layers;
}

// ----------------------------------------------------------------------------------------
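
// ----------------------------------------------------------------------------------------
// A minimal driver sketch showing how convert_dlib_xml_to_caffe_python_code() might be
// invoked.  This is illustrative only and is not taken from the original tool: the
// command-line argument order used here (xml file followed by the N K NR NC input tensor
// dimensions) is an assumption made for the example.

int main(int argc, char** argv)
{
    try
    {
        if (argc != 6)
        {
            // hypothetical usage string, assuming the argument order described above
            cout << "usage: convert <dlib_net.xml> <N> <K> <NR> <NC>" << endl;
            return 1;
        }

        const string xml_filename = argv[1];
        const long N  = stol(argv[2]); // batch size
        const long K  = stol(argv[3]); // number of input channels
        const long NR = stol(argv[4]); // input rows
        const long NC = stol(argv[5]); // input columns

        convert_dlib_xml_to_caffe_python_code(xml_filename, N, K, NR, NC);
        return 0;
    }
    catch (std::exception& e)
    {
        cout << e.what() << endl;
        return 1;
    }
}

// ----------------------------------------------------------------------------------------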