// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*

    This example shows how to use dlib to learn to do sequence segmentation.  In a sequence
    segmentation task we are given a sequence of objects (e.g. words in a sentence) and we
    are supposed to detect certain subsequences (e.g. the names of people).  Therefore, in
    the code below we create some very simple training sequences and use them to learn a
    sequence segmentation model.  In particular, our sequences will be sentences
    represented as arrays of words and our task will be to learn to identify person names.
    Once we have our segmentation model we can use it to find names in new sentences, as we
    will show.

*/


#include <iostream>
#include <cctype>
#include <dlib/svm_threaded.h>
#include <dlib/string.h>

using namespace std;
using namespace dlib;


// ----------------------------------------------------------------------------------------

class feature_extractor
{
    /*
        The sequence segmentation models we work with in this example are chain structured
        conditional random field style models.  Therefore, central to a sequence
        segmentation model is a feature extractor object.  This object defines all the
        properties of the model such as how many features it will use, and more importantly,
        how they are calculated.  

        This particular extractor is stateless: it has no data members, and every query
        is answered purely from the arguments passed to it.
    */

public:
    // This should be the type used to represent an input sequence.  It can be
    // anything so long as it has a .size() which returns the length of the sequence.
    typedef std::vector<std::string> sequence_type;

    // The next four lines define high-level properties of the feature extraction model.
    // See the documentation for the sequence_labeler object for an extended discussion of
    // how they are used (note that the main body of the documentation is at the top of the
    // file documenting the sequence_labeler).
    const static bool use_BIO_model           = true;
    const static bool use_high_order_features = true;
    const static bool allow_negative_weights  = true;
    unsigned long window_size()  const { return 3; }

    // This function defines the dimensionality of the vectors output by the get_features()
    // function defined below.
    unsigned long num_features() const { return 1; }

    template <typename feature_setter>
    void get_features (
        feature_setter& set_feature,
        const sequence_type& sentence,
        unsigned long position
    ) const
    /*!
        requires
            - position < sentence.size()
            - set_feature is a function object which allows expressions of the form:
                - set_feature((unsigned long)feature_index, (double)feature_value);
                - set_feature((unsigned long)feature_index);
        ensures
            - This function computes a feature vector which should capture the properties
              of sentence[position] that are informative relative to the sequence
              segmentation task you are trying to perform.
            - The output feature vector is returned as a sparse vector by invoking set_feature().
              For example, to set the feature with an index of 55 to the value of 1
              this method would call:
                set_feature(55);
              Or equivalently:
                set_feature(55,1);
              Therefore, the first argument to set_feature is the index of the feature
              to be set while the second argument is the value the feature should take.
              Additionally, note that calling set_feature() multiple times with the
              same feature index does NOT overwrite the old value, it adds to the
              previous value.  For example, if you call set_feature(55) 3 times then it
              will result in feature 55 having a value of 3.
            - This function only calls set_feature() with feature_index values < num_features()
            - In this implementation set_feature(0) is called exactly once if the first
              character of sentence[position] is an uppercase letter, and set_feature()
              is not called at all otherwise (including when the word is empty).
    !*/
    {
        // The model in this example program is very simple.  Our features only look at the 
        // capitalization pattern of the words.  So we have a single feature which checks
        // if the first letter is capitalized or not.  
        const std::string& word = sentence[position];
        if (word.empty())
            return;

        // The <cctype> classification functions require an argument representable as
        // unsigned char (or EOF).  Passing a plain char that happens to be negative, e.g.
        // the first byte of a UTF-8 encoded non-ASCII letter on platforms where char is
        // signed, is undefined behavior.  So convert to unsigned char first.
        if (std::isupper(static_cast<unsigned char>(word[0])))
            set_feature(0);
    }
};

// A learned model can only be written to and read back from a stream if its feature
// extractor can be too, so serialize() and deserialize() overloads must exist for
// feature_extractor.  Our feature_extractor has no state, which means there is nothing
// to save or restore and both functions are intentionally empty.  A feature extractor
// that does carry state would save and load that state here.
void serialize (
    const feature_extractor& /*item*/,
    std::ostream& /*out*/
)
{
    // nothing to write
}

void deserialize (
    feature_extractor& /*item*/,
    std::istream& /*in*/
)
{
    // nothing to read
}

// ----------------------------------------------------------------------------------------

void make_training_examples (
    std::vector<std::vector<std::string> >& samples,
    std::vector<std::vector<std::pair<unsigned long, unsigned long> > >& segments
)
/*!
    ensures
        - Appends six example sentences (each split into words) to samples and, for
          each sentence, appends to segments the list of word ranges that hold person
          names in that sentence.
        - Each range is half open: a pair (b, e) covers the words with indices b, b+1,
          ..., e-1.
        - The same number of elements is appended to samples and to segments, so if
          they had equal sizes on entry then #samples.size() == #segments.size().
!*/
{
    typedef std::pair<unsigned long, unsigned long> word_range;
    typedef std::vector<word_range> range_list;

    // First training example.  split() breaks the string into its 10 words.  The
    // person name "Jim Smith" occupies words 8 and 9, i.e. the half open range [8, 10).
    samples.push_back(split("The other day I saw a man named Jim Smith"));
    segments.push_back(range_list());
    segments.back().push_back(word_range(8, 10));

    // The remaining examples follow the same pattern: add the sentence, then add the
    // list of name ranges that belongs to it.

    samples.push_back(split("Davis King is the main author of the dlib Library"));
    segments.push_back(range_list());
    segments.back().push_back(word_range(0, 2));

    // A sentence may contain more than one name.
    samples.push_back(split("Bob Jones is a name and so is George Clinton"));
    segments.push_back(range_list());
    segments.back().push_back(word_range(0, 2));
    segments.back().push_back(word_range(8, 10));

    samples.push_back(split("My dog is named Bob Barker"));
    segments.push_back(range_list());
    segments.back().push_back(word_range(4, 6));

    // Names are not limited to two words.
    samples.push_back(split("ABC is an acronym but John James Smith is a name"));
    segments.push_back(range_list());
    segments.back().push_back(word_range(5, 8));

    // A sentence with no names at all gets an empty list of ranges.
    samples.push_back(split("No names in this sentence at all"));
    segments.push_back(range_list());
}

// ----------------------------------------------------------------------------------------

void print_segment (
    const std::vector<std::string>& sentence,
    const std::pair<unsigned long,unsigned long>& segment
)
/*!
    requires
        - segment.second <= sentence.size() whenever segment.first < segment.second
          (the words are indexed without any bounds checking)
    ensures
        - Writes the words sentence[segment.first] ... sentence[segment.second-1] to
          standard output, each followed by a single space, and then ends the line.
        - If segment.first >= segment.second then only the line ending is written.
!*/
{
    // A segment is a half open range: it begins at .first and stops just before .second.
    unsigned long word_index = segment.first;
    const unsigned long one_past_last = segment.second;

    while (word_index < one_past_last)
    {
        std::cout << sentence[word_index] << " ";
        ++word_index;
    }
    std::cout << std::endl;
}

// ----------------------------------------------------------------------------------------

int main()
{
    // Finally we make it into the main program body.  So the first thing we do is get our
    // training data.
    std::vector<std::vector<std::string> > samples;
    std::vector<std::vector<std::pair<unsigned long, unsigned long> > > segments;
    make_training_examples(samples, segments);


    // Next we use the structural_sequence_segmentation_trainer to learn our segmentation
    // model based on just the samples and segments.  But first we setup some of its
    // parameters.
    structural_sequence_segmentation_trainer<feature_extractor> trainer;
    // This is the common SVM C parameter.  Larger values encourage the trainer to attempt
    // to fit the data exactly but might overfit.  In general, you determine this parameter
    // by cross-validation.
    trainer.set_c(10);
    // This trainer can use multiple CPU cores to speed up the training.  So set this to
    // the number of available CPU cores. 
    trainer.set_num_threads(4);


    // Learn to do sequence segmentation from the dataset
    sequence_segmenter<feature_extractor> segmenter = trainer.train(samples, segments);


Davis King's avatar
Davis King committed
195
    // Let's print out all the segments our segmenter detects.
196
197
198
199
200
201
202
203
204
205
206
207
    for (unsigned long i = 0; i < samples.size(); ++i)
    {
        // get all the detected segments in samples[i]
        std::vector<std::pair<unsigned long,unsigned long> > seg = segmenter(samples[i]);
        // Print each of them
        for (unsigned long j = 0; j < seg.size(); ++j)
        {
            print_segment(samples[i], seg[j]);
        }
    }


Davis King's avatar
Davis King committed
208
    // Now let's test it on a new sentence and see what it detects.  
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
    std::vector<std::string> sentence(split("There once was a man from Nantucket whose name rhymed with Bob Bucket"));
    std::vector<std::pair<unsigned long,unsigned long> > seg = segmenter(sentence);
    for (unsigned long j = 0; j < seg.size(); ++j)
    {
        print_segment(sentence, seg[j]);
    }



    // We can also test the accuracy of the segmenter on a dataset.  This statement simply
    // tests on the training data.  In this case we will see that it predicts everything
    // correctly.
    cout << "\nprecision, recall, f1-score: " << test_sequence_segmenter(segmenter, samples, segments);
    // Similarly, we can do 5-fold cross-validation and print the results.  Just as before,
    // we see everything is predicted correctly.
    cout << "precision, recall, f1-score: " << cross_validate_sequence_segmenter(trainer, samples, segments, 5);





    // Finally, the segmenter can be serialized to disk just like most dlib objects.
231
    serialize("segmenter.dat") << segmenter;
232
233

    // recall from disk
234
    deserialize("segmenter.dat") >> segmenter;
235
236
237
238
}

// ----------------------------------------------------------------------------------------