#!/usr/bin/python # The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt # # This example shows how to use dlib to learn to do sequence segmentation. In # a sequence segmentation task we are given a sequence of objects (e.g. words in # a sentence) and we are supposed to detect certain subsequences (e.g. the names # of people). Therefore, in the code below we create some very simple training # sequences and use them to learn a sequence segmentation model. In particular, # our sequences will be sentences represented as arrays of words and our task # will be to learn to identify person names. Once we have our segmentation # model we can use it to find names in new sentences, as we will show. # # COMPILING THE DLIB PYTHON INTERFACE # Dlib comes with a compiled python interface for python 2.7 on MS Windows. If # you are using another python version or operating system then you need to # compile the dlib python interface before you can use this file. To do this, # run compile_dlib_python_module.bat. This should work on any operating # system so long as you have CMake and boost-python installed. # On Ubuntu, this can be done easily by running the command: # sudo apt-get install libboost-python-dev cmake import sys import dlib # The sequence segmentation models we work with in this example are chain # structured conditional random field style models. Therefore, central to a # sequence segmentation model is some method for converting the elements of a # sequence into feature vectors. That is, while you might start out representing # your sequence as an array of strings, the dlib interface works in terms of # arrays of feature vectors. Each feature vector should capture important # information about its corresponding element in the original raw sequence. So # in this example, since we work with sequences of words and want to identify # names, we will create feature vectors that tell us if the word is capitalized # or not. 
# In our simple data, a capitalization feature is enough to identify names.
# Therefore, we define sentence_to_vectors() which takes a sentence represented
# as a string and converts it into an array of words and then associates a
# feature vector with each word.
def sentence_to_vectors(sentence):
    """Convert a sentence into an array of dense feature vectors.

    sentence is a string; it is split on whitespace and one 1-dimensional
    dlib.vector is produced per word, in order.  Returns a dlib.vectors
    object.
    """
    # Create an empty array of vectors
    vects = dlib.vectors()
    for word in sentence.split():
        # Our vectors are very simple 1-dimensional vectors.  The value of the
        # single feature is 1 if the first letter of the word is capitalized and
        # 0 otherwise.  (split() never yields empty strings, so word[0] is
        # always valid.)
        vects.append(dlib.vector([1 if word[0].isupper() else 0]))
    return vects


# Dlib also supports the use of a sparse vector representation.  This is more
# efficient than the above form when you have very high dimensional vectors that
# are mostly full of zeros.  In dlib, each sparse vector is represented as an
# array of pair objects.  Each pair contains an index and value.  Any index not
# listed in the vector is implicitly associated with a value of zero.
# Additionally, when using sparse vectors with dlib.train_sequence_segmenter()
# you can use "unsorted" sparse vectors.  This means you can add the index/value
# pairs into your sparse vectors in any order you want and don't need to worry
# about them being in sorted order.
def sentence_to_sparse_vectors(sentence):
    """Convert a sentence into an array of sparse feature vectors.

    This is the sparse counterpart of sentence_to_vectors(): one
    dlib.sparse_vector per whitespace-separated word, in order.  Returns a
    dlib.sparse_vectors object.
    """
    vects = dlib.sparse_vectors()
    has_cap = dlib.sparse_vector()
    no_cap = dlib.sparse_vector()
    # make has_cap equivalent to dlib.vector([1])
    has_cap.append(dlib.pair(0, 1))

    # Since we didn't add anything to no_cap it is equivalent to
    # dlib.vector([0])
    for word in sentence.split():
        if word[0].isupper():
            vects.append(has_cap)
        else:
            vects.append(no_cap)
    return vects


def print_segment(sentence, names):
    """Print the words of each detected name, one name per line.

    sentence is the original string and names is an iterable of index ranges
    (e.g. dlib.range objects); iterating a range yields the positions of the
    words, within sentence.split(), that make up one name.  Every word is
    followed by a single space and every name by a newline.
    """
    words = sentence.split()
    for name in names:
        # Assemble the whole line first so each name is emitted with one write.
        sys.stdout.write("".join(words[i] + " " for i in name) + "\n")


# Now let's make some training data.  Each example is a sentence as well as a
# set of ranges which indicate the locations of any names.
names = dlib.ranges()      # make an array of dlib.range objects.
segments = dlib.rangess()  # make an array of arrays of dlib.range objects.
sentences = ["The other day I saw a man named Jim Smith",
             "Davis King is the main author of the dlib Library",
             "Bob Jones is a name and so is George Clinton",
             "My dog is named Bob Barker",
             "ABC is an acronym but John James Smith is a name",
             "No names in this sentence at all"]

# We want to detect person names.  So we note that the name is located within
# the range [8, 10).  Note that we use half open ranges to identify segments.
# So in this case, the segment identifies the string "Jim Smith".
#
# The same names object is refilled for every sentence below; the code relies
# on segments.append() keeping its own copy of what was appended, so clearing
# names afterwards does not disturb the labels already recorded.
names.append(dlib.range(8, 10))
segments.append(names)
names.clear()  # make names empty for use again below

# "Davis King"
names.append(dlib.range(0, 2))
segments.append(names)
names.clear()

# "Bob Jones" and "George Clinton"
names.append(dlib.range(0, 2))
names.append(dlib.range(8, 10))
segments.append(names)
names.clear()

# "Bob Barker"
names.append(dlib.range(4, 6))
segments.append(names)
names.clear()

# "John James Smith"
names.append(dlib.range(5, 8))
segments.append(names)
names.clear()

# The last sentence contains no names, so it gets an empty set of ranges.
segments.append(names)
names.clear()

# Now before we can pass these training sentences to the dlib tools we need to
# convert them into arrays of vectors as discussed above.  We can use either a
# sparse or dense representation depending on our needs.  In this example, we
# show how to do it both ways.
use_sparse_vects = False
if use_sparse_vects:
    # Make an array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    to_vectors = sentence_to_sparse_vectors
else:
    # Make an array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    to_vectors = sentence_to_vectors
# The converter is chosen once, above, and reused for the test sentence later
# so that training and prediction always use the same vector representation.
for s in sentences:
    training_sequences.append(to_vectors(s))

# Now that we have a simple training set we can train a sequence segmenter.
# However, the sequence segmentation trainer has some optional parameters we can
# set.  These parameters determine properties of the segmentation model we will
# learn.  See the dlib documentation for the sequence_segmenter object for a
# full discussion of their meanings.
params = dlib.segmenter_params()
params.window_size = 3
params.use_high_order_features = True
params.use_BIO_model = True
# This is the common SVM C parameter.  Larger values encourage the trainer to
# attempt to fit the data exactly but might overfit.  In general, you determine
# this parameter by cross-validation.
params.C = 10

# Train a model.  The model object is responsible for predicting the locations
# of names in new sentences.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)

# Let's print out the things the model thinks are names.  The output is a set
# of ranges which are predicted to contain names.  If you run this example
# program you will see that it gets them all correct.
for i, s in enumerate(sentences):
    print_segment(s, model(training_sequences[i]))

# Let's also try segmenting a new sentence.  This will print out "Bob Bucket".
# Note that we need to remember to use the same vector representation as we used
# during training, which is why the converter selected above is reused here.
test_sentence = "There once was a man from Nantucket " \
                "whose name rhymed with Bob Bucket"
print_segment(test_sentence, model(to_vectors(test_sentence)))

# We can also measure the accuracy of a model relative to some labeled data.
# This statement prints the precision, recall, and F1-score of the model
# relative to the data in training_sequences/segments.
print("Test on training data: {}".format(
    dlib.test_sequence_segmenter(model, training_sequences, segments)))

# We can also do 5-fold cross-validation and print the resulting precision,
# recall, and F1-score.
print("Cross validation: {}".format(
    dlib.cross_validate_sequence_segmenter(training_sequences, segments, 5,
                                           params)))