#include #include #include #include #include #include #include bool is_whitespace(char ch) noexcept { // " \t\n\r\x0b\x0c" (python string.whitespace) return ch == 32 or (9 <= ch and ch <= 13); // return ch <= 32; // arguably too general, but slightly faster } bool is_punctuation(char c) noexcept { // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64, 91-96, 123-126 return (33 <= c and c <= 47) or (58 <= c and c <= 64) or (91 <= c and c <= 96) or (123 <= c and c <= 126); } // Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters // Returns a LARGE array of ngrams std::vector clean_ngram( std::string const & input, std::string const & ignore, size_t ngram_n ) noexcept { size_t num_grams = 0; std::vector ngram_list; std::vector gram_lengths; std::string current_ngram; // Max gram length is set to 10 below. current_ngram.reserve(11*ngram_n); gram_lengths.reserve(ngram_n); bool started_gram = false; gram_lengths.push_back(0); //for (size_t i=0; i 10) { // Skip all whitespace while (++iter != input.end() && is_whitespace(*iter)); iter--; if (started_gram){ num_grams += 1; // Building 1grams is a special case if (ngram_n == 1){ ngram_list.push_back(current_ngram); current_ngram = current_ngram.substr(gram_lengths.front()); gram_lengths.back() = 0; // If there are enough grams to form an ngram, save } else if (num_grams >= ngram_n){ // Save the current ngram ngram_list.push_back(current_ngram); // Start the next ngram by dropping the first gram and its space from the ngram current_ngram = current_ngram.substr(gram_lengths.front() + 1); current_ngram += ' '; // Drop the length of the first gram and prepare to record the length of the new gram gram_lengths.erase(gram_lengths.begin()); gram_lengths.push_back(0); // Otherwise, continute building } else { current_ngram += ' '; gram_lengths.push_back(0); } started_gram = false; } // Skip ignored characters // alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) continue; } else if (ignore.find(*iter) != std::string::npos) { continue; } // If it is a non-ignored character, add it to the ngram and update the last gram's length else { current_ngram += tolower(*iter); gram_lengths.back() += 1; started_gram = true; } } return ngram_list; } // Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters // Returns a LARGE array of tuples of (ngram, start_idx, end_idx) std::vector > clean_ngram_with_indices( std::string const & input, std::string const & ignore, size_t ngram_n ) noexcept { size_t num_grams = 0; std::vector > ngram_list; std::vector gram_lengths; std::vector gram_start_indices; std::string current_ngram; // Max gram length is set to 10 below. current_ngram.reserve(11*ngram_n); bool started_gram = false; gram_lengths.push_back(0); gram_start_indices.push_back(0); for (size_t i=0; i 10) { // Skip all whitespace while (++i < input.length() && is_whitespace(input[i])); i--; if (started_gram){ num_grams += 1; // Building 1grams is a special case if (ngram_n == 1){ ngram_list.push_back(std::make_tuple(current_ngram, gram_start_indices.front(), i)); current_ngram = current_ngram.substr(gram_lengths.front()); gram_lengths.back() = 0; gram_start_indices.back() = i+1; // If there are enough grams to form an ngram, save } else if (num_grams >= ngram_n){ // Save the current ngram ngram_list.push_back( std::make_tuple(current_ngram, gram_start_indices.front(), i) ); // Start the next ngram by dropping the first gram and its space from the ngram current_ngram = current_ngram.substr(gram_lengths.front() + 1); current_ngram += ' '; // Drop the length of the first gram and prepare to record the length of the new gram gram_lengths.erase(gram_lengths.begin()); gram_lengths.push_back(0); gram_start_indices.erase(gram_start_indices.begin()); gram_start_indices.push_back(i+1); // Otherwise, continute building } else { current_ngram += ' '; gram_lengths.push_back(0); gram_start_indices.push_back(i+1); } started_gram = false; } // Skip ignored characters } else if (ignore.find(*iter) != std::string::npos) { continue; // If it is a non-ignored character, add it to the ngram and update the last gram's length } else { current_ngram += tolower(ch); gram_lengths.back() += 1; started_gram = true; } } return ngram_list; } PYBIND11_MODULE(janitor_util, m) { m.doc() = "pybind11 example plugin"; // optional module docstring // m.def("add", &add, "A function which adds two numbers"); // example function m.def("clean_ngram", &clean_ngram, "Create ngrams of words, ignoring some characters"); m.def("clean_ngram_with_indices", &clean_ngram_with_indices, "Create ngrams of words with indices, ignoring some characters"); } // Example compile // c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) // If python and gcc aren't linked, append to the above: -undefined dynamic_lookup