("read_vocab_table",po::value<std::string>(&vocab_table),"Vocabulary hash table to read. This should be a probing hash table with size at the beginning.")
("write_vocab_list",po::value<std::string>(&vocab_list),"Vocabulary list to write as null-delimited strings.");
UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
std::cout << vocab.Lookup(*i) << ' ';
}
// TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
// throw if more pruning thresholds were specified than the model has n-gram orders
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning)
UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures.");
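// For example, --prune 0 0 1 passes this check (0, 0, 1 is non-decreasing), while
// --prune 1 0 would throw: a unigram could be pruned while a bigram containing it
// survives, leaving a query-time data structure with a missing substring.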
UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i + 1), util::Exception, "The discount for count " << (i + 1) << " was parsed as " << discount << " which is not in the range [0, " << (i + 1) << "].");
ret.amount[i + 1] = discount;
}
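// For example, parsing "0.5 1 1.5" yields ret.amount[1] = 0.5, ret.amount[2] = 1.0,
// and ret.amount[3] = 1.5, i.e. the fallback discounts for adjusted counts 1, 2, and 3+.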
return ret;
}
} // namespace
int main(int argc, char *argv[]) {
try {
namespace po = boost::program_options;
po::options_description options("Language model building options");
("interpolate_unigrams",po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true),"Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
("skip_symbols",po::bool_switch(),"Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
("vocab_estimate",po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000),"Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("vocab_pad",po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0),"If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
("verbose_header",po::bool_switch(&verbose_header),"Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text",po::value<std::string>(&text),"Read text from a file instead of stdin")
("arpa",po::value<std::string>(&arpa),"Write ARPA to a file instead of stdout")
("intermediate",po::value<std::string>(&intermediate),"Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
("renumber",po::bool_switch(&pipeline.renumber_vocabulary),"Renumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
("collapse_values",po::bool_switch(&pipeline.output_q),"Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
("prune",po::value<std::vector<std::string>>(&pruning)->multitoken(),"Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
("limit_vocab_file",po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""),"Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
("discount_fallback",po::value<std::vector<std::string>>(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default,"0.5 1 1.5"),"The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
/* Outputs from lmplz: ARPA, sharded files, etc */
namespace lm { namespace builder {
// These are different types of hooks. Values should be consecutive to enable a vector lookup.
enum HookType {
// TODO: counts.
PROB_PARALLEL_HOOK, // Probability and backoff (or just q). Output must process the orders in parallel or there will be a deadlock.
PROB_SEQUENTIAL_HOOK, // Probability and backoff (or just q). Output can process orders any way it likes. This requires writing the data to disk then reading. Useful for ARPA files, which put unigrams first etc.
NUMBER_OF_HOOKS // Keep this last so we know how many values there are.
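// Since the values above are consecutive and NUMBER_OF_HOOKS comes last, hooks can be
// kept in a plain array indexed by HookType, e.g. (element type assumed for illustration):
//   std::vector<SomeHook> hooks[NUMBER_OF_HOOKS];
//   hooks[PROB_SEQUENTIAL_HOOK].push_back(hook);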
master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q, specials);
"Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ". Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size.");
Master master(config, output.Steps());
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.