if(version!=kSeparatelyQuantizeVersion)UTIL_THROW(FormatLoadException,"This file has quantization version "<<(unsigned)version<<" but the code expects version "<<(unsigned)kSeparatelyQuantizeVersion);
if(config.prob_bits==0)UTIL_THROW(ConfigException,"You can't quantize probability to zero");
if(config.backoff_bits==0)UTIL_THROW(ConfigException,"You can't quantize backoff to zero");
if(config.prob_bits>25)UTIL_THROW(ConfigException,"For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested "<<static_cast<unsigned>(config.prob_bits)<<" bits.");
if(config.backoff_bits>25)UTIL_THROW(ConfigException,"For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested "<<static_cast<unsigned>(config.backoff_bits)<<" bits.");
UTIL_THROW(FormatLoadException,"Looks like a gzip file. If this is an ARPA file, pipe "<<in.FileName()<<" through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
UTIL_THROW(FormatLoadException,"This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?");
UTIL_THROW_IF(line.size()>=4&&StringPiece(line.data(),4)=="blmt",FormatLoadException,"This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?");
UTIL_THROW_IF(line=="iARPA",FormatLoadException,"This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes "<<in.FileName()<<" "<<in.FileName()<<".arpa\nfirst.");
UTIL_THROW(FormatLoadException,"first non-empty line was \""<<line<<"\" not \\data\\.");
}
while(!IsEntirelyWhiteSpace(line=in.ReadLine())){
if(line.size()<6||strncmp(line.data(),"ngram ",6))UTIL_THROW(FormatLoadException,"count line \""<<line<<"\"doesn't begin with \"ngram \"");
if((end_ptr==remaining.c_str())||(length-1!=number.size()))UTIL_THROW(FormatLoadException,"ngram count lengths should be consecutive starting with 1: "<<line);
if(*end_ptr!='=')UTIL_THROW(FormatLoadException,"Expected = immediately following the first number in the count line "<<line);
UTIL_THROW(FormatLoadException,"Expected newline after backoffs, got "<<got);
}
break;
case'\r':
ConsumeNewline(in);
// Intentionally no break.
case'\n':
backoff=ngram::kNoExtensionBackoff;
break;
default:
UTIL_THROW(FormatLoadException,"Expected tab or newline for backoff");
}
}
voidReadEnd(util::FilePiece&in){
StringPieceline;
do{
line=in.ReadLine();
}while(IsEntirelyWhiteSpace(line));
if(line!="\\end\\")UTIL_THROW(FormatLoadException,"Expected \\end\\ but the ARPA file has "<<line);
try{
while(true){
line=in.ReadLine();
if(!IsEntirelyWhiteSpace(line))UTIL_THROW(FormatLoadException,"Trailing line "<<line);
}
}catch(constutil::EndOfFileException&){}
}
voidPositiveProbWarn::Warn(floatprob){
switch(action_){
caseTHROW_UP:
UTIL_THROW(FormatLoadException,"Positive log probability "<<prob<<" in the model. This is a bug in IRSTLM; you can set config.positive_log_probability = SILENT or pass -i to build_binary to substitute 0.0 for the log probability. Error");
caseCOMPLAIN:
std::cerr<<"There's a positive log probability "<<prob<<" in the APRA file, probably because of a bug in IRSTLM. This and subsequent entires will be mapped to 0 log probability."<<std::endl;
// Backoff will always be 0.0. We'll get the probability and rest in another pass.
entry.value.backoff=kNoExtensionBackoff;
// Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
for(intlower=keys.size()-2;;--lower){
if(lower==-1){
between.push_back(&unigram);
return;
}
entry.key=keys[lower];
boolfound=middle[lower].FindOrInsert(entry,iter);
between.push_back(&iter->value);
if(found)return;
}
}
// Between usually has a single entry, the value to adjust. But sometimes SRI stupidly pruned entries, so it has uninitialized blank values to be set here.
UTIL_THROW(util::ProbingSizeException,"Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n");
if(fixed[0]!=initial[0])UTIL_THROW(util::Exception,"Unigram count should be constant but initial is "<<initial[0]<<" and recounted is "<<fixed[0]);
if(fixed.back()!=initial.back())UTIL_THROW(util::Exception,"Longest count should be constant but it changed from "<<initial.back()<<" to "<<fixed.back());
for(unsignedchari=0;i<initial.size();++i){
if(fixed[i]<initial[i])UTIL_THROW(util::Exception,"Counts came out lower than expected. This shouldn't happen");
// Write the last unigram entry, which is the end pointer for the bigrams.
writer.Unigram(counts[0]);
}
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
if(word_bits_>57)UTIL_THROW(util::Exception,"Sorry, word indices more than "<<(1ULL<<57)<<" are not implemented. Edit util/bit_packing.hh and fix the bit packing functions.");
// If the offset of the method changes, also change TrieSearch::UpdateConfigFromBinary.
bhiksha_(base,entries+1,max_next,config),
next_source_(&next_source){
if(entries+1>=(1ULL<<57)||(max_next>=(1ULL<<57)))UTIL_THROW(util::Exception,"Sorry, this does not support more than "<<(1ULL<<57)<<" n-grams of a particular order. Edit util/bit_packing.hh and fix the bit packing functions.");
// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams.