Commit 688b6eac authored by SWHL's avatar SWHL
Browse files

Update files

parents
KenLM intermediate binary file
Counts 6 7 6
Payload pb
#!/bin/bash
# Regenerate the toy language-model test fixtures.
# Builds two 3-gram models with lmplz, writing both the intermediate binary
# files (toy0, toy1) and the corresponding ARPA files one directory up.
# NOTE(review): --discount_fallback is presumably needed because these tiny
# corpora lack the count-of-count statistics for proper discount estimation.
# -o 3 = model order, -S 100M = sort memory budget.
../../../../build/bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy0 --arpa ../toy0.arpa <<EOF
a a b a
b a a b
EOF
# Second corpus: deliberately different vocabulary (introduces "c", drops one
# "a" context) so the two models disagree — useful for interpolation tests.
../../../../build/bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy1 --arpa ../toy1.arpa <<EOF
a a b b b b b b b
c
EOF
KenLM intermediate binary file
Counts 5 7 7
Payload pb
KenLM intermediate binary file
Counts 6 7 6
Payload pb
\data\
ngram 1=5
ngram 2=7
ngram 3=7
\1-grams:
-0.90309 <unk> 0
0 <s> -0.30103
-0.46943438 a -0.30103
-0.5720968 </s> 0
-0.5720968 b -0.30103
\2-grams:
-0.37712017 <s> a -0.30103
-0.37712017 a a -0.30103
-0.2984526 b a -0.30103
-0.58682007 a </s> 0
-0.5220179 b </s> 0
-0.41574955 <s> b -0.30103
-0.58682007 a b -0.30103
\3-grams:
-0.14885087 <s> a a
-0.33741078 b a a
-0.124077894 <s> b a
-0.2997394 a b a
-0.42082912 b a </s>
-0.397617 a b </s>
-0.20102891 a a b
\end\
\data\
ngram 1=6
ngram 2=7
ngram 3=6
\1-grams:
-1 <unk> 0
0 <s> -0.30103
-0.6146491 a -0.30103
-0.6146491 </s> 0
-0.7659168 c -0.30103
-0.6146491 b -0.30103
\2-grams:
-0.4301247 <s> a -0.30103
-0.4301247 a a -0.30103
-0.20660876 c </s> 0
-0.5404639 b </s> 0
-0.4740302 <s> c -0.30103
-0.4301247 a b -0.30103
-0.3422159 b b -0.47712123
\3-grams:
-0.1638568 <s> a a
-0.09113217 <s> c </s>
-0.7462621 b b </s>
-0.1638568 a a b
-0.13823806 a b b
-0.13375957 b b b
\end\
#include "config.hh"
#include <iostream>
namespace lm {
namespace ngram {
// Default configuration for ngram model loading/building.  See config.hh for
// the documented meaning of every field; the values here are the library-wide
// defaults.  Initializer order must match the declaration order in config.hh.
Config::Config() :
show_progress(true),
messages(&std::cerr),
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
positive_log_probability(THROW_UP),
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GiB sort-buffer budget for trie building
temporary_directory_prefix(""),
arpa_complain(ALL),
write_mmap(NULL),
write_method(WRITE_AFTER),
include_vocab(true),
rest_function(REST_MAX),
prob_bits(8),
backoff_bits(8),
pointer_bhiksha_bits(22),
load_method(util::POPULATE_OR_READ) {}
} // namespace ngram
} // namespace lm
#ifndef LM_CONFIG_H
#define LM_CONFIG_H
#include "lm_exception.hh"
#include "../util/mmap.hh"
#include <iosfwd>
#include <string>
#include <vector>
/* Configuration for ngram model. Separate header to reduce pollution. */
namespace lm {
class EnumerateVocab;
namespace ngram {
// Runtime options for reading/writing ngram models.  Defaults are set by the
// Config() constructor in config.cc.  Member declaration order is load-bearing:
// the constructor's initializer list in config.cc follows it.
struct Config {
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
// (default true) print progress bar to messages
bool show_progress;
// Where to log messages including the progress bar. Set to NULL for
// silence.
std::ostream *messages;
// Stream to use for progress output, or NULL when show_progress is false.
std::ostream *ProgressMessages() const {
return show_progress ? messages : 0;
}
// This will be called with every string in the vocabulary by the
// constructor; it need only exist for the lifetime of the constructor.
// See enumerate_vocab.hh for more detail. Config does not take ownership;
// just delete/let it go out of scope after the constructor exits.
EnumerateVocab *enumerate_vocab;
// ONLY EFFECTIVE WHEN READING ARPA
// What to do when <unk> isn't in the provided model.
WarningAction unknown_missing;
// What to do when <s> or </s> is missing from the model.
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
WarningAction sentence_marker_missing;
// What to do with a positive log probability. For COMPLAIN and SILENT, map
// to 0.
WarningAction positive_log_probability;
// The probability to substitute for <unk> if it's missing from the model.
// No effect if the model has <unk> or unknown_missing == THROW_UP.
float unknown_missing_logprob;
// Size multiplier for probing hash table. Must be > 1. Space is linear in
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
// for sorted variant.
// If you find yourself setting this to a low number, consider using the
// TrieModel which has lower memory consumption.
float probing_multiplier;
// Amount of memory to use for building. The actual memory usage will be
// higher since this just sets sort buffer size. Only applies to trie
// models.
std::size_t building_memory;
// Template for temporary directory appropriate for passing to mkdtemp.
// The characters XXXXXX are appended before passing to mkdtemp. Only
// applies to trie. If empty, defaults to write_mmap. If that's NULL,
// defaults to input file name.
std::string temporary_directory_prefix;
// Level of complaining to do when loading from ARPA instead of binary format.
enum ARPALoadComplain {ALL, EXPENSIVE, NONE};
ARPALoadComplain arpa_complain;
// While loading an ARPA file, also write out this binary format file. Set
// to NULL to disable.
const char *write_mmap;
enum WriteMethod {
WRITE_MMAP, // Map the file directly.
WRITE_AFTER // Write after we're done.
};
WriteMethod write_method;
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
bool include_vocab;
// Left rest options. Only used when the model includes rest costs.
enum RestFunction {
REST_MAX, // Maximum of any score to the left
REST_LOWER, // Use lower-order files given below.
};
RestFunction rest_function;
// Only used for REST_LOWER.
std::vector<std::string> rest_lower_files;
// Quantization options. Only effective for QuantTrieModel. One value is
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
// to quantize (and one of the remaining backoffs will be 0).
uint8_t prob_bits, backoff_bits;
// Bhiksha compression (simple form). Only works with trie.
uint8_t pointer_bhiksha_bits;
// ONLY EFFECTIVE WHEN READING BINARY
// How to get the giant array into memory: lazy mmap, populate, read etc.
// See util/mmap.hh for details of MapMethod.
util::LoadMethod load_method;
// Set defaults.
Config();
};
} /* namespace ngram */ } /* namespace lm */
#endif // LM_CONFIG_H
#ifndef LM_ENUMERATE_VOCAB_H
#define LM_ENUMERATE_VOCAB_H
#include "word_index.hh"
#include "../util/string_piece.hh"
namespace lm {
/* If you need the actual strings in the vocabulary, inherit from this class
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does
* not take ownership. Add is called once per vocab word. index starts at 0
* and increases by 1 each time. This is only used by the Model constructor;
* the pointer is not retained by the class.
*/
// Abstract callback interface for receiving vocabulary strings; see the
// comment above for usage.  Pass a pointer via Config::enumerate_vocab.
class EnumerateVocab {
public:
virtual ~EnumerateVocab() {}
// Called once per vocabulary word; index starts at 0 and increases by 1
// on each call.
virtual void Add(WordIndex index, const StringPiece &str) = 0;
protected:
// Interface class: only constructible by derived implementations.
EnumerateVocab() {}
};
} // namespace lm
#endif // LM_ENUMERATE_VOCAB_H
#ifndef LM_FACADE_H
#define LM_FACADE_H
#include "virtual_interface.hh"
#include "../util/string_piece.hh"
#include <string>
namespace lm {
namespace base {
// Common model interface that depends on knowing the specific classes.
// Curiously recurring template pattern: Child is the concrete model type,
// which must provide FullScore and FullScoreForgotState.
template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
  public:
    typedef StateT State;
    typedef VocabularyT Vocabulary;

    // Translate from void* to State and forward to the child's FullScore.
    // static_cast (not reinterpret_cast) is the correct cast for converting
    // void* back to an object pointer.
    FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
      return static_cast<const Child*>(this)->FullScore(
          *static_cast<const State*>(in_state),
          new_word,
          *static_cast<State*>(out_state));
    }

    // As BaseFullScore, but takes a reversed context word range instead of an
    // input state.
    FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
      return static_cast<const Child*>(this)->FullScoreForgotState(
          context_rbegin,
          context_rend,
          new_word,
          *static_cast<State*>(out_state));
    }

    // Default Score function calls FullScore.  Model can override this.
    float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
      return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
    }

    // Type-erased entry point for Score; in_state/out_state must point to State.
    float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const {
      return static_cast<const Child*>(this)->Score(
          *static_cast<const State*>(in_state),
          new_word,
          *static_cast<State*>(out_state));
    }

    const State &BeginSentenceState() const { return begin_sentence_; }
    const State &NullContextState() const { return null_context_; }
    const Vocabulary &GetVocabulary() const { return *static_cast<const Vocabulary*>(&BaseVocabulary()); }

  protected:
    ModelFacade() : Model(sizeof(State)) {}

    virtual ~ModelFacade() {}

    // begin_sentence and null_context can disappear after.  vocab should stay.
    void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
      begin_sentence_ = begin_sentence;
      null_context_ = null_context;
      begin_sentence_memory_ = &begin_sentence_;
      null_context_memory_ = &null_context_;
      base_vocab_ = &vocab;
      order_ = order;
    }

  private:
    State begin_sentence_, null_context_;
};
} // namespace base
} // namespace lm
#endif // LM_FACADE_H
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
# Source files for the kenlm filter library.  Each path is prefixed with
# ${CMAKE_CURRENT_SOURCE_DIR} so the variable remains valid if referenced
# from CMake files in the parent directory.
set(KENLM_FILTER_SOURCE
    ${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc
)

# Build the filter library and link it against the utility library.
add_library(kenlm_filter ${KENLM_FILTER_SOURCE})
target_link_libraries(kenlm_filter PUBLIC kenlm_util)

# Installed headers live under include/kenlm rather than include, so
# consumers of the exported target need that directory on their include path.
target_include_directories(kenlm_filter PUBLIC $<INSTALL_INTERFACE:include/kenlm>)

# Command-line executables built from this directory.
AddExes(EXES filter phrase_table_vocab
        LIBRARIES kenlm_filter kenlm)

install(TARGETS kenlm_filter
        EXPORT kenlmTargets
        RUNTIME DESTINATION bin
        LIBRARY DESTINATION lib
        ARCHIVE DESTINATION lib
        INCLUDES DESTINATION include)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment