; llama_config.ini
[ft_instance_hyperparameter]
data_type=fp16
enable_custom_all_reduce=0
pipeline_para_size=1
tensor_para_size=1
model_dir=/workspace/models/triton_models/weights/
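; Note (assumption, following FasterTransformer conventions): tensor_para_size *
; pipeline_para_size should match the number of GPUs serving the model, and
; model_dir should point at weights already converted for that parallel layout.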


[request]
request_batch_size=8
request_output_len=2048
beam_width=1 ; beam width for beam search
top_k=1 ; k value for top k sampling
top_p=0.0 ; p value for top p sampling
temperature=1.0 ; Used for sampling
repetition_penalty=1.00 ; Used for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty is allowed.
len_penalty=0.0
beam_search_diversity_rate=0.0
; PJLM start/end ids
start_id=0
end_id=1
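; Note (assumption): with beam_width=1, top_k=1 and top_p=0.0 the request is
; effectively greedy decoding; raise top_k or top_p to get stochastic sampling.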


; --------------------- legacy params -------------------------

; LLaMA start/end ids
; start_id=1
; end_id=2
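
; Note (assumption, based on FasterTransformer-style LLaMA configs): in the model
; sections below, hidden size = head_num * size_per_head (e.g. 32 * 128 = 4096 for
; llama_7B), inter_size is the FFN intermediate dimension, and start_id/end_id are
; the tokenizer BOS/EOS token ids.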

[4999_llama]
head_num=80
size_per_head=128
vocab_size=65632
num_layer=82
rotary_embedding=128
norm_eps=1e-5
start_id=0
end_id=1
inter_size=27392

[llama_7B]
head_num=32
size_per_head=128
vocab_size=32000
num_layer=32
rotary_embedding=128
start_id=1
end_id=2
inter_size=11008

[llama_13B]
head_num=40
size_per_head=128
vocab_size=32000
num_layer=40
rotary_embedding=128
start_id=1
end_id=2
inter_size=13824

[llama_30B]
head_num=52
size_per_head=128
vocab_size=32000
num_layer=60
rotary_embedding=128
start_id=1
end_id=2
inter_size=17920

[llama_65B]
head_num=64
size_per_head=128
vocab_size=32000
num_layer=80
rotary_embedding=128
start_id=1
end_id=2
inter_size=22016