[ft_instance_hyperparameter]
data_type=fp16 ; inference precision
enable_custom_all_reduce=0 ; 1 enables the custom all-reduce kernel instead of NCCL
pipeline_para_size=1 ; pipeline-parallel degree
tensor_para_size=1 ; tensor-parallel degree
model_dir=/workspace/models/triton_models/weights/

[request]
request_batch_size=8
request_output_len=2048 ; number of tokens to generate
beam_width=1 ; beam width for beam search
top_k=1 ; k value for top-k sampling
top_p=0.0 ; p value for top-p sampling
temperature=1.0 ; used for sampling
repetition_penalty=1.00 ; used for sampling
presence_penalty=0.0 ; only one of repetition_penalty and presence_penalty is allowed
len_penalty=0.0 ; used for beam search
beam_search_diversity_rate=0.0 ; used for beam search
; PJLM start/end ids
start_id=0
end_id=1
; --------------------- legacy params -------------------------
; LLaMA start/end ids
; start_id=1
; end_id=2

; Per-model architecture hyperparameters.
; Hidden size = head_num * size_per_head; inter_size is the FFN inner dimension.
[4999_llama]
head_num=80
size_per_head=128
vocab_size=65632
num_layer=82
rotary_embedding=128
norm_eps=1e-5
start_id=0
end_id=1
inter_size=27392

[llama_7B]
head_num=32
size_per_head=128
vocab_size=32000
num_layer=32
rotary_embedding=128
start_id=1
end_id=2
inter_size=11008

[llama_13B]
head_num=40
size_per_head=128
vocab_size=32000
num_layer=40
rotary_embedding=128
start_id=1
end_id=2
inter_size=13824

[llama_30B]
head_num=52
size_per_head=128
vocab_size=32000
num_layer=60
rotary_embedding=128
start_id=1
end_id=2
inter_size=17920

[llama_65B]
head_num=64
size_per_head=128
vocab_size=32000
num_layer=80
rotary_embedding=128
start_id=1
end_id=2
inter_size=22016
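
The sections above are plain INI, so they can be sanity-checked with Python's standard configparser before launching a job. The following is a minimal sketch, not part of FasterTransformer; the filename "llama_config.ini" and the chosen section are assumptions for illustration.

import configparser

# inline_comment_prefixes is needed because values above carry " ; " comments
cfg = configparser.ConfigParser(inline_comment_prefixes=(";",))
cfg.read("llama_config.ini")  # assumed filename for this config

model = "llama_7B"  # any model section defined above
head_num = cfg.getint(model, "head_num")
size_per_head = cfg.getint(model, "size_per_head")
hidden_size = head_num * size_per_head  # 32 * 128 = 4096 for llama_7B

print(model,
      "hidden_size:", hidden_size,
      "layers:", cfg.getint(model, "num_layer"),
      "inter_size:", cfg.getint(model, "inter_size"))

getint() raises if a key or section is missing, so a typo in a section name fails fast rather than silently launching with defaults.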