; glm_config.ini

; Instance-level settings: buffer sizing, decoding/sampling parameters,
; parallel layout, and the model to load.
; Comments are kept on their own lines because many INI parsers treat
; trailing text after a value as part of the value.
[ft_instance_hyperparameter]
; Used to allocate the buffer.
max_batch_size=1
; Sequence length of the position embedding table.
; TODO: should move to the model hyper-parameters.
max_seq_len=512
; Beam width for beam search.
beam_width=1
; k value for top-k sampling.
top_k=5
; p value for top-p sampling.
top_p=0
; Used for sampling.
temperature=1.0
; Used for sampling.
repetition_penalty=1.0
; NOTE(review): presumably the length penalty used with beam search - confirm.
len_penalty=1.0
; NOTE(review): presumably only relevant when beam_width > 1 - confirm.
beam_search_diversity_rate=0.0
; NOTE(review): presumably 1 selects half precision, consistent with
; model_dtype=1 (fp16) below - confirm.
is_half=1
enable_custom_all_reduce=0

; NOTE(review): model_dir below names an "8card" checkpoint; presumably
; tensor_para_size must agree with how that checkpoint was split - confirm.
tensor_para_size=8
pipeline_para_size=1

; NOTE(review): presumably selects the section that holds the model
; hyper-parameters ([GLM_130B] in this file) - confirm.
model_name=GLM_130B
; Site-specific absolute path; adjust for the local environment.
model_dir=/parastor/home/zhouxiang/glm-130b-models/glm-130b-sat-8card-fp16/ft_models
; "fp32": 0, "fp16": 1, "int8": 2, "int4": 3
model_dtype=1

; Per-request settings. Comments are kept on their own lines because many
; INI parsers treat trailing text after a value as part of the value.
[request]
; Determined by the request.
request_batch_size=1
; Determined by the request.
request_output_len=128

; NOTE(review): presumably the model hyper-parameters referenced by
; model_name=GLM_130B - confirm.
[GLM_130B]
head_num=96
size_per_head=128
decoder_layers=70
rotary_embedding=64
vocab_size=150528
start_id=150004
end_id=150001
; Per the original author's note, changing this value has no effect.
inter_size=32768