#!/usr/bin/env bash
# run_infer.sh: example inference and benchmark invocations for InfiniLM-fa on Hygon GPUs
cd InfiniLM-fa

# Basic inference
python examples/jiuge.py --hygon --model_path=../models/9g_8b_thinking_llama/
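
# Optional sanity check (a sketch, not part of the repo): fail fast if the model
# directory is missing before launching anything. MODEL_DIR is a variable
# introduced here for illustration; the path itself comes from the commands in
# this script.
MODEL_DIR=../models/9g_8b_thinking_llama/
[ -d "$MODEL_DIR" ] || { echo "model directory not found: $MODEL_DIR" >&2; exit 1; }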

# With paged attention + flash-attn (use block_size=64 for Hygon)
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --model_path=../models/9g_8b_thinking_llama/

# With graph compilation
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --enable-graph --model_path=../models/9g_8b_thinking_llama/

# Multi-GPU (tensor parallel)
python examples/jiuge.py --hygon --tp=4 --model_path=../models/9g_8b_thinking_llama/
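
# A sketch combining tensor parallelism with the paged-attn / flash-attn / graph
# flags shown above. That these options compose in a single run is an assumption
# here, not something this script demonstrates; check the jiuge.py help text.
python examples/jiuge.py --hygon --tp=4 --enable-paged-attn --paged_kv_block_size=64 \
    --attn=flash-attn --enable-graph --model_path=../models/9g_8b_thinking_llama/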

# Custom options
python examples/jiuge.py --hygon --model_path=<path> \
    --max_new_tokens=100 \
    --batch_size=1 \
    --prompt="How are you" \
    --top_k=1 --top_p=1.0 --temperature=1.0
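
# A minimal sketch (hypothetical convenience, not part of the repo): run the same
# greedy sampling settings over several prompts, one jiuge.py invocation each.
# Only flags already shown above are used.
for prompt in "How are you" "Introduce yourself in one sentence"; do
    python examples/jiuge.py --hygon --model_path=../models/9g_8b_thinking_llama/ \
        --max_new_tokens=100 --batch_size=1 \
        --prompt="$prompt" \
        --top_k=1 --top_p=1.0 --temperature=1.0
done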

# Benchmark, batch size 1: sweep input/output lengths, append results to a perf log
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
  --enable-paged-attn --attn=flash-attn --enable-graph \
  --input-len=32,256,4096 --output-len=256,1024,2048,4096 \
  --batch-size=1 --skip-load >> perf_hygon_bs_1.txt

# Benchmark, batch size 16: shorter input lengths, same output sweep
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
  --enable-paged-attn --attn=flash-attn --enable-graph \
  --input-len=32,256 --output-len=256,1024,2048,4096 \
  --batch-size=16 --skip-load >> perf_hygon_bs_16.txt
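
# A sketch generalizing the two runs above into a batch-size sweep, one perf log
# per size. The shared input-len list is an assumption (the bs=1 run above also
# covers 4096); adjust per size if needed. Flags are unchanged from above.
for bs in 1 16; do
    python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
        --enable-paged-attn --attn=flash-attn --enable-graph \
        --input-len=32,256 --output-len=256,1024,2048,4096 \
        --batch-size="$bs" --skip-load >> "perf_hygon_bs_${bs}.txt"
done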