cd InfiniLM-fa

# Basic inference
python examples/jiuge.py --hygon --model_path=../models/9g_8b_thinking_llama/

# With paged attention + flash-attn (use block_size=64 for Hygon)
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --model_path=../models/9g_8b_thinking_llama/

# With graph compilation
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --enable-graph --model_path=../models/9g_8b_thinking_llama/

# Multi-GPU (tensor parallel)
python examples/jiuge.py --hygon --tp=4 --model_path=../models/9g_8b_thinking_llama/

# Custom options
python examples/jiuge.py --hygon --model_path= \
    --max_new_tokens=100 \
    --batch_size=1 \
    --prompt="How are you" \
    --top_k=1 --top_p=1.0 --temperature=1.0

# Benchmark (batch size 1)
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
    --enable-paged-attn --attn=flash-attn --enable-graph \
    --input-len=32,256,4096 --output-len=256,1024,2048,4096 \
    --batch-size=1 --skip-load >> perf_hygon_bs_1.txt

# Benchmark (batch size 16)
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
    --enable-paged-attn --attn=flash-attn --enable-graph \
    --input-len=32,256 --output-len=256,1024,2048,4096 \
    --batch-size=16 --skip-load >> perf_hygon_bs_16.txt
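
The two benchmark runs above differ only in batch size, input-length list, and output file, so a small shell loop can sweep additional batch sizes with the same flags. This is a minimal sketch using only the options already shown; the MODEL variable and the choice to reuse the shorter input-length list for every batch size are assumptions, not part of the original commands.

# Sketch: sweep batch sizes with the same bench.py options as above.
# MODEL and the fixed input-length list are assumptions; note the original
# batch-size-1 run also included 4096 in --input-len.
MODEL=../models/9g_8b_thinking_llama/
for BS in 1 16; do
    python examples/bench.py --hygon --warmup --model="$MODEL" \
        --enable-paged-attn --attn=flash-attn --enable-graph \
        --input-len=32,256 --output-len=256,1024,2048,4096 \
        --batch-size="$BS" --skip-load >> "perf_hygon_bs_${BS}.txt"
done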