#!/usr/bin/env bash
# Launches `vllm serve` for /module3/DeepSeek-R1-0528-W4A8-V2 on port 20011
# and mirrors the server's combined stdout/stderr into a timestamped log file
# under 1p_log/.
#
# Usage: run from the directory where the 1p_log/ directory should live.
# The file is kept POSIX-sh compatible (no arrays, no bash-only options).

# ---------------------------------------------------------------------------
# Environment switches consumed by the vLLM build in use.
# NOTE(review): the exact effect of each flag is defined by that build, not by
# this script; the notes below are translated from the original comments.
# ---------------------------------------------------------------------------

# Original note: "same stream" (presumably all-reduce shares the compute
# stream -- confirm against the vLLM build).
export ALLREDUCE_STREAM_WITH_COMPUTE=1

# Host IP address: first field printed by `hostname -I`.
# Assigned separately from `export` so the substitution's exit status is not
# masked and the value is never word-split by older /bin/sh implementations.
VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
export VLLM_HOST_IP

# Torch profiler output directory (disabled).
#export VLLM_TORCH_PROFILER_DIR=/workspace

export VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD=1
# Enable the fused rmsnorm+quant path; per the original note, the other
# optimizations are already enabled by default.
export USE_FUSED_RMS_QUANT=1
export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
export VLLM_SPEC_DECODE_EAGER=1
export VLLM_USE_GLOBAL_CACHE13=1
export VLLM_FUSED_MOE_CHUNK_SIZE=8192
export SENDRECV_STREAM_WITH_COMPUTE=1
export VLLM_ENABLE_TBO=0
export VLLM_REJECT_SAMPLE_OPT=1
export VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT=1
export VLLM_ZERO_OVERHEAD=1

# NUMA settings: one VLLM_RANK<i>_NUMA variable per rank, rank i -> value i.
# Presumably pins each of the 8 ranks to the NUMA node of the same index --
# verify against the host topology.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

# Minute-resolution timestamp used in the log file name.
current_time=$(date +"%Y%m%d-%H%M")

# tee cannot create missing parent directories, so make sure it exists.
log_dir=1p_log
mkdir -p -- "$log_dir"

# ---------------------------------------------------------------------------
# Server
# ---------------------------------------------------------------------------

# Disabled options are kept here, OUTSIDE the command below. A `#` comment
# inside a backslash-continued command ends the command at that point (the
# comment's trailing backslash does not continue the line), which would detach
# the `2>&1 | tee` redirection from `vllm serve`.
# To enable one, move it into the command as a regular `... \` line.
#
#   -pp 2 -tp 4
#   --enable-expert-parallel
#   --kv-transfer-config '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20011","send_type":"PUT_ASYNC"}}'

vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
  --port 20011 \
  --trust-remote-code \
  --dtype bfloat16 \
  -q slimquant_w4a8_marlin \
  --kv-cache-dtype fp8_e5m2 \
  --max-model-len 49152 \
  --max-num-batched-tokens 8192 \
  -tp 8 \
  --gpu-memory-utilization 0.93 \
  --max-num-seqs 512 \
  --disable-log-requests \
  --block-size 64 \
  --enforce-eager \
  --no-enable-prefix-caching \
  --enable-chunked-prefill \
  --speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 3}' \
  2>&1 | tee "${log_dir}/1p-${current_time}.log"