#!/usr/bin/env bash
#
# Launch `vllm serve` for /module3/DeepSeek-R1-0528-W4A8-V2 on port 20009,
# with tensor parallelism 8, MTP speculative decoding and a P2pNcclConnector
# KV-transfer configuration in the "kv_consumer" role.
#
# Combined stdout/stderr of the server is shown on the terminal and also
# written to 1d_log/2d-<YYYYmmdd-HHMM>.log (relative to the current directory).
#
# The script exits with the status of `vllm serve` (via pipefail) instead of
# the status of `tee`.

set -euo pipefail

# --- Runtime environment -----------------------------------------------------

# Run all-reduce on the same stream as compute.
export ALLREDUCE_STREAM_WITH_COMPUTE=1

# IP address of this host: first address reported by `hostname -I`.
# Assignment is split from `export` so the command's status is not masked;
# if detection fails the variable is exported empty, matching the previous
# best-effort behavior.
VLLM_HOST_IP=$(hostname -I | awk '{print $1}') || VLLM_HOST_IP=""
export VLLM_HOST_IP

# Output directory for torch profiler traces.
export VLLM_TORCH_PROFILER_DIR=/workspace

export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
export VLLM_FUSED_MOE_CHUNK_SIZE=8192

# Does not affect performance; helps resolve OOM.
export VLLM_USE_GLOBAL_CACHE13=1

# Default value.
export VLLM_ENABLE_TBO=0

export VLLM_ZERO_OVERHEAD=1

# Kept disabled, as in the original script.
#export VLLM_P2P_ASYNC=1

# --- Logging -----------------------------------------------------------------

current_time=$(date +"%Y%m%d-%H%M")
readonly current_time

readonly log_dir="1d_log"
readonly log_file="${log_dir}/2d-${current_time}.log"

# tee cannot create missing parent directories; without this the log would be
# silently lost on a fresh checkout.
mkdir -p -- "${log_dir}"

# --- Server configuration ----------------------------------------------------

# JSON payloads are kept in single quotes so the shell passes them verbatim.
readonly speculative_config='{"method": "deepseek_mtp", "num_speculative_tokens": 3}'
readonly kv_transfer_config='{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"1e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20009","send_type":"PUT_ASYNC","mem_pool_size_gb":256}}'

# --- Launch ------------------------------------------------------------------

vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
  --host 0.0.0.0 \
  --port 20009 \
  --trust-remote-code \
  --dtype bfloat16 \
  -q slimquant_w4a8_marlin \
  --kv-cache-dtype fp8_e5m2 \
  --max-model-len 49152 \
  -tp 8 \
  --gpu-memory-utilization 0.95 \
  --max-num-seqs 256 \
  --block-size 64 \
  --disable-log-requests \
  --max-num-batched-tokens 8192 \
  --no-enable-prefix-caching \
  --enable-chunked-prefill \
  --speculative_config "${speculative_config}" \
  --kv-transfer-config "${kv_transfer_config}" \
  2>&1 | tee "${log_file}"