更新vllm版本到0.18.1，搭配vllm-omni 0.18.0版本

aedfc254 · weishb · 84b8a706 · aedfc254 · aedfc254 · aedfc254
Commit aedfc254 authored Apr 23, 2026 by weishb
6 changed files
--- a/README.md
+++ b/README.md
@@ -21,11 +21,11 @@ Qwen3-TTS 覆盖10种主要语言（中文、英文、日文、韩文、德文
 | DTK | 26.04 |
 | python | 3.10.12 |
 | transformers | 4.57.6 |
-| vllm | 0.15.1+das.opt1.alpha.dtk2604 |
+| vllm | 0.18.1+das.dtk2604 |
-| torchaudio | torchaudio-2.9.0+das.opt1.dtk2604.20260206.g275d08c2 |
+| torchaudio | 2.10.0 |
-| vllm-omni | 0.15.1+fix1 |
+| vllm-omni | 0.18.0 |
-推荐使用镜像:harbor.sourcefind.cn:5443/dcu/admin/base/custom:vllm0.15.1-ubuntu22.04-dtk26.04-0130-py3.10-20260220
+推荐使用镜像:harbor.sourcefind.cn:5443/dcu/admin/base/custom:vllm018-ubuntu22.04-dtk26.04-nemotron-20260422
 ```bash
 docker run -it \
@@ -42,7 +42,7 @@ docker run -it \
    -u root \
    -v /opt/hyhal/:/opt/hyhal/:ro \
    -v /path/your_code_data/:/path/your_code_data/ \
-    harbor.sourcefind.cn:5443/dcu/admin/base/custom:vllm0.15.1-ubuntu22.04-dtk26.04-0130-py3.10-20260220 bash
+    harbor.sourcefind.cn:5443/dcu/admin/base/custom:vllm018-ubuntu22.04-dtk26.04-nemotron-20260422 bash
 ```
 更多镜像可前往[光源](https://sourcefind.cn/#/service-list)下载使用。
@@ -53,11 +53,8 @@ pip install -r requirements.txt
 镜像内其他环境配置
 ```
-pip uninstall vllm
+pip install vllm_omni==0.18.0
-pip install vllm-0.15.1+das.opt1.alpha.dtk2604-cp310-cp310-linux_x86_64.whl  --no-deps
+pip install torchaudio-2.10.0-cp310-cp310-linux_x86_64.whl --no-deps
-pip install vllm_omni-0.15.1+fix1-py3-none-any.whl
-pip install torchaudio-2.9.0+das.opt1.dtk2604.20260206.g275d08c2-cp310-cp310-linux_x86_64.whl --no-deps
-pip install pycountry
 ```

--- a/qwen3_tts.yaml
+++ b/qwen3_tts.yaml
+async_chunk: true
 stage_args:
  - stage_id: 0
-    stage_type: llm  # Use llm stage type to launch OmniLLM
+    stage_type: llm
+    is_comprehension: true
    runtime:
      devices: "0"
-      max_batch_size: 1
    engine_args:
      model_stage: qwen3_tts
-      model_arch: Qwen3TTSForConditionalGeneration
+      max_num_seqs: 10
+      model_arch: Qwen3TTSTalkerForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      enforce_eager: false
+      trust_remote_code: true
+      async_scheduling: true
+      enable_prefix_caching: false
+      engine_output_type: latent
+      gpu_memory_utilization: 0.3
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 512
+      max_model_len: 4096
+      custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk
+    # Reference the connector by name so that the settings under runtime.connectors.<name>.extra apply.
+    output_connectors:
+      to_stage_1: connector_of_shared_memory
+    default_sampling_params:
+      temperature: 0.9
+      top_k: 50
+      max_tokens: 4096
+      seed: 42
+      detokenize: false
+      repetition_penalty: 1.05
+      stop_token_ids: [2150]
+  - stage_id: 1
+    stage_type: llm
+    runtime:
+      devices: "0"
+    engine_args:
+      model_stage: code2wav
+      max_num_seqs: 1
+      model_arch: Qwen3TTSCode2Wav
      worker_type: generation
      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
      enforce_eager: true
      trust_remote_code: true
-      async_scheduling: false
+      async_scheduling: true
      enable_prefix_caching: false
-      engine_output_type: audio  # Final output: audio waveform
+      engine_output_type: audio
-      gpu_memory_utilization: 0.1
+      gpu_memory_utilization: 0.3
      distributed_executor_backend: "mp"
-      max_num_batched_tokens: 1000000
+      # Must be divisible by num_code_groups and cover (left_context + chunk).
+      max_num_batched_tokens: 8192
+      # async_chunk appends windows at every step, so max_model_len must cover the whole accumulated stream.
+      max_model_len: 32768
+    engine_input_source: [0]
    final_output: true
    final_output_type: audio
+    # Distributed connector configuration (input from stage 0 uses the named connector defined under runtime.connectors).
+    input_connectors:
+      from_stage_0: connector_of_shared_memory
+    tts_args:
+      max_instructions_length: 500
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 65536
+      seed: 42
+      detokenize: true
+      repetition_penalty: 1.0
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1
+    max_inflight: 1
+  connectors:
+    connector_of_shared_memory:
+      name: SharedMemoryConnector
+      extra:
+        shm_threshold_bytes: 65536
+        # Frame-aligned codec streaming transport.
+        codec_streaming: true
+        # Connector polling / timeout: the max-wait limits are counted in polling loops; the sleep interval per loop is in seconds.
+        connector_get_sleep_s: 0.01
+        connector_get_max_wait_first_chunk: 3000
+        connector_get_max_wait: 300
+        # Align with Omni: small chunks with sufficient context overlap.
+        codec_chunk_frames: 25
+        codec_left_context_frames: 25
+  edges:
+    - from: 0
+      to: 1
+      window_size: -1
\ No newline at end of file
--- a/torchaudio-2.10.0-cp310-cp310-linux_x86_64.whl
+++ b/torchaudio-2.10.0-cp310-cp310-linux_x86_64.whl
--- a/torchaudio-2.9.0+das.opt1.dtk2604.20260206.g275d08c2-cp310-cp310-linux_x86_64.whl
+++ b/torchaudio-2.9.0+das.opt1.dtk2604.20260206.g275d08c2-cp310-cp310-linux_x86_64.whl
--- a/vllm-0.15.1+das.opt1.alpha.dtk2604-cp310-cp310-linux_x86_64.whl
+++ b/vllm-0.15.1+das.opt1.alpha.dtk2604-cp310-cp310-linux_x86_64.whl
--- a/vllm_omni-0.15.1+fix1-py3-none-any.whl
+++ b/vllm_omni-0.15.1+fix1-py3-none-any.whl