Commit d8de2ca8 authored by chenych

Support vllm cpu offload

parent 2eaae45d
@@ -65,6 +65,7 @@ pip install -r requirements.txt
pip install "tensordict<0.6"
# Build and install
pip install -e .
+export LLAMA_NN=0
```
#### Dockerfile (Method 2)
@@ -79,6 +80,7 @@ pip install -r requirements.txt
pip install "tensordict<0.6"
# Build and install
pip install -e .
+export LLAMA_NN=0
```
#### Anaconda (Method 3)
@@ -89,7 +91,7 @@ python: 3.10
torch: 2.4.1
deepspeed: 0.14.2+das.opt2.dtk2504
flash-attn: 2.6.1+das.opt4.dtk2504
-vllm: 0.8.3
+vllm: 0.7.2+das.opt1.c137085.dtk2504
```
`Tip: the DTK driver, python, torch, and other DCU-related tool versions listed above must match exactly, one-to-one.`
@@ -99,6 +101,9 @@ pip install -r requirements.txt
pip install "tensordict<0.6"
# Build and install
pip install -e .
+## vllm cpu offload
+export LLAMA_NN=0
```
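For reference, CPU offload in vLLM is configured through the `cpu_offload_gb` engine argument (the rollout code in this commit passes `cpu_offload_gb=64`). A minimal sketch, with an illustrative model id and offload budget:

```python
from vllm import LLM

# Sketch of vLLM CPU offload: cpu_offload_gb lends up to this many GiB of
# CPU RAM to hold model weights, trading PCIe traffic for device memory.
llm = LLM(
    model="Qwen/Qwen2.5-VL-7B-Instruct",  # illustrative model id
    enforce_eager=True,                   # eager mode, as used on DCU here
    cpu_offload_gb=16,                    # offload budget in GiB (illustrative)
)
```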
### Dataset
@@ -109,6 +114,7 @@ pip install -e .
- Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
### GRPO Training
+If you cannot connect to Hugging Face, first run `pip install -U huggingface_hub hf_transfer`, then add `export HF_ENDPOINT=https://hf-mirror.com` before launching.
```bash
bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
@@ -121,7 +127,6 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
```
> [!NOTE]
-> If you cannot connect to Hugging Face, first run `pip install -U huggingface_hub hf_transfer`, then add `export HF_ENDPOINT=https://hf-mirror.com` before launching.
>
> If you want to use the SwanLab logger, consider running `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
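The mirror can also be selected programmatically, as long as the endpoint is set before `huggingface_hub` is imported; a minimal sketch (checkpoint id illustrative):

```python
import os

# Set the mirror endpoint before importing huggingface_hub,
# which reads HF_ENDPOINT at import time.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # requires `pip install hf_transfer`

from huggingface_hub import snapshot_download

# Illustrative checkpoint; substitute the model your script actually uses.
snapshot_download("Qwen/Qwen2.5-VL-7B-Instruct")
```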
@@ -159,3 +164,7 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
+> RuntimeError: 0 active drivers ([]). There should only be one.
+Uninstall `deepspeed` from the current Python environment.
+> RuntimeError: No HIP GPUs are available
+Add `export HIP_VISIBLE_DEVICES=0,1,2,3` to `~/.bashrc`, then re-source the environment.
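To verify the fix took effect, a quick check can help (on ROCm/DCU builds of PyTorch, `torch.cuda` maps to HIP):

```python
import torch

# Sanity-check device visibility after re-sourcing ~/.bashrc.
print(torch.cuda.is_available())   # expect True
print(torch.cuda.device_count())   # expect 4 with HIP_VISIBLE_DEVICES=0,1,2,3
print(torch.version.hip)           # HIP runtime version; None on CUDA builds
```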
@@ -79,19 +79,17 @@ class vLLMRollout(BaseRollout):
gpu_memory_utilization=config.gpu_memory_utilization,
max_num_batched_tokens=config.max_num_batched_tokens,
disable_log_stats=config.disable_log_stats,
-enforce_eager=True,
+enforce_eager=config.enforce_eager,
disable_custom_all_reduce=True,
limit_mm_per_prompt={"image": config.limit_images} if config.limit_images > 0 else None,
disable_mm_preprocessor_cache=True,
enable_chunked_prefill=config.enable_chunked_prefill,
enable_sleep_mode=False, # sleep mode only supports GPUs
# swap_space=20,
cpu_offload_gb=64,
)
# Offload vllm model to reduce peak memory usage
# self.inference_engine.sleep(level=1)
## TODO (DCU): how to release device memory
# self.inference_engine.offload_model_weights()
sampling_kwargs = {
"max_tokens": config.response_length,
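For context on the TODO above: on CUDA GPUs, vLLM can free rollout memory between training steps through its sleep/wake API, which is what the disabled `enable_sleep_mode` and the commented-out `sleep(level=1)` call refer to. A minimal sketch of that API (unavailable here per the "only supports GPUs" comment; model id illustrative):

```python
from vllm import LLM

# Sleep mode must be enabled at engine construction (CUDA-only per this diff).
llm = LLM(model="Qwen/Qwen2.5-VL-7B-Instruct", enable_sleep_mode=True)

llm.sleep(level=1)   # offload weights to CPU RAM and discard the KV cache
# ... run the FSDP training phase while device memory is released ...
llm.wake_up()        # reload weights before the next generation phase
```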
@@ -88,8 +88,8 @@ class FSDPVLLMShardingManager(BaseShardingManager):
del actor_weights
torch.cuda.empty_cache()
if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
self.inference_engine.wake_up(tags=["kv_cache"])
# if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
# self.inference_engine.wake_up(tags=["kv_cache"])
print_gpu_memory_usage("After del state_dict and empty_cache in sharding manager")
# important: need to manually set the random states of each tp to be identical.
@@ -101,8 +101,6 @@ class FSDPVLLMShardingManager(BaseShardingManager):
print_gpu_memory_usage("Before vllm offload in sharding manager")
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
# self.inference_engine.sleep(level=1)
## TODO (DCU): how to release device memory
# self.inference_engine.offload_model_weights()
free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
self.freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
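The freed-bytes bookkeeping above follows a simple pattern: sample `torch.cuda.mem_get_info()`, which returns `(free, total)` in bytes for the current device, before and after releasing memory. A standalone sketch of the same measurement:

```python
import torch

free_before, _ = torch.cuda.mem_get_info()
# ... release memory here, e.g. engine offload/sleep plus empty_cache() ...
torch.cuda.empty_cache()
free_after, _ = torch.cuda.mem_get_info()
print(f"freed {(free_after - free_before) / 2**30:.2f} GiB")
```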