Commit d8de2ca8 authored by chenych

Support vllm cpu offload

parent 2eaae45d
@@ -65,6 +65,7 @@ pip install -r requirements.txt
pip install "tensordict<0.6"
# compile
pip install -e .
+export LLAMA_NN=0
```
#### Dockerfile (Method 2)
@@ -79,6 +80,7 @@ pip install -r requirements.txt
pip install "tensordict<0.6"
# compile
pip install -e .
+export LLAMA_NN=0
```
#### Anaconda (Method 3)
@@ -89,7 +91,7 @@ python: 3.10
torch: 2.4.1
deepspeed: 0.14.2+das.opt2.dtk2504
flash-attn: 2.6.1+das.opt4.dtk2504
-vllm: 0.8.3
+vllm: 0.7.2+das.opt1.c137085.dtk2504
```
`Tips: the dtk driver, python, torch, and other DCU-related tool versions above must match strictly one-to-one`
@@ -99,6 +101,9 @@ pip install -r requirements.txt
pip install "tensordict<0.6"
# compile
pip install -e .
+## vllm cpu offload
+export LLAMA_NN=0
```
### Datasets
@@ -109,6 +114,7 @@ pip install -e .
- Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
### GRPO Training
+If you cannot connect to Hugging Face, first run `pip install -U huggingface_hub hf_transfer`, then add `export HF_ENDPOINT=https://hf-mirror.com` before launching
```bash
bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
@@ -121,7 +127,6 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
```
> [!NOTE]
-> If you cannot connect to Hugging Face, first run `pip install -U huggingface_hub hf_transfer`, then add `export HF_ENDPOINT=https://hf-mirror.com` before launching
>
> If you want to use the SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
@@ -159,3 +164,7 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
> RuntimeError: 0 active drivers ([]). There should only be one.
Uninstall `deepspeed` in the current python environment.
+> RuntimeError: No HIP GPUs are available
+Add `export HIP_VISIBLE_DEVICES=0,1,2,3` to `~/.bashrc`, then re-source the environment
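
On ROCm/DCU builds of PyTorch, the `torch.cuda` API is backed by HIP, so after setting `HIP_VISIBLE_DEVICES` you can sanity-check device visibility with a minimal sketch like the following (not part of this commit):

```python
import os
import torch

# On ROCm/DCU builds, torch.cuda is backed by HIP, so these calls report
# the devices exposed by HIP_VISIBLE_DEVICES.
print("HIP_VISIBLE_DEVICES =", os.environ.get("HIP_VISIBLE_DEVICES"))
print("cuda available:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())
```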
@@ -79,19 +79,17 @@ class vLLMRollout(BaseRollout):
            gpu_memory_utilization=config.gpu_memory_utilization,
            max_num_batched_tokens=config.max_num_batched_tokens,
            disable_log_stats=config.disable_log_stats,
-           enforce_eager=True,
+           enforce_eager=config.enforce_eager,
            disable_custom_all_reduce=True,
            limit_mm_per_prompt={"image": config.limit_images} if config.limit_images > 0 else None,
            disable_mm_preprocessor_cache=True,
            enable_chunked_prefill=config.enable_chunked_prefill,
            enable_sleep_mode=False,  # only support GPUs
-           # swap_space=20,
+           cpu_offload_gb=64,
        )
        # Offload vllm model to reduce peak memory usage
        # self.inference_engine.sleep(level=1)
-       ## TODO DCU: how to free GPU memory
-       # self.inference_engine.offload_model_weights()
        sampling_kwargs = {
            "max_tokens": config.response_length,
...
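
For reference, `cpu_offload_gb` is a standard vLLM engine argument: it uses up to the given number of GiB of host RAM per GPU as a virtual extension of device memory for model weights, at the cost of streaming part of the weights back to the device on each forward pass. A minimal standalone sketch of the same setting (model name and sizes are illustrative, not from this commit):

```python
from vllm import LLM, SamplingParams

# Illustrative only: offload up to 64 GiB of model weights per GPU to host
# memory. Peak device memory shrinks, but vLLM reloads offloaded weights
# during each forward pass, so throughput drops accordingly.
llm = LLM(
    model="Qwen/Qwen2.5-VL-7B-Instruct",  # assumption: any vLLM-supported model
    enforce_eager=True,                   # matches this rollout's eager path
    gpu_memory_utilization=0.6,
    cpu_offload_gb=64,
)
outputs = llm.generate(["What is GRPO?"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```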
@@ -88,8 +88,8 @@ class FSDPVLLMShardingManager(BaseShardingManager):
        del actor_weights
        torch.cuda.empty_cache()
-       if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
-           self.inference_engine.wake_up(tags=["kv_cache"])
+       # if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
+       #     self.inference_engine.wake_up(tags=["kv_cache"])
        print_gpu_memory_usage("After del state_dict and empty_cache in sharding manager")
        # important: need to manually set the random states of each tp to be identical.
@@ -101,8 +101,6 @@ class FSDPVLLMShardingManager(BaseShardingManager):
        print_gpu_memory_usage("Before vllm offload in sharding manager")
        free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
        # self.inference_engine.sleep(level=1)
-       ## TODO DCU: how to free GPU memory
-       # self.inference_engine.offload_model_weights()
        free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
        self.freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
...
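
The guard this commit comments out follows a general version-compatibility pattern: probe a method's signature with `inspect` before passing a keyword that only newer releases accept. A minimal sketch of the idea, independent of vLLM (the shim name is hypothetical):

```python
import inspect

def wake_up_compat(engine) -> None:
    # Hypothetical shim: newer vLLM versions accept wake_up(tags=...) for a
    # staged wake-up (e.g. only the KV cache); older ones take no arguments.
    if "tags" in inspect.signature(engine.wake_up).parameters:
        engine.wake_up(tags=["kv_cache"])
    else:
        engine.wake_up()
```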