Commit ac9d2b05 authored by chenych

Fix sleep bug: disable vLLM sleep-mode offload (enable_sleep_mode=False, sleep() calls commented out) and drop the tensordict pin from the install steps

parent ff7fb65e
@@ -62,10 +62,8 @@ docker run -it --shm-size 200g --network=host --name docker_name --privileged --
 ## Install the required packages
 cd EasyR1
-pip install -r requirements.txt --no-deps
-## After commenting out accelerate, liger-kernel and tensordict, run the following step
 pip install -r requirements.txt
-pip install "tensordict<0.6"
 # Compile
 pip install -e .
 ```
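Note: with `tensordict` dropped from requirements.txt (see the requirements.txt hunk further down), the old two-pass install — first with `--no-deps`, then again with accelerate, liger-kernel and tensordict commented out, plus the separate `pip install "tensordict<0.6"` pin — is no longer needed; a single plain `pip install -r requirements.txt` suffices. A hypothetical post-install check, not part of this commit (expected versions taken from the environment list below):

```python
# Hypothetical post-install check (not part of this commit): confirm the
# DCU wheels resolved and that tensordict is no longer pulled in.
import importlib.util

import torch
import vllm

print(torch.__version__, vllm.__version__)     # expect 2.4.1+..., 0.7.2
print(importlib.util.find_spec("tensordict"))  # None in a fresh environment
```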
@@ -78,10 +76,8 @@ docker run -it --shm-size 200g --network=host --name docker_name --privileged --
 ## Install the required packages
 cd EasyR1
-pip install -r requirements.txt --no-deps
-## After commenting out accelerate, liger-kernel and tensordict, run the following step
 pip install -r requirements.txt
-pip install "tensordict<0.6"
 # Compile
 pip install -e .
 ```
@@ -94,15 +90,14 @@ python: 3.10
 torch: 2.4.1
 deepspeed: 0.14.2+das.opt2.dtk2504
 flash-attn: 2.6.1+das.opt4.dtk2504
+vllm: 0.7.2
 ```
 `Tips: the dtk driver, python, torch and other DCU-related tool versions above must match one another exactly`
 ```bash
 cd EasyR1
-pip install -r requirements.txt --no-deps
-## After commenting out accelerate, liger-kernel and tensordict, run the following step
 pip install -r requirements.txt
-pip install "tensordict<0.6"
 # Compile
 pip install -e .
 ```
...
@@ -12,7 +12,6 @@ pyarrow>=15.0.0
 pylatexenc
 qwen-vl-utils
 ray[default]
-tensordict
 torchdata
 transformers>=4.49.0
 wandb
...
@@ -75,7 +75,7 @@ class vLLMRollout(BaseRollout):
             enforce_eager=config.enforce_eager,
             max_model_len=config.prompt_length + config.response_length,
             max_num_batched_tokens=config.max_num_batched_tokens,
-            enable_sleep_mode=True,
+            enable_sleep_mode=False,
             distributed_executor_backend="external_launcher",
             disable_custom_all_reduce=True,
             disable_mm_preprocessor_cache=True,
@@ -86,7 +86,7 @@ class vLLMRollout(BaseRollout):
         )
         # Offload vllm model to reduce peak memory usage
-        self.inference_engine.sleep(level=1)
+        # self.inference_engine.sleep(level=1)
         sampling_kwargs = {"max_tokens": config.response_length, "detokenize": False}
         default_sampling_params = SamplingParams()
...
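For context, this is the vLLM sleep/wake API that the hunk above disables. A minimal sketch, assuming stock vLLM >= 0.7 on CUDA (the model name is a placeholder); the DCU dtk build of vLLM 0.7.2 presumably lacks this support, which appears to be the "sleep bug" this commit works around:

```python
# Minimal sketch of vLLM sleep mode (assumes stock vLLM >= 0.7 on CUDA;
# not supported on the DCU build this commit targets).
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)

llm.sleep(level=1)   # level 1: offload weights to CPU RAM, discard KV cache
# ... the freed GPU memory can be used by the training step here ...
llm.wake_up()        # reload weights onto the GPU before generating again

out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```

With sleep disabled, the engine keeps its weights and KV cache resident between rollouts, so peak GPU memory during training will be higher than the "Offload vllm model to reduce peak memory usage" comment suggests.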
@@ -102,7 +102,7 @@ class FSDPVLLMShardingManager(BaseShardingManager):
     def __exit__(self, exc_type, exc_value, traceback):
         print_gpu_memory_usage("Before vllm offload in sharding manager")
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
-        self.inference_engine.sleep(level=1)
+        # self.inference_engine.sleep(level=1)
         free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
         self.freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
         print_gpu_memory_usage("After vllm offload in sharding manager")
...
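With the `sleep(level=1)` call commented out, the bookkeeping in `__exit__` still runs but nothing is released between the two probes, so `freed_bytes` should now come out near zero. A minimal sketch of what it measures (`torch.cuda.mem_get_info()` returns a `(free, total)` tuple in bytes):

```python
# Sketch of the __exit__ measurement with sleep disabled: no memory is
# released between the two probes, so "freed" should be roughly zero.
import torch

free_before, _ = torch.cuda.mem_get_info()
# self.inference_engine.sleep(level=1)  # disabled by this commit
free_after, _ = torch.cuda.mem_get_info()
freed = free_after - free_before
print(f"freed {freed / 2**20:.1f} MiB")  # ~0 MiB now
```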