Commit ac9d2b05 authored by chenych's avatar chenych

Fix sleep bug

parent ff7fb65e
@@ -62,10 +62,8 @@ docker run -it --shm-size 200g --network=host --name docker_name --privileged --
## Install the required packages
cd EasyR1
pip install -r requirements.txt --no-deps
## Comment out accelerate, liger-kernel and tensordict, then run the following steps
pip install -r requirements.txt
pip install "tensordict<0.6"
# Build
pip install -e .
```
@@ -78,10 +76,8 @@ docker run -it --shm-size 200g --network=host --name docker_name --privileged --
## Install the required packages
cd EasyR1
pip install -r requirements.txt --no-deps
## Comment out accelerate, liger-kernel and tensordict, then run the following steps
pip install -r requirements.txt
pip install "tensordict<0.6"
# Build
pip install -e .
```
@@ -94,15 +90,14 @@ python: 3.10
torch: 2.4.1
deepspeed: 0.14.2+das.opt2.dtk2504
flash-attn: 2.6.1+das.opt4.dtk2504
vllm: 0.7.2
```
`Tip: the versions of the dtk driver, python, torch and the other DCU-related tools above must correspond exactly, one to one`
```bash
cd EasyR1
pip install -r requirements.txt --no-deps
## Comment out accelerate, liger-kernel and tensordict, then run the following steps
pip install -r requirements.txt
pip install "tensordict<0.6"
# Build
pip install -e .
```
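The tip above asks for an exact match between the dtk driver, python, torch and the other DCU tool versions, and the install steps now pin tensordict below 0.6 by hand. An optional sanity check along these lines (not part of this commit) can catch a mismatched environment before training starts:

```python
# Optional environment check (not part of this commit): confirm the installed
# versions match the list above before launching training.
from importlib.metadata import version

import torch
import vllm

print("torch:", torch.__version__)           # expected 2.4.1 (+das/dtk build)
print("vllm:", vllm.__version__)             # expected 0.7.2
print("tensordict:", version("tensordict"))  # expected < 0.6
```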
@@ -12,7 +12,6 @@ pyarrow>=15.0.0
pylatexenc
qwen-vl-utils
ray[default]
-tensordict
torchdata
transformers>=4.49.0
wandb
@@ -75,7 +75,7 @@ class vLLMRollout(BaseRollout):
enforce_eager=config.enforce_eager,
max_model_len=config.prompt_length + config.response_length,
max_num_batched_tokens=config.max_num_batched_tokens,
-enable_sleep_mode=True,
+enable_sleep_mode=False,
distributed_executor_backend="external_launcher",
disable_custom_all_reduce=True,
disable_mm_preprocessor_cache=True,
@@ -86,7 +86,7 @@ class vLLMRollout(BaseRollout):
)
# Offload vllm model to reduce peak memory usage
-self.inference_engine.sleep(level=1)
+# self.inference_engine.sleep(level=1)
sampling_kwargs = {"max_tokens": config.response_length, "detokenize": False}
default_sampling_params = SamplingParams()
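The two hunks above are the core of the fix: the engine is now constructed with `enable_sleep_mode=False`, and the immediate `sleep(level=1)` offload after construction is commented out. vLLM's sleep mode offloads the model weights and drops the KV cache between rollout batches, but it relies on vLLM's CUDA-specific pluggable memory allocator, which the DCU/dtk stack targeted here presumably does not support, so the change trades higher resident GPU memory for a working rollout. For reference, a minimal sketch of how the sleep-mode API is used upstream (vLLM 0.7-style API; the model name is only a placeholder):

```python
# Reference sketch of vLLM's sleep-mode API (vLLM >= 0.7), i.e. the code path
# this patch disables on DCU. The model name is a placeholder.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", enable_sleep_mode=True)
outputs = llm.generate(["hello"], SamplingParams(max_tokens=8))

# level=1 offloads the weights to CPU and discards the KV cache, freeing GPU
# memory for the training step; wake_up() reloads the weights before the next
# rollout. With enable_sleep_mode=False these calls are skipped entirely.
llm.sleep(level=1)
llm.wake_up()
```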
@@ -102,7 +102,7 @@ class FSDPVLLMShardingManager(BaseShardingManager):
def __exit__(self, exc_type, exc_value, traceback):
print_gpu_memory_usage("Before vllm offload in sharding manager")
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
-self.inference_engine.sleep(level=1)
+# self.inference_engine.sleep(level=1)
free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
self.freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
print_gpu_memory_usage("After vllm offload in sharding manager")
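With the `sleep(level=1)` call in `__exit__` commented out, the before/after `torch.cuda.mem_get_info()` readings barely differ, so `self.freed_bytes` will be close to zero; anything that logs or asserts on it should expect that. A gentler variant (hypothetical, not what this commit does) would gate the offload on a flag so the accounting still works on platforms where sleep mode is supported:

```python
# Hypothetical variant (not part of this commit): gate the offload on an
# assumed `use_sleep_mode` flag instead of commenting it out.
import torch

def offload_on_exit(self) -> None:
    free_before = torch.cuda.mem_get_info()[0]
    if getattr(self, "use_sleep_mode", False):
        # Offload weights and drop the KV cache only where sleep mode works.
        self.inference_engine.sleep(level=1)
    free_after = torch.cuda.mem_get_info()[0]
    # Close to zero when the offload is skipped.
    self.freed_bytes = free_after - free_before
```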