Commit 7e1d5e53 authored Feb 19, 2024 by zhuwenwen

merge v0.3.1

Parents: e3378b20, 5f08050d
Changes: 103 files changed in this commit. Showing 3 changed files on this page, with 164 additions and 66 deletions (+164, -66).
vllm/worker/cache_engine.py   (+2, -0)
vllm/worker/model_runner.py   (+108, -46)
vllm/worker/worker.py         (+54, -20)
vllm/worker/cache_engine.py

@@ -104,11 +104,13 @@ class CacheEngine:
                 size=(self.num_cpu_blocks, *key_block_shape),
                 dtype=self.dtype,
                 pin_memory=pin_memory,
+                device="cpu",
             )
             value_blocks = torch.empty(
                 size=(self.num_cpu_blocks, *value_block_shape),
                 dtype=self.dtype,
                 pin_memory=pin_memory,
+                device="cpu",
             )
             cpu_cache.append((key_blocks, value_blocks))
         return cpu_cache
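Editorial note: the added device="cpu" arguments simply make the placement of the swap blocks explicit. As a minimal sketch (not part of the commit; block shape and dtype are invented), this is why the CPU blocks are allocated with pin_memory: page-locked host memory lets the later swap copies run with non_blocking=True.

import torch

# Hypothetical KV block shape: (num_blocks, block_size, num_heads, head_size).
num_cpu_blocks, block_shape = 4, (16, 8, 64)
pin = torch.cuda.is_available()  # pinning requires a CUDA-enabled build

# Page-locked host memory can be copied to the GPU asynchronously,
# so swap-in / swap-out overlaps with compute.
cpu_block = torch.empty((num_cpu_blocks, *block_shape),
                        dtype=torch.float16,
                        pin_memory=pin,
                        device="cpu")

if torch.cuda.is_available():
    gpu_block = torch.empty_like(cpu_block, device="cuda")
    gpu_block.copy_(cpu_block, non_blocking=True)  # asynchronous H2D swap-in
    torch.cuda.synchronize()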
vllm/worker/model_runner.py

+import contextlib
 import time
 from typing import Dict, List, Optional, Tuple, Set, Union
@@ -5,11 +6,15 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig
+from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.logger import init_logger
 from vllm.model_executor import get_model, InputMetadata, SamplingMetadata
+from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.communication_op import (
     broadcast_tensor_dict)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    with_cupy_nccl_for_all_reduce)
+from vllm.model_executor.parallel_utils import custom_all_reduce
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
@@ -35,6 +40,7 @@ class ModelRunner:
         model_config: ModelConfig,
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
         lora_config: Optional[LoRAConfig],
         kv_cache_dtype: Optional[str] = "auto",
         is_driver_worker: bool = False,
@@ -49,7 +55,10 @@ class ModelRunner:
         # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
         self.sliding_window = (model_config.get_sliding_window()
                                if model_config is not None else None)
-        self.device = torch.device(torch.cuda.current_device())
+        self.device_config = (device_config
+                              if device_config is not None else DeviceConfig())
+        self.device = self.device_config.device
+
         self.model = None
         self.block_size = None  # Set after initial profiling.
         self.lora_manager = None
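Editorial note: the runner now prefers an explicit DeviceConfig and derives self.device from it instead of asking CUDA directly. A small standalone sketch of the same fallback pattern; _DeviceConfigSketch and resolve_device below are stand-ins for illustration, not vLLM's DeviceConfig.

import torch
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class _DeviceConfigSketch:
    # Default to CUDA when available, otherwise CPU.
    device: torch.device = field(default_factory=lambda: torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"))

def resolve_device(device_config: Optional[_DeviceConfigSketch]) -> torch.device:
    # Same fallback as the constructor above: use the caller's config when
    # given, otherwise build a default one and read its .device.
    cfg = device_config if device_config is not None else _DeviceConfigSketch()
    return cfg.device

print(resolve_device(None))
print(resolve_device(_DeviceConfigSketch(device=torch.device("cpu"))))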
@@ -72,16 +81,26 @@ class ModelRunner:
         self.kv_cache_dtype = kv_cache_dtype
 
     def load_model(self) -> None:
-        self.model = get_model(self.model_config, self.lora_config)
+        self.model = get_model(self.model_config, self.device_config,
+                               self.lora_config)
         vocab_size = self.model.config.vocab_size
 
         if self.lora_config:
+            assert (hasattr(self.model, "supported_lora_modules")
+                    and self.model.supported_lora_modules
+                    ), "Model does not support LoRA"
+            assert hasattr(self.model, "embedding_modules"
+                           ), "Model does not have embedding_modules"
+            assert hasattr(self.model, "embedding_padding_modules"
+                           ), "Model does not have embedding_padding_modules"
             self.lora_manager = LRUCacheWorkerLoRAManager(
                 self.scheduler_config.max_num_seqs,
                 self.scheduler_config.max_num_batched_tokens +
                 self.scheduler_config.max_paddings, vocab_size,
-                self.lora_config, self.device)
+                self.lora_config, self.device, self.model.embedding_modules,
+                self.model.embedding_padding_modules)
             self.model = self.lora_manager.create_lora_manager(self.model)
 
     def set_block_size(self, block_size: int) -> None:
@@ -142,10 +161,10 @@ class ModelRunner:
             if lora_id > 0:
                 lora_requests.add(seq_group_metadata.lora_request)
 
-            lora_index_mapping.append([lora_id] * prompt_len)
+            lora_index_mapping.append([lora_id] * (prompt_len - prefix_len))
             lora_prompt_mapping.extend(
                 [lora_id] *
-                (prompt_len
+                (prompt_len - prefix_len
                  if seq_group_metadata.sampling_params.prompt_logprobs else 1))
 
             if seq_group_metadata.block_tables is None:
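Editorial note: with a cached prefix, only the suffix of the prompt actually enters the forward pass, so the LoRA index mapping now covers prompt_len - prefix_len positions. A toy illustration (all numbers invented):

# 8-token prompt with a 3-token cached prefix: only 5 positions are
# recomputed, so only 5 LoRA indices are appended.
lora_id, prompt_len, prefix_len = 7, 8, 3

lora_index_mapping = []
lora_index_mapping.append([lora_id] * (prompt_len - prefix_len))
assert len(lora_index_mapping[0]) == 5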
@@ -182,22 +201,25 @@ class ModelRunner:
         input_tokens = _make_tensor_with_pad(input_tokens,
                                              max_prompt_len,
                                              pad=0,
-                                             dtype=torch.long)
+                                             dtype=torch.long,
+                                             device=self.device)
         input_positions = _make_tensor_with_pad(input_positions,
                                                 max_prompt_len,
                                                 pad=0,
-                                                dtype=torch.long)
+                                                dtype=torch.long,
+                                                device=self.device)
         slot_mapping = _make_tensor_with_pad(slot_mapping,
                                              max_prompt_len,
                                              pad=_PAD_SLOT_ID,
-                                             dtype=torch.long)
+                                             dtype=torch.long,
+                                             device=self.device)
         lora_index_mapping = [
             _pad_to_max(mapping, max_prompt_len, pad=0)
             for mapping in lora_index_mapping
         ]
         context_lens_tensor = torch.tensor(context_lens,
                                            dtype=torch.int,
-                                           device='cuda')
+                                           device=self.device)
         # Prepare prefix block tables
         max_prompt_block_table_len = max(len(t) for t in prefix_block_tables)
         block_tables = _make_tensor_with_pad(
@@ -205,15 +227,16 @@ class ModelRunner:
             max_len=max_prompt_block_table_len,
             pad=0,
             dtype=torch.int,
+            device=self.device,
         )
         start_loc_tensor = torch.arange(0,
                                         len(prompt_lens) * max_prompt_len,
                                         max_prompt_len,
                                         dtype=torch.long,
-                                        device='cuda')
+                                        device=self.device)
         prompt_lens_tensor = torch.tensor(prompt_lens,
                                           dtype=torch.long,
-                                          device='cuda')
+                                          device=self.device)
 
         input_metadata = InputMetadata(
             is_prompt=True,
@@ -305,20 +328,20 @@ class ModelRunner:
         input_tokens = _make_tensor_with_pad(input_tokens,
                                              max_len=1,
                                              pad=0,
                                              dtype=torch.long,
-                                             device="cuda")
+                                             device=self.device)
         input_positions = _make_tensor_with_pad(input_positions,
                                                 max_len=1,
                                                 pad=0,
                                                 dtype=torch.long,
-                                                device="cuda")
+                                                device=self.device)
         slot_mapping = _make_tensor_with_pad(slot_mapping,
                                              max_len=1,
                                              pad=_PAD_SLOT_ID,
                                              dtype=torch.long,
-                                             device="cuda")
+                                             device=self.device)
         context_lens = torch.tensor(context_lens,
                                     dtype=torch.int,
-                                    device="cuda")
+                                    device=self.device)
 
         if use_captured_graph:
             # The shape of graph_block_tables is
@@ -327,7 +350,7 @@ class ModelRunner:
             for i, block_table in enumerate(block_tables):
                 if block_table:
                     input_block_tables[i, :len(block_table)] = block_table
-            block_tables = torch.tensor(input_block_tables, device="cuda")
+            block_tables = torch.tensor(input_block_tables, device=self.device)
         else:
             max_block_table_len = max(
                 len(block_table) for block_table in block_tables)
@@ -336,7 +359,7 @@ class ModelRunner:
                 max_len=max_block_table_len,
                 pad=0,
                 dtype=torch.int,
-                device="cuda",
+                device=self.device,
             )
         lora_index_mapping = [
@@ -355,7 +378,8 @@ class ModelRunner:
             use_cuda_graph=use_captured_graph,
             kv_cache_dtype=self.kv_cache_dtype,
         )
-        return input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping, lora_requests
+        return (input_tokens, input_positions, input_metadata,
+                lora_index_mapping, lora_prompt_mapping, lora_requests)
 
     def _prepare_sample(
         self,
@@ -410,9 +434,13 @@ class ModelRunner:
         selected_token_indices = _async_h2d(selected_token_indices,
                                             dtype=torch.long,
+                                            target_device=self.device,
                                             pin_memory=not self.in_wsl)
         categorized_sample_indices = {
-            t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl)
+            t: _async_h2d(seq_ids,
+                          dtype=torch.int,
+                          target_device=self.device,
+                          pin_memory=not self.in_wsl)
             for t, seq_ids in categorized_sample_indices.items()
         }
@@ -511,7 +539,8 @@ class ModelRunner:
             perform_sampling=False,
         )
 
-        return input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping
+        return (input_tokens, input_positions, input_metadata,
+                sampling_metadata, lora_requests, lora_mapping)
 
     @torch.inference_mode()
     def execute_model(
@@ -519,8 +548,9 @@ class ModelRunner:
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
         kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
     ) -> Optional[SamplerOutput]:
-        input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping = (
-            self.prepare_input_tensors(seq_group_metadata_list))
+        (input_tokens, input_positions, input_metadata, sampling_metadata,
+         lora_requests, lora_mapping
+         ) = self.prepare_input_tensors(seq_group_metadata_list)
 
         if self.lora_config:
             self.set_active_loras(lora_requests, lora_mapping)
@@ -628,6 +658,10 @@ class ModelRunner:
     @torch.inference_mode()
     def capture_model(self, kv_caches: List[KVCache]) -> None:
+        # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never
+        # deleted before the CUDA graphs.
+        self.cupy_nccl_backend = cupy_utils.get_nccl_backend()
+
         assert not self.model_config.enforce_eager
         logger.info("Capturing the model for CUDA graphs. This may lead to "
                     "unexpected consequences if the model is not static. To "
@@ -656,9 +690,15 @@ class ModelRunner:
             bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size
         ]
 
-        # NOTE: Capturing the largest batch size first may help reduce the
-        # memory usage of CUDA graph.
+        # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce
+        # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use
+        # either custom all-reduce kernel or CuPy NCCL. When not using CUDA
+        # graph, we use either custom all-reduce kernel or PyTorch NCCL.
+        # We always prioritize using custom all-reduce kernel but fall back
+        # to PyTorch or CuPy NCCL if it is disabled or not supported.
+        with custom_all_reduce.capture():
+            # NOTE: Capturing the largest batch size first may help reduce the
+            # memory usage of CUDA graph.
             for batch_size in reversed(batch_size_capture_list):
                 # Create dummy input_metadata.
                 input_metadata = InputMetadata(
@@ -697,6 +737,14 @@ class ModelRunner:
         # This usually takes < 10 seconds.
         logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")
 
+    def __del__(self) -> None:
+        # Delete the CUDA graphs before deleting the CuPy NCCL communicator.
+        # NOTE(woosuk): This is necessary because otherwise deadlocks can
+        # happen.
+        # FIXME(woosuk): This is a bit hacky. Find a more robust solution.
+        self.graph_runners.clear()
+        self.cupy_nccl_backend = None
+
 
 class CUDAGraphRunner:
@@ -718,18 +766,8 @@ class CUDAGraphRunner:
         # Run the model once without capturing the graph.
         # This is to make sure that the captured graph does not include the
         # kernel launches for initial benchmarking (e.g., Triton autotune).
-        self.model(
-            input_ids,
-            positions,
-            kv_caches,
-            input_metadata,
-        )
-        torch.cuda.synchronize()
-
-        # Capture the graph.
-        self.graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(self.graph, pool=memory_pool):
-            hidden_states = self.model(
+        with _maybe_cupy_nccl():
+            self.model(
+                input_ids,
+                positions,
+                kv_caches,
@@ -737,6 +775,20 @@ class CUDAGraphRunner:
             )
         torch.cuda.synchronize()
 
+        # Capture the graph.
+        # NOTE(woosuk): Python 3.8 does not support multi-line with statements.
+        # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
+        self.graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(self.graph, pool=memory_pool):  # noqa: SIM117
+            with _maybe_cupy_nccl():
+                hidden_states = self.model(
+                    input_ids,
+                    positions,
+                    kv_caches,
+                    input_metadata,
+                )
+        torch.cuda.synchronize()
 
         # Save the input and output buffers.
         self.input_buffers = {
             "input_ids": input_ids,
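Editorial note: for reference, a self-contained sketch of the capture pattern used above: warm up once so benchmarking kernels are not recorded, capture into a torch.cuda.CUDAGraph using nested single-line with blocks (the Python 3.8-friendly form the comment refers to), then replay. The toy Linear model and the _noop_ctx() stand-in for _maybe_cupy_nccl() are assumptions for illustration, and the snippet only does useful work on a CUDA machine.

import contextlib
import torch

@contextlib.contextmanager
def _noop_ctx():
    # Stand-in for _maybe_cupy_nccl(); does nothing here.
    yield

if torch.cuda.is_available():
    model = torch.nn.Linear(16, 16).cuda()
    x = torch.randn(8, 16, device="cuda")

    # Warm-up run outside the graph so one-time kernel launches are not captured.
    with _noop_ctx():
        model(x)
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):  # noqa: SIM117
        with _noop_ctx():
            y = model(x)
    torch.cuda.synchronize()

    # Replay: refill the static input buffer, then launch the whole graph.
    x.copy_(torch.randn(8, 16, device="cuda"))
    graph.replay()
    print(y.shape)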
@@ -779,6 +831,15 @@ class CUDAGraphRunner:
         return self.forward(*args, **kwargs)
 
 
+@contextlib.contextmanager
+def _maybe_cupy_nccl():
+    if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized():
+        with with_cupy_nccl_for_all_reduce():
+            yield
+    else:
+        yield
+
+
 def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
     assert len(x) <= max_len
     return x + [pad] * (max_len - len(x))
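Editorial note: _maybe_cupy_nccl() enters with_cupy_nccl_for_all_reduce() only when CuPy NCCL is initialized and the custom all-reduce kernel is not. The same conditional-context pattern in a standalone, dependency-free form; tracked() and _maybe() are made-up names for illustration only.

import contextlib

entered = []

@contextlib.contextmanager
def tracked():
    # Records whether the optional context is currently active.
    entered.append(True)
    yield
    entered.pop()

@contextlib.contextmanager
def _maybe(enabled: bool):
    # Enter tracked() only when `enabled`, mirroring how _maybe_cupy_nccl()
    # conditionally wraps the all-reduce context.
    if enabled:
        with tracked():
            yield
    else:
        yield

with _maybe(enabled=True):
    assert entered        # optional context is active
with _maybe(enabled=False):
    assert not entered    # optional context was skipped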
@@ -789,14 +850,10 @@ def _make_tensor_with_pad(
     max_len: int,
     pad: int,
     dtype: torch.dtype,
-    device: Union[str, torch.device] = "cuda",
-    pin_memory: bool = False,
+    device: Optional[Union[str, torch.device]],
 ) -> torch.Tensor:
     padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x]
-    return torch.tensor(padded_x,
-                        dtype=dtype,
-                        device=device,
-                        pin_memory=pin_memory and str(device) == "cpu")
+    return torch.tensor(padded_x, dtype=dtype, device=device)
 
 
 def _get_graph_batch_size(batch_size: int) -> int:
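Editorial note: the helper now takes the target device explicitly instead of defaulting to "cuda" and no longer pins memory itself. A runnable standalone restatement plus a usage example; make_tensor_with_pad here is just a renamed copy for illustration, the committed version lives in this file.

import torch
from typing import List, Optional, Union

def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
    assert len(x) <= max_len
    return x + [pad] * (max_len - len(x))

def make_tensor_with_pad(x: List[List[int]], max_len: int, pad: int,
                         dtype: torch.dtype,
                         device: Optional[Union[str, torch.device]]
                         ) -> torch.Tensor:
    # Pad each ragged row to max_len, then build one tensor on the target device.
    padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x]
    return torch.tensor(padded_x, dtype=dtype, device=device)

device = "cuda" if torch.cuda.is_available() else "cpu"
tokens = [[11, 12, 13], [21], [31, 32]]
print(make_tensor_with_pad(tokens, max_len=4, pad=0, dtype=torch.long,
                           device=device))
# Rows become [11, 12, 13, 0], [21, 0, 0, 0], [31, 32, 0, 0].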
@@ -808,6 +865,11 @@ def _get_graph_batch_size(batch_size: int) -> int:
     return (batch_size + 7) // 8 * 8
 
 
-def _async_h2d(data: list, dtype, pin_memory):
-    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory)
-    return t.to(device="cuda", non_blocking=True)
+def _async_h2d(
+    data: list,
+    dtype: torch.dtype,
+    target_device: Union[str, torch.device],
+    pin_memory: bool,
+) -> torch.Tensor:
+    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
+    return t.to(device=target_device, non_blocking=True)
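Editorial note: _async_h2d now stages the data in (optionally pinned) CPU memory and copies it to an explicit target device. A standalone restatement with a usage example that falls back to CPU when no GPU is present; async_h2d is a renamed copy for illustration, not the committed symbol.

import torch
from typing import Union

def async_h2d(data: list, dtype: torch.dtype,
              target_device: Union[str, torch.device],
              pin_memory: bool) -> torch.Tensor:
    # Pinned host staging plus non_blocking=True is what makes the
    # host-to-device copy asynchronous.
    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
    return t.to(device=target_device, non_blocking=True)

if torch.cuda.is_available():
    idx = async_h2d([0, 2, 5, 9], dtype=torch.long,
                    target_device="cuda", pin_memory=True)
else:
    # Degenerates to a plain CPU tensor when no GPU is present.
    idx = async_h2d([0, 2, 5, 9], dtype=torch.long,
                    target_device="cpu", pin_memory=False)
print(idx)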
vllm/worker/worker.py

@@ -6,9 +6,10 @@ from typing import Dict, List, Tuple, Set, Optional
 import torch
 import torch.distributed
 
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, LoRAConfig)
 from vllm.model_executor import set_random_seed
+from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.communication_op import (
     broadcast_tensor_dict)
 from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
@@ -18,6 +19,7 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip
 
 
 class Worker:
@@ -33,6 +35,7 @@ class Worker:
         model_config: ModelConfig,
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
         local_rank: int,
         rank: int,
         distributed_init_method: str,
@@ -43,6 +46,7 @@ class Worker:
         self.model_config = model_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
+        self.device_config = device_config
         self.local_rank = local_rank
         self.rank = rank
         self.distributed_init_method = distributed_init_method
@@ -54,6 +58,7 @@ class Worker:
         self.model_runner = ModelRunner(model_config,
                                         parallel_config,
                                         scheduler_config,
+                                        device_config,
                                         lora_config=self.lora_config,
                                         kv_cache_dtype=kv_cache_dtype,
                                         is_driver_worker=is_driver_worker)
@@ -64,25 +69,30 @@ class Worker:
         self.cache_events = None
         self.gpu_cache = None
 
-    def init_model(self) -> None:
-        # torch.distributed.all_reduce does not free the input tensor until
-        # the synchronization point. This causes the memory usage to grow
-        # as the number of all_reduce calls increases. This env var disables
-        # this behavior.
-        # Related issue:
-        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
-        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-
-        # This env var set by Ray causes exceptions with graph building.
-        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
-        self.device = torch.device(f"cuda:{self.local_rank}")
-        torch.cuda.set_device(self.device)
-
-        _check_if_gpu_supports_dtype(self.model_config.dtype)
+    def init_model(self, cupy_port: Optional[int] = None) -> None:
+        if self.device_config.device.type == "cuda":
+            # torch.distributed.all_reduce does not free the input tensor until
+            # the synchronization point. This causes the memory usage to grow
+            # as the number of all_reduce calls increases. This env var disables
+            # this behavior.
+            # Related issue:
+            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+            # This env var set by Ray causes exceptions with graph building.
+            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            self.device = torch.device(f"cuda:{self.local_rank}")
+            torch.cuda.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+            torch.cuda.empty_cache()
+            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+        else:
+            raise RuntimeError(
+                f"Not support device type: {self.device_config.device}")
         # Initialize the distributed environment.
         init_distributed_environment(self.parallel_config, self.rank,
-                                     self.distributed_init_method)
+                                     cupy_port, self.distributed_init_method)
 
         if not self.parallel_config.disable_custom_all_reduce:
             init_custom_ar()
         # Initialize the model.
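Editorial note: init_model() now runs the CUDA-specific setup only when the configured device type is "cuda". A compressed sketch of that per-rank setup as a free function; setup_rank_device is a made-up name and the real logic is the method in the hunk above.

import os
import torch

def setup_rank_device(local_rank: int) -> torch.device:
    # Keep all_reduce inputs from being held by the allocator until the next
    # synchronization point (see the issue linked in the diff).
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
    # Ray sets this variable, and it interferes with CUDA graph building.
    os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)

    if not torch.cuda.is_available():
        raise RuntimeError("Not support device type: cpu")
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    return device

# Example (one process per GPU): device = setup_rank_device(local_rank=0)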
@@ -119,7 +129,9 @@ class Worker:
         # profiled peak memory.
         torch.cuda.synchronize()
         free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
-        peak_memory = total_gpu_memory - free_gpu_memory
+        # NOTE(woosuk): Here we assume that the other processes using the same
+        # GPU did not change their memory usage during the profiling.
+        peak_memory = self.init_gpu_memory - free_gpu_memory
 
         cache_block_size = CacheEngine.get_cache_block_size(
             block_size, cache_dtype, self.model_config, self.parallel_config)
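Editorial note: the baseline for peak memory changed from the card's total memory to the free memory recorded in init_model(), so memory held by other processes at startup is no longer billed to this worker. A toy calculation; every number below, including the per-block size and the block-count formula, is invented for illustration.

# Toy numbers in MiB. Another process already held memory before profiling
# started, which the old formula wrongly attributed to this worker.
total_gpu_memory = 81920   # 80 GiB card
init_gpu_memory = 71680    # free memory recorded in init_model()
free_gpu_memory = 40960    # free memory after the profiling run

old_peak = total_gpu_memory - free_gpu_memory   # 40960: includes the other process
new_peak = init_gpu_memory - free_gpu_memory    # 30720: only this worker's usage

gpu_memory_utilization = 0.9
cache_block_size = 2       # made-up per-block footprint, MiB
num_gpu_blocks = int(
    (total_gpu_memory * gpu_memory_utilization - new_peak) // cache_block_size)
print(old_peak, new_peak, num_gpu_blocks)   # 40960 30720 21504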
@@ -227,6 +239,7 @@ class Worker:
 def init_distributed_environment(
     parallel_config: ParallelConfig,
     rank: int,
+    cupy_port: Optional[int],
     distributed_init_method: Optional[str] = None,
 ) -> None:
     """Initialize the distributed environment."""
@@ -249,8 +262,29 @@ def init_distributed_environment(
             init_method=distributed_init_method,
         )
 
+    if cupy_utils.is_initialized():
+        cupy_world_size = cupy_utils.get_world_size()
+        if cupy_world_size != parallel_config.world_size:
+            raise RuntimeError(
+                "cupy.distributed is already initialized but the cupy world "
+                "size does not match parallel_config.world_size "
+                f"({cupy_world_size} vs. {parallel_config.world_size}).")
+    elif (parallel_config.world_size > 1 and cupy_port is not None
+          and not is_hip()):
+        # NOTE(woosuk): We don't initialize CuPy process group when world size
+        # is 1.
+        # TODO(woosuk): Support multi-node connection.
+        cupy_utils.init_process_group(
+            world_size=parallel_config.world_size,
+            rank=rank,
+            host="localhost",
+            port=cupy_port,
+        )
+
     # A small all_reduce for warmup.
     torch.distributed.all_reduce(torch.zeros(1).cuda())
+    if cupy_utils.is_initialized():
+        cupy_utils.all_reduce(torch.zeros(1).cuda())
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                       parallel_config.pipeline_parallel_size)
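Editorial note: after torch.distributed is up, the function still performs a small warm-up all-reduce, now for the CuPy group as well when it exists. A single-process sketch of just the warm-up step, using the gloo backend so it runs without GPUs; the address, port, and world size are arbitrary example values, and real deployments use NCCL across multiple ranks.

import torch
import torch.distributed as dist

if not dist.is_initialized():
    dist.init_process_group(backend="gloo",
                            init_method="tcp://127.0.0.1:29555",
                            world_size=1,
                            rank=0)

# Analogue of the world-size consistency check in the hunk above.
assert dist.get_world_size() == 1

warmup = torch.zeros(1)
dist.all_reduce(warmup)  # on CUDA builds vLLM warms up with .cuda() tensors
dist.destroy_process_group()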