OpenDAS / ColossalAI / Commits / 8823cc48

Unverified commit 8823cc48, authored Jan 29, 2024 by Frank Lee, committed by GitHub on Jan 29, 2024.

Merge pull request #5310 from hpcaitech/feature/npu

Feature/npu

Parents: bce9499e, 73f4dc57

Changes: 266 files in total; this page shows 20 changed files with 83 additions and 55 deletions (+83 −55).
examples/language/gpt/experiments/auto_offload/train_gpt_offload.py   +2 −2
examples/language/gpt/gemini/train_gpt_demo.py                        +6 −2
examples/language/gpt/hybridparallelism/finetune.py                   +6 −4
examples/language/gpt/titans/model/embed.py                           +9 −5
examples/language/llama2/benchmark.py                                 +6 −5
examples/language/llama2/data_utils.py                                +4 −2
examples/language/llama2/finetune.py                                  +4 −2
examples/language/llama2/performance_evaluator.py                     +4 −5
examples/language/llama2/pretrain.py                                  +4 −2
examples/language/openmoe/benchmark/benchmark_cai.py                  +6 −4
examples/language/openmoe/model/modeling_openmoe.py                   +1 −1
examples/language/openmoe/train.py                                    +4 −2
examples/language/palm/train.py                                       +6 −2
examples/tutorial/new_api/cifar_resnet/train.py                       +3 −3
examples/tutorial/new_api/cifar_vit/train.py                          +3 −3
examples/tutorial/new_api/glue_bert/finetune.py                       +2 −2
examples/tutorial/opt/opt/run_clm.py                                  +10 −6
examples/tutorial/sequence_parallel/model/bert.py                     +1 −1
examples/tutorial/sequence_parallel/model/layers/head.py              +1 −1
examples/tutorial/sequence_parallel/train.py                          +1 −1
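The recurring change across these files replaces the CUDA-only device helper get_current_device from colossalai.utils with the backend-agnostic accelerator API that the feature/npu branch introduces, so the same example scripts can target CUDA GPUs or Ascend NPUs. A minimal sketch of the pattern, assuming only the import paths that appear in this diff:

    # before: CUDA-specific helper
    # from colossalai.utils import get_current_device
    # device = get_current_device()

    # after: resolve the device through the active accelerator backend
    from colossalai.accelerator import get_accelerator

    device = get_accelerator().get_current_device()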
examples/language/gpt/experiments/auto_offload/train_gpt_offload.py

@@ -7,13 +7,13 @@ from model_zoo import GPTLMLoss, get_gpt2_components
 from torch.utils._pytree import tree_map

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.auto_parallel.offload.amp_optimizer import AMPOptimizer
 from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
 from colossalai.auto_parallel.offload.solver import NOT_NVML
 from colossalai.fx.profiler import parameter_size
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import spawn
-from colossalai.utils import get_current_device


 def parse_args():

@@ -41,7 +41,7 @@ def train_gpt(args):
             64,
             8,
         ),
-        device=get_current_device(),
+        device=get_accelerator().get_current_device(),
     )
     criterion = GPTLMLoss()
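Scripts that must run against both older and newer ColossalAI releases could guard the new import; this shim is hypothetical and not part of the commit:

    # hypothetical compatibility shim, not part of this commit
    try:
        from colossalai.accelerator import get_accelerator

        def current_device():
            return get_accelerator().get_current_device()

    except ImportError:  # older ColossalAI without the colossalai.accelerator module
        from colossalai.utils import get_current_device as current_device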
examples/language/gpt/gemini/train_gpt_demo.py

@@ -12,12 +12,12 @@ from commons.utils import get_data, get_profile_context, get_tflops, get_time_st
 from packaging import version

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 CAI_VERSION = colossalai.__version__

@@ -141,7 +141,11 @@ def main():
     criterion = GPTLMLoss()
     torch.manual_seed(123)
     if args.distplan.startswith("CAI"):
-        ctx = LazyInitContext(default_device=get_current_device()) if args.distplan == "CAI_Gemini" else nullcontext()
+        ctx = (
+            LazyInitContext(default_device=get_accelerator().get_current_device())
+            if args.distplan == "CAI_Gemini"
+            else nullcontext()
+        )
         # build GPT model
         with ctx:
             model = model_builder(args.model_type)(checkpoint=True)
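The reflowed ctx expression above chooses lazy initialization only for the Gemini plugin. A self-contained sketch of the same choice, with use_gemini standing in as a hypothetical flag for the script's distplan check:

    from contextlib import nullcontext

    from colossalai.accelerator import get_accelerator
    from colossalai.lazy import LazyInitContext

    use_gemini = True  # hypothetical stand-in for args.distplan == "CAI_Gemini"
    # lazy init defers parameter materialization to the accelerator device;
    # other plugins build the model eagerly under a no-op context
    ctx = (
        LazyInitContext(default_device=get_accelerator().get_current_device())
        if use_gemini
        else nullcontext()
    )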
examples/language/gpt/hybridparallelism/finetune.py

@@ -13,11 +13,11 @@ from tqdm import tqdm
 from transformers import AutoConfig, GPT2ForSequenceClassification, get_linear_schedule_with_warmup

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Prepare Hyperparameters

@@ -54,7 +54,7 @@ def evaluate_model(
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
     is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
-    accum_loss = torch.zeros(1, device=get_current_device())
+    accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
     for batch in dataloader:
         batch = move_to_cuda(batch)
         labels = batch["labels"]

@@ -83,8 +83,10 @@ def evaluate_model(
             object_list = [None, None]
             dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group)

-            metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels)
-            accum_loss.add_(object_list[1].to(get_current_device()))
+            metric.add_batch(
+                predictions=object_list[0].to(get_accelerator().get_current_device()), references=labels
+            )
+            accum_loss.add_(object_list[1].to(get_accelerator().get_current_device()))
         else:
             batch = move_to_cuda(batch)
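The evaluation loop keeps its running loss on the accelerator device so the later pipeline broadcast and metric update stay on-device. A minimal sketch of that accumulation pattern, with made-up loss values:

    import torch

    from colossalai.accelerator import get_accelerator

    device = get_accelerator().get_current_device()
    accum_loss = torch.zeros(1, device=device)
    for batch_loss in (0.7, 0.5, 0.4):  # stand-in values for per-batch losses
        accum_loss.add_(torch.tensor([batch_loss], device=device))
    mean_loss = (accum_loss / 3).item()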
examples/language/gpt/titans/model/embed.py

@@ -5,6 +5,7 @@ from torch import nn as nn
 from torch.nn import functional as F
 from torch.nn.parameter import Parameter

+from colossalai.accelerator import get_accelerator
 from colossalai.legacy.context import ParallelMode, seed
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.base_layer import ParallelLayer

@@ -12,7 +13,6 @@ from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_b
 from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
 from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES
-from colossalai.utils import get_current_device


 class VocabParallelEmbedding(torch.nn.Module):

@@ -96,7 +96,9 @@ class VocabParallelEmbedding(torch.nn.Module):
         if position_ids is not None:
             position_ids = position_ids.view(-1, input_shape[-1])
         if position_ids is None:
-            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = torch.arange(
+                0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
         position_embeddings = self.position_embeddings(position_ids)

@@ -194,7 +196,7 @@ class VocabParallelEmbedding1D(torch.nn.Module):
         self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index

         # Allocate weights and initialize.
-        factory_kwargs = {"device": get_current_device(), "dtype": dtype}
+        factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
         self.weight = Parameter(torch.empty(self.num_embeddings_per_partition, self.embedding_dim, **factory_kwargs))
         init.uniform_(self.weight, -1, 1)

@@ -439,7 +441,9 @@ class HiddenParallelEmbedding(torch.nn.Module):
         if position_ids is not None:
             position_ids = position_ids.view(-1, input_shape[-1])
         if position_ids is None:
-            position_ids = torch.arange(0, input_shape[-1] + 0, dtype=torch.long, device=get_current_device())
+            position_ids = torch.arange(
+                0, input_shape[-1] + 0, dtype=torch.long, device=get_accelerator().get_current_device()
+            )
             position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
         position_embeddings = self.position_embeddings(position_ids)

@@ -532,7 +536,7 @@ class HiddenParallelEmbedding1D(torch.nn.Module):
         self._weight = None

         # Allocate weights and initialize.
-        factory_kwargs = {"device": get_current_device(), "dtype": dtype}
+        factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": dtype}
         self.weight = Parameter(torch.empty(num_embeddings, embed_dim_per_partition, **factory_kwargs))
         init.uniform_(self.weight, -1, 1)
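embed.py routes parameter allocation through factory_kwargs so each partition's weight is created directly on the accelerator. A short sketch of the same idiom, with illustrative sizes in place of the per-partition vocab dimensions the file actually uses:

    import torch
    from torch.nn import Parameter, init

    from colossalai.accelerator import get_accelerator

    # sizes are illustrative; embed.py derives them from the vocab partition
    factory_kwargs = {"device": get_accelerator().get_current_device(), "dtype": torch.float32}
    weight = Parameter(torch.empty(1024, 768, **factory_kwargs))
    init.uniform_(weight, -1, 1)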
examples/language/llama2/benchmark.py

@@ -13,13 +13,12 @@ from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaForCausalLM

 import colossalai
-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Constants

@@ -177,7 +176,7 @@ def main():
     # Initialize Model and Optimizer
     # ==============================
     init_ctx = (
-        LazyInitContext(default_device=get_current_device())
+        LazyInitContext(default_device=get_accelerator().get_current_device())
         if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
         else nullcontext()
    )

@@ -208,7 +207,9 @@ def main():
         torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
     torch.set_default_dtype(torch.float)
-    coordinator.print_on_master(f"Booster init max CUDA memory: {device_utils.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated() / 1024 ** 2:.2f} MB"
+    )
     coordinator.print_on_master(
         f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
     )

@@ -234,7 +235,7 @@ def main():
         performance_evaluator.on_step_end(**batch)

     performance_evaluator.on_fit_end()
-    coordinator.print_on_master(f"Max CUDA memory usage: {device_utils.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated() / 1024 ** 2:.2f} MB")


 if __name__ == "__main__":
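benchmark.py now reads peak memory from the accelerator instead of the CUDA-only device_utils module. A small helper in the same spirit, using only the max_memory_allocated call shown above (the tag argument is illustrative):

    from colossalai.accelerator import get_accelerator

    def print_peak_memory(tag: str) -> None:
        # peak allocated bytes on the active backend, reported in MB
        peak_mb = get_accelerator().max_memory_allocated() / 1024**2
        print(f"{tag}: {peak_mb:.2f} MB")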
examples/language/llama2/data_utils.py

@@ -8,7 +8,7 @@ from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.utils.data import DataLoader, Dataset, DistributedSampler

-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator


 class StatefulDistributedSampler(DistributedSampler):

@@ -108,7 +108,9 @@ class RandomDataset(Dataset):
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
         self.attention_mask = torch.ones_like(self.input_ids)

     def __len__(self):
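data_utils.py builds its synthetic batches directly on the accelerator device. A trimmed-down sketch of the RandomDataset idea, with smaller assumed defaults:

    import torch
    from torch.utils.data import Dataset

    from colossalai.accelerator import get_accelerator

    class TinyRandomDataset(Dataset):
        # synthetic token ids created on the accelerator, mirroring RandomDataset
        def __init__(self, num_samples: int = 8, max_length: int = 16, vocab_size: int = 32000):
            device = get_accelerator().get_current_device()
            self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=device)
            self.attention_mask = torch.ones_like(self.input_ids)

        def __len__(self):
            return self.input_ids.shape[0]

        def __getitem__(self, idx):
            return {"input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx]}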
examples/language/llama2/finetune.py

@@ -21,13 +21,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
 from transformers.models.llama.tokenization_llama import LlamaTokenizer

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device


 def get_model_numel(model: nn.Module) -> int:

@@ -191,7 +191,9 @@ def main():
     config = LlamaConfig.from_pretrained(args.model_path)
     # use lazy init when using GeminiPlugin
     init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, GeminiPlugin)
+        else nullcontext()
    )

    with init_ctx:
examples/language/llama2/performance_evaluator.py

@@ -5,9 +5,8 @@ import torch
 import torch.distributed as dist
 from torch import Tensor

-import colossalai.utils.device as device_utils
+from colossalai.accelerator import get_accelerator
 from colossalai.cluster import DistCoordinator
-from colossalai.utils.device import get_current_device


 def divide(x: float, y: float) -> float:

@@ -22,7 +21,7 @@ def divide(x: float, y: float) -> float:
 def all_reduce_mean(x: float, world_size: int) -> float:
     if world_size == 1:
         return x
-    tensor = torch.tensor([x], device=get_current_device())
+    tensor = torch.tensor([x], device=get_accelerator().get_current_device())
     dist.all_reduce(tensor)
     tensor = tensor / world_size
     return tensor.item()

@@ -86,13 +85,13 @@ class PerformanceEvaluator:
         self.disable = self.ignore_steps > 0 and step < self.ignore_steps
         if self.disable:
             return
-        device_utils.synchronize()
+        get_accelerator().synchronize()
         self.timer.start()

     def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
         if self.disable:
             return
-        device_utils.synchronize()
+        get_accelerator().synchronize()
         self.timer.end()
         batch_size, seq_len = input_ids.shape
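The evaluator brackets each step with get_accelerator().synchronize() so the timer measures completed device work rather than queued kernels. A minimal timing helper built on the same call:

    import time

    from colossalai.accelerator import get_accelerator

    def timed(fn, *args, **kwargs):
        # flush pending kernels before and after so wall-clock time is accurate
        get_accelerator().synchronize()
        start = time.time()
        out = fn(*args, **kwargs)
        get_accelerator().synchronize()
        return out, time.time() - start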
examples/language/llama2/pretrain.py

@@ -20,13 +20,13 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM
 from transformers.models.llama.tokenization_llama import LlamaTokenizer

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 MODEL_CONFIGS = {
     "7b": LlamaConfig(max_position_embeddings=4096),

@@ -227,7 +227,9 @@ def main():
     config = MODEL_CONFIGS[args.config]
     # use lazy init when using GeminiPlugin
     init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, GeminiPlugin) else nullcontext()
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, GeminiPlugin)
+        else nullcontext()
    )

    with init_ctx:
examples/language/openmoe/benchmark/benchmark_cai.py

@@ -14,6 +14,7 @@ from transformers.models.llama import LlamaConfig
 from utils import PerformanceEvaluator, get_model_numel

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator

@@ -21,7 +22,6 @@ from colossalai.moe.layers import apply_load_balance
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device


 def move_to_cuda(batch, device):

@@ -64,13 +64,15 @@ class RandomDataset(Dataset):
                 )
                 self.input_ids.append(encode["input_ids"])
                 self.attention_mask.append(encode["attention_mask"])
-            self.input_ids = torch.cat(self.input_ids, dim=0).to(get_current_device())
-            self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_current_device())
+            self.input_ids = torch.cat(self.input_ids, dim=0).to(get_accelerator().get_current_device())
+            self.attention_mask = torch.cat(self.attention_mask, dim=0).to(get_accelerator().get_current_device())
             repeat_times = num_samples // self.input_ids.shape[0] + 1
             self.input_ids = self.input_ids.repeat(repeat_times, 1)[:num_samples]
             self.attention_mask = self.attention_mask.repeat(repeat_times, 1)[:num_samples]
         else:
-            self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+            self.input_ids = torch.randint(
+                0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+            )
             self.attention_mask = torch.ones_like(self.input_ids)

     def __len__(self):
examples/language/openmoe/model/modeling_openmoe.py

@@ -35,7 +35,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )

-from colossalai.kernel.cuda_native.mha.flash_attn_2 import HAS_FLASH_ATTN
+from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN
 from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
 from colossalai.moe.layers import SparseMLP
 from colossalai.moe.manager import MOE_MANAGER
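The relocated module still exposes the HAS_FLASH_ATTN availability flag, so model code can keep gating the fused attention path on it. A sketch of that gating, assuming only the flag shown in this hunk:

    from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN

    def pick_attention_impl() -> str:
        # fall back to the eager implementation when flash attention is unavailable
        return "flash" if HAS_FLASH_ATTN else "eager"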
examples/language/openmoe/train.py

@@ -15,6 +15,7 @@ from transformers import T5Tokenizer
 from transformers.models.llama import LlamaConfig

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator

@@ -22,7 +23,6 @@ from colossalai.moe.layers import apply_load_balance
 from colossalai.moe.manager import MOE_MANAGER
 from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device


 def move_to_cuda(batch, device):

@@ -61,7 +61,9 @@ class RandomDataset(Dataset):
     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000, tokenizer=None):
         self.num_samples = num_samples
         self.max_length = max_length
-        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
         self.attention_mask = torch.ones_like(self.input_ids)

     def __len__(self):
examples/language/palm/train.py

@@ -14,12 +14,12 @@ from palm_pytorch.autoregressive_wrapper import AutoregressiveWrapper
 from torch.utils.data import DataLoader, Dataset

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn import HybridAdam
-from colossalai.utils import get_current_device

 # constants

@@ -159,7 +159,11 @@ if args.distplan == "colossalai":
     logger.info(f"plugin: {plugin}")
     booster = Booster(plugin=plugin, **booster_kwargs)

-    ctx = LazyInitContext(default_device=get_current_device()) if args.plugin == "gemini" else nullcontext()
+    ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if args.plugin == "gemini"
+        else nullcontext()
+    )

     with ctx:
         model = PaLM(num_tokens=50304, dim=4096, depth=64)
examples/tutorial/new_api/cifar_resnet/train.py

@@ -13,12 +13,12 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Prepare Hyperparameters

@@ -53,8 +53,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
 @torch.no_grad()
 def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
     model.eval()
-    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
-    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
     for images, labels in test_dataloader:
         images = images.cuda()
         labels = labels.cuda()
examples/tutorial/new_api/cifar_vit/train.py

@@ -13,13 +13,13 @@ from torch.utils.data import DataLoader
 from tqdm import tqdm

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.lr_scheduler import LinearWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Prepare Hyperparameters

@@ -73,8 +73,8 @@ def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPl
 @torch.no_grad()
 def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
     model.eval()
-    correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
-    total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
+    correct = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
+    total = torch.zeros(1, dtype=torch.int64, device=get_accelerator().get_current_device())
     for images, labels in test_dataloader:
         images = images.cuda()
         labels = labels.cuda()
examples/tutorial/new_api/glue_bert/finetune.py

@@ -12,11 +12,11 @@ from tqdm import tqdm
 from transformers import AutoConfig, BertForSequenceClassification, get_linear_schedule_with_warmup

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device

 # ==============================
 # Prepare Hyperparameters

@@ -45,7 +45,7 @@ def evaluate(
     model.eval()

     def evaluate_subset(dataloader: DataLoader):
-        accum_loss = torch.zeros(1, device=get_current_device())
+        accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
         for batch in dataloader:
             batch = move_to_cuda(batch)
             outputs = model(**batch)
examples/tutorial/opt/opt/run_clm.py

@@ -51,13 +51,13 @@ from transformers import (
 from transformers.utils.versions import require_version

 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.legacy.context import ParallelMode
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.tensor import ProcessGroup
 from colossalai.legacy.utils import get_dataloader
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
 from colossalai.zero import GeminiOptimizer

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -249,9 +249,9 @@ def parse_args():
 def colo_memory_cap(size_in_GB):
-    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction

-    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
     if size_in_GB * (1024**3) < cuda_capacity:
         colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
         print("Using {} GB of GPU memory".format(size_in_GB))

@@ -265,7 +265,9 @@ class DummyDataloader:
         self.vocab_size = vocab_size

     def generate(self):
-        input_ids = torch.randint(0, self.vocab_size, (self.batch_size, self.seq_len), device=get_current_device())
+        input_ids = torch.randint(
+            0, self.vocab_size, (self.batch_size, self.seq_len), device=get_accelerator().get_current_device()
+        )
         attention_mask = torch.ones_like(input_ids)
         return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

@@ -390,7 +392,7 @@ def main():
     if args.init_in_cpu:
         init_dev = torch.device("cpu")
     else:
-        init_dev = get_current_device()
+        init_dev = get_accelerator().get_current_device()

     cai_version = colossalai.__version__
     logger.info(f"using Colossal-AI version {cai_version}")

@@ -439,7 +441,9 @@ def main():
         except ImportError:
             # this works for unreleased main branch, and this may be released on 0.2.9
             from colossalai.zero import GeminiDDP

-        model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
+        model = GeminiDDP(
+            model, device=get_accelerator().get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True
+        )
     elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
         from colossalai.gemini import ChunkManager, GeminiManager
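colo_memory_cap keeps its capacity helpers from colossalai.utils but now resolves the device through the accelerator. A standalone version of the same logic, using only the calls retained in this hunk:

    from colossalai.accelerator import get_accelerator
    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction

    def cap_memory(size_in_gb: float) -> None:
        # clamp this process to size_in_gb if the device has more memory than that
        capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
        if size_in_gb * (1024**3) < capacity:
            colo_set_process_memory_fraction(size_in_gb * (1024**3) / capacity)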
examples/tutorial/sequence_parallel/model/bert.py

@@ -3,13 +3,13 @@ import inspect
 import torch
 import torch.nn as nn

-from colossalai.kernel import LayerNorm
 from colossalai.legacy.context import ParallelMode
 from colossalai.legacy.context.parallel_mode import ParallelMode
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.legacy.pipeline.utils import partition_uniform
 from colossalai.logging import get_dist_logger
+from colossalai.nn.layer.layernorm import MixedFusedLayerNorm as LayerNorm

 from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding
 from .layers.init_method import init_normal, output_init_normal
examples/tutorial/sequence_parallel/model/layers/head.py

@@ -3,9 +3,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 from loss_func.cross_entropy import vocab_cross_entropy

-from colossalai.kernel import LayerNorm
 from colossalai.legacy.context import ParallelMode
 from colossalai.legacy.core import global_context as gpc
+from colossalai.nn.layer.layernorm import MixedFusedLayerNorm as LayerNorm

 from .linear import Linear
 from .pooler import Pooler
examples/tutorial/sequence_parallel/train.py

@@ -8,12 +8,12 @@ from lr_scheduler import AnnealingLR
 from model.bert import BertForPretrain, build_pipeline_bert

 import colossalai
-from colossalai.kernel import LayerNorm
 from colossalai.legacy.amp import AMP_TYPE
 from colossalai.legacy.context.parallel_mode import ParallelMode
 from colossalai.legacy.core import global_context as gpc
 from colossalai.legacy.utils import is_using_pp
 from colossalai.logging import get_dist_logger
+from colossalai.nn.layer.layernorm import MixedFusedLayerNorm as LayerNorm
 from colossalai.nn.optimizer import FusedAdam
 from colossalai.utils import MultiTimer
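The three sequence-parallel files swap their LayerNorm import from colossalai.kernel to the fused implementation under colossalai.nn.layer.layernorm, keeping the LayerNorm alias so call sites stay untouched. A usage sketch under assumptions: the hidden size is illustrative, and the fused kernel is assumed to target an accelerator device:

    import torch

    from colossalai.nn.layer.layernorm import MixedFusedLayerNorm as LayerNorm

    hidden_size = 1024  # assumed; the tutorial reads this from its config
    norm = LayerNorm(hidden_size).cuda()  # fused kernel runs on the accelerator
    x = torch.randn(2, 8, hidden_size, device="cuda")
    y = norm(x)  # same call signature as the previous LayerNorm import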