Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
b0474878
Unverified
Commit
b0474878
authored
May 26, 2023
by
jiangmingyan
Committed by
GitHub
May 26, 2023
Browse files
[doc] update nvme offload documents. (#3850)
parent
ae959a72
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
13 deletions
+28
-13
docs/source/en/features/nvme_offload.md
docs/source/en/features/nvme_offload.md
+13
-6
docs/source/zh-Hans/features/nvme_offload.md
docs/source/zh-Hans/features/nvme_offload.md
+15
-7
No files found.
docs/source/en/features/nvme_offload.md
View file @
b0474878
...
@@ -78,8 +78,9 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
...
@@ -78,8 +78,9 @@ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
import
colossalai
import
colossalai
from
colossalai.nn.optimizer
import
HybridAdam
from
colossalai.nn.optimizer
import
HybridAdam
from
colossalai.zero
import
zero_model_wrapper
,
zero_optim_wrapper
from
colossalai.utils.model.colo_init_context
import
ColoInitContext
from
colossalai.utils.model.colo_init_context
import
ColoInitContext
from
colossalai.booster
import
Booster
from
colossalai.booster.plugin
import
GeminiPlugin
```
```
Then we define a loss function:
Then we define a loss function:
...
@@ -192,17 +193,23 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
...
@@ -192,17 +193,23 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
optimizer
=
HybridAdam
(
model
.
parameters
(),
nvme_offload_fraction
=
nvme_offload_fraction
)
optimizer
=
HybridAdam
(
model
.
parameters
(),
nvme_offload_fraction
=
nvme_offload_fraction
)
print
(
f
'Model numel:
{
get_model_numel
(
model
)
/
1024
**
3
:.
3
f
}
B'
)
print
(
f
'Model numel:
{
get_model_numel
(
model
)
/
1024
**
3
:.
3
f
}
B'
)
gemini_config
=
dict
(
strict_ddp_mode
=
True
,
device
=
torch
.
cuda
.
current_device
(),
plugin
=
GeminiPlugin
(
placement_policy
=
'cpu'
,
pin_memory
=
True
,
hidden_dim
=
config
.
n_embd
)
strict_ddp_mode
=
True
,
model
=
zero_model_wrapper
(
model
,
zero_stage
=
3
,
gemini_config
=
gemini_config
)
device
=
torch
.
cuda
.
current_device
(),
optimizer
=
zero_optim_wrapper
(
model
,
optimizer
,
initial_scale
=
2
**
5
)
placement_policy
=
'cpu'
,
pin_memory
=
True
,
hidden_dim
=
config
.
n_embd
,
initial_scale
=
2
**
5
)
booster
=
Booster
(
plugin
)
model
,
optimizer
,
criterion
,
*_
=
booster
.
boost
(
model
,
optimizer
,
criterion
)
start
=
time
.
time
()
start
=
time
.
time
()
for
step
in
range
(
3
):
for
step
in
range
(
3
):
data
=
get_data
(
4
,
128
,
config
.
vocab_size
)
data
=
get_data
(
4
,
128
,
config
.
vocab_size
)
outputs
=
model
(
**
data
)
outputs
=
model
(
**
data
)
loss
=
criterion
(
outputs
.
logits
,
data
[
'input_ids'
])
loss
=
criterion
(
outputs
.
logits
,
data
[
'input_ids'
])
optimizer
.
backward
(
loss
)
booster
.
backward
(
loss
,
optimizer
)
optimizer
.
step
()
optimizer
.
step
()
optimizer
.
zero_grad
()
optimizer
.
zero_grad
()
print
(
f
'[
{
step
}
] loss:
{
loss
.
item
():.
3
f
}
'
)
print
(
f
'[
{
step
}
] loss:
{
loss
.
item
():.
3
f
}
'
)
...
...
docs/source/zh-Hans/features/nvme_offload.md
View file @
b0474878
...
@@ -55,7 +55,6 @@ optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, n
...
@@ -55,7 +55,6 @@ optimizer = HybridAdam(model.parameters(), lr=1e-3, nvme_offload_fraction=1.0, n
## Examples
## Examples
Let's start from two simple examples -- training GPT with different methods. These examples rely on
`transformers`
.
首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖
`transformers`
。
首先让我们从两个简单的例子开始 -- 用不同的方法训练 GPT。这些例子依赖
`transformers`
。
我们首先应该安装依赖:
我们首先应该安装依赖:
...
@@ -77,8 +76,9 @@ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
...
@@ -77,8 +76,9 @@ from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from
transformers.models.gpt2.modeling_gpt2
import
GPT2LMHeadModel
from
transformers.models.gpt2.modeling_gpt2
import
GPT2LMHeadModel
import
colossalai
import
colossalai
from
colossalai.nn.optimizer
import
HybridAdam
from
colossalai.nn.optimizer
import
HybridAdam
from
colossalai.zero
import
zero_model_wrapper
,
zero_optim_wrapper
from
colossalai.utils.model.colo_init_context
import
ColoInitContext
from
colossalai.utils.model.colo_init_context
import
ColoInitContext
from
colossalai.booster
import
Booster
from
colossalai.booster.plugin
import
GeminiPlugin
```
```
然后我们定义一个损失函数:
然后我们定义一个损失函数:
...
@@ -182,16 +182,24 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
...
@@ -182,16 +182,24 @@ def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
criterion
=
GPTLMLoss
()
criterion
=
GPTLMLoss
()
optimizer
=
HybridAdam
(
model
.
parameters
(),
nvme_offload_fraction
=
nvme_offload_fraction
)
optimizer
=
HybridAdam
(
model
.
parameters
(),
nvme_offload_fraction
=
nvme_offload_fraction
)
print
(
f
'Model numel:
{
get_model_numel
(
model
)
/
1024
**
3
:.
3
f
}
B'
)
print
(
f
'Model numel:
{
get_model_numel
(
model
)
/
1024
**
3
:.
3
f
}
B'
)
gemini_config
=
dict
(
strict_ddp_mode
=
True
,
device
=
torch
.
cuda
.
current_device
(),
placement_policy
=
'cpu'
,
pin_memory
=
True
,
hidden_dim
=
config
.
n_embd
)
plugin
=
GeminiPlugin
(
model
=
zero_model_wrapper
(
model
,
zero_stage
=
3
,
gemini_config
=
gemini_config
)
strict_ddp_mode
=
True
,
optimizer
=
zero_optim_wrapper
(
model
,
optimizer
,
initial_scale
=
2
**
5
)
device
=
torch
.
cuda
.
current_device
(),
placement_policy
=
'cpu'
,
pin_memory
=
True
,
hidden_dim
=
config
.
n_embd
,
initial_scale
=
2
**
5
)
booster
=
Booster
(
plugin
)
model
,
optimizer
,
criterion
,
*_
=
booster
.
boost
(
model
,
optimizer
,
criterion
)
start
=
time
.
time
()
start
=
time
.
time
()
for
step
in
range
(
3
):
for
step
in
range
(
3
):
data
=
get_data
(
4
,
128
,
config
.
vocab_size
)
data
=
get_data
(
4
,
128
,
config
.
vocab_size
)
outputs
=
model
(
**
data
)
outputs
=
model
(
**
data
)
loss
=
criterion
(
outputs
.
logits
,
data
[
'input_ids'
])
loss
=
criterion
(
outputs
.
logits
,
data
[
'input_ids'
])
optimizer
.
backward
(
loss
)
booster
.
backward
(
loss
,
optimizer
)
optimizer
.
step
()
optimizer
.
step
()
optimizer
.
zero_grad
()
optimizer
.
zero_grad
()
print
(
f
'[
{
step
}
] loss:
{
loss
.
item
():.
3
f
}
'
)
print
(
f
'[
{
step
}
] loss:
{
loss
.
item
():.
3
f
}
'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment