Unverified commit e2fbe4d2, authored by Cheng Li and committed by GitHub

squash latest flops profiling changes (#1) (#664)


Co-authored-by: Cheng Li <pistasable@gmail.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent adcfd269
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from deepspeed.runtime.config_utils import get_scalar_param
from deepspeed.profiling.constants import *


class DeepSpeedFlopsProfilerConfig(object):
    def __init__(self, param_dict):
        """Parses the flops profiler section of a DeepSpeed config dictionary."""
        super(DeepSpeedFlopsProfilerConfig, self).__init__()

        self.enabled = None
        self.start_step = None
        self.end_step = None
        self.module_depth = None
        self.top_modules = None

        if FLOPS_PROFILER in param_dict.keys():
            flops_profiler_dict = param_dict[FLOPS_PROFILER]
        else:
            flops_profiler_dict = {}

        self._initialize(flops_profiler_dict)

    def _initialize(self, flops_profiler_dict):
        """Reads each profiler option, falling back to its default when absent."""
        self.enabled = get_scalar_param(flops_profiler_dict,
                                        FLOPS_PROFILER_ENABLED,
                                        FLOPS_PROFILER_ENABLED_DEFAULT)
        self.start_step = get_scalar_param(flops_profiler_dict,
                                           FLOPS_PROFILER_START_STEP,
                                           FLOPS_PROFILER_START_STEP_DEFAULT)
        self.end_step = get_scalar_param(flops_profiler_dict,
                                         FLOPS_PROFILER_END_STEP,
                                         FLOPS_PROFILER_END_STEP_DEFAULT)
        self.module_depth = get_scalar_param(flops_profiler_dict,
                                             FLOPS_PROFILER_MODULE_DEPTH,
                                             FLOPS_PROFILER_MODULE_DEPTH_DEFAULT)
        self.top_modules = get_scalar_param(flops_profiler_dict,
                                            FLOPS_PROFILER_TOP_MODULES,
                                            FLOPS_PROFILER_TOP_MODULES_DEFAULT)
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
#########################################
# flops profiler
#########################################
# Flops profiler. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
FLOPS_PROFILER_FORMAT = '''
flops profiler should be enabled as:
"session_params": {
  "flops_profiler": {
    "enabled": [true|false],
    "start_step": 5,
    "end_step": 6,
    "module_depth": -1,
    "top_modules": 3
  }
}
'''
FLOPS_PROFILER = "flops_profiler"
FLOPS_PROFILER_ENABLED = "enabled"
FLOPS_PROFILER_ENABLED_DEFAULT = False
FLOPS_PROFILER_START_STEP = "start_step"
FLOPS_PROFILER_START_STEP_DEFAULT = 5
FLOPS_PROFILER_END_STEP = "end_step"
FLOPS_PROFILER_END_STEP_DEFAULT = FLOPS_PROFILER_START_STEP_DEFAULT + 1
FLOPS_PROFILER_MODULE_DEPTH = "module_depth"
FLOPS_PROFILER_MODULE_DEPTH_DEFAULT = -1
FLOPS_PROFILER_TOP_MODULES = "top_modules"
FLOPS_PROFILER_TOP_MODULES_DEFAULT = 3
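As a small illustration of how the config class and these defaults interact, the sketch below parses an explicit `flops_profiler` section and an empty one; the `param_dict` contents here are hypothetical example values, not part of the original code.

```python
# Sketch: how DeepSpeedFlopsProfilerConfig resolves explicit values vs. defaults.
from deepspeed.profiling.config import DeepSpeedFlopsProfilerConfig

param_dict = {"flops_profiler": {"enabled": True, "start_step": 2, "end_step": 3}}
cfg = DeepSpeedFlopsProfilerConfig(param_dict)
print(cfg.enabled, cfg.start_step, cfg.end_step)  # True 2 3
print(cfg.module_depth, cfg.top_modules)          # -1 3 (defaults)

# With no "flops_profiler" section at all, every field falls back to its *_DEFAULT constant.
print(DeepSpeedFlopsProfilerConfig({}).enabled)   # False
```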
# flops-profiler
> Measures the time, estimated flops, and parameters of each module in a PyTorch model.
The flops-profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. It shows how time, flops, and parameters are spent across the model and which modules or layers are the likely bottleneck. It also lists the top k modules by aggregated time, flops, and parameters at model depth l, with k and l specified by the user. The output profile is computed for each batch of input. If the user specifies multiple forward passes to capture (because the model has different execution paths, or for more accurate timing), the profile is averaged over those batches.
The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch), with the major difference that the flops-profiler captures `torch.nn.functional` calls invoked inside a module to estimate the flops, and therefore supports customized modules in the model (e.g. `ParallelTransformerLayer`, `ParallelSelfAttention`, `RowParallelLinear`, etc. in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)). The flops-profiler also supports flops computation at the module level (for RNNs).
For models running on multiple nodes or GPUs, only model parallelism changes the per-rank flops and parameter counts (e.g. `--model-parallel-size` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)), i.e. `model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The number of GPUs or nodes does not otherwise affect the output profile.
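As a concrete illustration of that accounting, here is a short sketch; the numbers are made up purely for the example.

```python
# Hypothetical numbers, only to illustrate the model-parallel accounting above.
model_parallel_size = 4          # e.g. Megatron-LM --model-parallel-size
per_rank_macs = 5.4e9            # MACs the profiler reports on a single rank
per_rank_params = 27.5e6         # parameters the profiler reports on a single rank

total_macs = model_parallel_size * per_rank_macs      # 21.6e9 MACs for the whole model
total_params = model_parallel_size * per_rank_params  # 110e6 parameters for the whole model

# Adding data-parallel GPUs or nodes replicates the same work, so it does not
# change the per-rank profile.
```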
Below is an example output for LeNet5 with batch size 1024 on a V100 GPU:
```
LeNet5(
61.71 k, 100.00% Params, 439.55 MMACs, 100.00% MACs, 25.62 ms, 100.00% time, 0.034 TFLOPS,
(feature_extractor): Sequential(
50.69 k, 82.15% Params, 428.37 MMACs, 97.46% MACs, 18.41 ms, 71.85% time, 0.047 TFLOPS,
(0): Conv2d(156, 0.25% Params, 125.24 MMACs, 28.49% MACs, 10.56 ms, 41.21% time, 0.024 TFLOPS, 1, 6, kernel_size=(5, 5), stride=(1, 1))
(1): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 2.25 ms, 8.79% time, 0.0 TFLOPS, )
(2): AvgPool2d(0, 0.00% Params, 4.82 MMACs, 1.10% MACs, 2.47 ms, 9.63% time, 0.0039 TFLOPS, kernel_size=2, stride=2, padding=0)
(3): Conv2d(2.42 k, 3.92% Params, 247.4 MMACs, 56.28% MACs, 1.08 ms, 4.23% time, 0.46 TFLOPS, 6, 16, kernel_size=(5, 5), stride=(1, 1))
(4): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 497.39 us, 1.94% time, 0.0 TFLOPS, )
(5): AvgPool2d(0, 0.00% Params, 1.64 MMACs, 0.37% MACs, 758.24 us, 2.96% time, 0.0043 TFLOPS, kernel_size=2, stride=2, padding=0)
(6): Conv2d(48.12 k, 77.98% Params, 49.27 MMACs, 11.21% MACs, 606.35 us, 2.37% time, 0.16 TFLOPS, 16, 120, kernel_size=(5, 5), stride=(1, 1))
(7): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 68.86 us, 0.27% time, 0.0 TFLOPS, )
)
(classifier): Sequential(
11.01 k, 17.85% Params, 11.18 MMACs, 2.54% MACs, 7.03 ms, 27.43% time, 0.0032 TFLOPS,
(0): Linear(10.16 k, 16.47% Params, 10.32 MMACs, 2.35% MACs, 2.71 ms, 10.57% time, 0.0076 TFLOPS, in_features=120, out_features=84, bias=True)
(1): Tanh(0, 0.00% Params, 0.0 MACs, 0.00% MACs, 78.77 us, 0.31% time, 0.0 TFLOPS, )
(2): Linear(850, 1.38% Params, 860.16 KMACs, 0.20% MACs, 4.17 ms, 16.27% time, 0.00041 TFLOPS, in_features=84, out_features=10, bias=True)
)
)
Top 3 modules in flops at depth 2 are {'Conv2d': '421.91 MMACs', 'Linear': '11.18 MMACs', 'AvgPool2d': '6.46 MMACs'}
Top 3 modules in params at depth 2 are {'Conv2d': '50.69 k', 'Linear': '11.01 k', 'Tanh': '0'}
Top 3 modules in time at depth 2 are {'Conv2d': '12.25 ms', 'Linear': '6.88 ms', 'AvgPool2d': '3.23 ms'}
Batch size: 1024
Number of multiply-adds: 439.55 MMACs
Number of parameters: 61.71 k
Number of steps profiled: 10
```
## Installation
The profiler ships as part of DeepSpeed and is installed with it:
```
pip install deepspeed
```
Refer to the [installation instructions for DeepSpeed](https://www.deepspeed.ai/getting-started/#installation) for more information.
## Usage
### With the DeepSpeed runtime
If you are using DeepSpeed for model training, no explicit API calls are needed to use the flops-profiler.
In the DeepSpeed config file, specify:
```python
ds_config = {
    ...  # other DeepSpeed configs
"flops_profiler": {
"enabled": True,
"start_step": 2,
"end_step": 3,
"module_depth": -1,
"top_modules": 3,
},
}
```
- `"enabled": true` to enable the flops-profiler.
- `"start_step": 5` to start the profiler at step 5. Note that warm-up is necessary for getting accurate timing information.
- `"end_step": 6` to end the profiler at step 6. Note that `end_step > start_step`.
- `"module_depth": -1` to print aggregated module information at the maximum depth (innermost modules). Can be set to any positive number, caped by the maximum depth of the model.
- `"top_modules": 3`to set the number of top modules to print aggregated profile
An example is given in [test_flops_profiler](tests/unit/test_flops_profiler.py).
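For orientation, here is a minimal sketch of how the config above is wired into a training run; `model`, `args`, and `data_loader` are placeholders, and `args.deepspeed_config` is assumed to point at a JSON file containing the configuration shown above.

```python
import deepspeed

# Minimal sketch (placeholder model/args/data): with "flops_profiler" enabled in
# the DeepSpeed config, the engine profiles the forward pass between start_step
# and end_step and prints the results on rank 0; no profiler calls are needed in
# the training loop itself.
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args,  # args.deepspeed_config points to a JSON file with the config above
    model=model,
    model_parameters=model.parameters())

for step, batch in enumerate(data_loader):
    loss = model_engine(batch)       # forward pass; profiled at the configured steps
    model_engine.backward(loss)
    model_engine.step()
```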
### Without the DeepSpeed runtime
The flops-profiler can be used as a standalone package outside of the deepspeed runtime.
#### Use the low-level APIs to profile the forward pass in the existing model training workflow
- `start_profile` - starts profiling.
- `get_total_flops` - returns the total number of flops.
- `get_total_params` - returns the total number of parameters.
- `get_total_duration` - returns the total duration of the profiled forward passes.
- `get_total_steps` - returns the total number of steps (input batches) profiled.
- `print_model_profile` - prints the model graph with the measured profile annotated on each module.
- `print_model_aggregated_profile` - prints the aggregated profile for the top modules.
- `end_profile` - ends profiling and cleans up; invoked at the end of the profiling, after any printing method.
`flops_to_string`, `params_to_string`, and `duration_to_string` are utility functions that convert the raw metric numbers into human-readable strings.
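A quick sketch of those helpers is shown below; it assumes they are exported from the same module as `FlopsProfiler` and accept a plain number, and the printed formatting is only indicative.

```python
# Sketch (assumes these helpers live alongside FlopsProfiler and take plain numbers).
from deepspeed.profiling.flops_profiler.profiler import (flops_to_string,
                                                         params_to_string,
                                                         duration_to_string)

print(flops_to_string(439.55e6))    # roughly "439.55 MMACs"-style formatting
print(params_to_string(61.71e3))    # roughly "61.71 k"
print(duration_to_string(0.02562))  # roughly "25.62 ms"
```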
Below is an example of this usage in a typical training workflow.
```python
from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler

model = Model()
profiler = FlopsProfiler(model)

start_step = 5
end_step = 10
assert end_step > start_step, "end_step must be larger than start_step"
print_profile = True
print_aggregated_profile = True

for step, batch in enumerate(data_loader):
    # start profiling at training step "start_step"
    if step == start_step:
        profiler.start_profile()

    # end profiling and print the output at training step "end_step"
    if step == end_step:  # if using multiple nodes, also check global_rank == 0
        flops = profiler.get_total_flops()
        params = profiler.get_total_params()
        duration = profiler.get_total_duration()
        steps = profiler.get_total_steps()
        if print_profile:
            profiler.print_model_profile()
        if print_aggregated_profile:
            profiler.print_model_aggregated_profile(module_depth=-1, top_modules=3)
        profiler.end_profile()
        print(flops, params, duration, steps)

    # forward pass
    loss = model(batch)
    # backward pass
    loss.backward()
    # weight update
    optimizer.step()
```
#### Use the high-level API and run the model inference for profiling purposes
Examples of this usage are given below.
##### Classification model example:
```python
import argparse
import sys
import torch
import torchvision.models as models
from deepspeed.profiling.flops_profiler import get_model_profile
pt_models = {
    'resnet18': models.resnet18,
    'resnet50': models.resnet50,
    'alexnet': models.alexnet,
    'vgg16': models.vgg16,
    'squeezenet': models.squeezenet1_0,
    'densenet': models.densenet161,
    'inception': models.inception_v3
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='flops-profiler example script')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='Device to store the model.')
    parser.add_argument('--model',
                        choices=list(pt_models.keys()),
                        type=str,
                        default='resnet18')
    args = parser.parse_args()

    model = pt_models[args.model]()

    if torch.cuda.is_available():
        model.cuda(device=args.device)

    batch_size = 256
    macs, params, steps = get_model_profile(model,  # the PyTorch model to be profiled
                                            input_res=(batch_size, 3, 224, 224),  # input shape, or the input to input_constructor
                                            input_constructor=None,  # if specified, it is applied to input_res and its output is used as the model input
                                            print_profile=True,  # whether to print the model graph with the profile annotated. Defaults to True
                                            print_aggregated_profile=True,  # whether to print the aggregated profile for the top modules. Defaults to True
                                            module_depth=-1,  # the depth into the nested modules. Defaults to -1 (the innermost modules)
                                            top_modules=3,  # the number of top modules to include in the aggregated profile
                                            warm_up=10,  # the number of warm-up steps before measuring the time of each module. Defaults to 5
                                            num_steps=10,  # the number of steps to profile. Defaults to 10
                                            as_strings=True,  # whether to format the returned values as strings (e.g. "1 k"). Defaults to True
                                            ignore_modules=None)  # a list of modules to ignore during profiling. Defaults to None

    print("{:<30} {:<8}".format("Batch size: ", batch_size))
    print('{:<30} {:<8}'.format('Number of MACs: ', macs))
    print('{:<30} {:<8}'.format('Number of parameters: ', params))
    print('{:<30} {:<8}'.format('Number of steps profiled: ', steps))

    # Output:
    # Number of MACs:               466.48 GMACs
    # Number of parameters:         11.69 M
```
##### Bert model example:
```python
from functools import partial
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from deepspeed.profiling.flops_profiler import get_model_profile
def bert_input_constructor(input_shape, tokenizer):
    inp_seq = ""
    for _ in range(input_shape[1] - 2):  # two positions are taken by the special tokens [CLS] and [SEP]
        inp_seq += tokenizer.pad_token  # use the pad token to form a fake
        # sequence for the subsequent flops calculation

    inputs = tokenizer([inp_seq] * input_shape[0],
                       padding=True,
                       truncation=True,
                       return_tensors="pt")
    labels = torch.tensor([1] * input_shape[0])
    # batch size is input_shape[0], sequence length is input_shape[1]
    inputs = dict(inputs)
    inputs.update({"labels": labels})
    return inputs


if __name__ == '__main__':
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    macs, params, steps = get_model_profile(
        model,
        (2, 128),
        input_constructor=partial(bert_input_constructor,
                                  tokenizer=bert_tokenizer),
        print_profile=True,
        print_aggregated_profile=True,
    )
    print("{:<30} {:<8}".format("Number of multiply-adds: ", macs))
    print("{:<30} {:<8}".format("Number of parameters: ", params))
    print("{:<30} {:<8}".format("Number of steps profiled: ", steps))
    # Output:
    # Number of multiply-adds:      21.74 GMACs
    # Number of parameters:         109.48 M
```
@@ -22,6 +22,8 @@ from ..elasticity.config import ElasticityConfigError
 from ..elasticity.constants import ELASTICITY, IGNORE_NON_ELASTIC_BATCH_INFO, \
     IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT
+from ..profiling.config import DeepSpeedFlopsProfilerConfig
+
 TENSOR_CORE_ALIGN_SIZE = 8

 ADAM_OPTIMIZER = 'adam'
@@ -613,6 +615,7 @@ class DeepSpeedConfig(object):
         self.scheduler_params = get_scheduler_params(param_dict)

         self.wall_clock_breakdown = get_wall_clock_breakdown(param_dict)
+        self.flops_profiler_config = DeepSpeedFlopsProfilerConfig(param_dict)
         self.memory_breakdown = get_memory_breakdown(param_dict)
         self.tensorboard_enabled = get_tensorboard_enabled(param_dict)
         self.tensorboard_output_path = get_tensorboard_output_path(param_dict)
...
@@ -39,6 +39,8 @@ from ..ops.op_builder import UtilsBuilder
 from ..ops.adam import DeepSpeedCPUAdam
 from ..ops.adam import FusedAdam
+from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler
+
 MEMORY_OPT_ALLREDUCE_SIZE = 500000000

 try:
@@ -265,6 +267,21 @@ class DeepSpeedEngine(Module):
     def wall_clock_breakdown(self):
         return self._config.wall_clock_breakdown

+    def flops_profiler_enabled(self):
+        return self._config.flops_profiler_config.enabled
+
+    def flops_profiler_start_step(self):
+        return self._config.flops_profiler_config.start_step
+
+    def flops_profiler_end_step(self):
+        return self._config.flops_profiler_config.end_step
+
+    def flops_profiler_module_depth(self):
+        return self._config.flops_profiler_config.module_depth
+
+    def flops_profiler_top_modules(self):
+        return self._config.flops_profiler_config.top_modules
+
     def memory_breakdown(self):
         return self._config.memory_breakdown
@@ -764,6 +781,30 @@
         *inputs: Variable length input list
         **kwargs: variable length keyword arguments
         """
+        if self.flops_profiler_enabled(
+        ) and self.global_steps == self.flops_profiler_start_step(
+        ) and self.global_rank == 0:
+            self.flops_profiler = FlopsProfiler(self.module)
+            self.flops_profiler.start_profile(ignore_list=None)
+
+        if self.flops_profiler_enabled(
+        ) and self.global_steps == self.flops_profiler_end_step(
+        ) and self.global_rank == 0:
+            print('{:<30} {:<8}'.format(
+                'Number of multiply-adds: ',
+                self.flops_profiler.get_total_flops(in_str=False)))
+            print('{:<30} {:<8}'.format(
+                'Number of parameters: ',
+                self.flops_profiler.get_total_params(in_str=False)))
+            print('{:<30} {:<8}'.format('Number of steps profiled: ',
+                                         self.flops_profiler.get_total_steps()))
+            self.flops_profiler.print_model_profile()
+            self.flops_profiler.print_model_aggregated_profile(
+                module_depth=self.flops_profiler_module_depth(),
+                top_modules=self.flops_profiler_top_modules())
+            self.flops_profiler.flops = self.flops_profiler.get_total_flops()
+            self.flops_profiler.params = self.flops_profiler.get_total_params()
+            self.flops_profiler.end_profile()
         if self.module.training and self.progressive_layer_drop:
             kwargs.update(self.progressive_layer_drop.get_state())
...
import torch
import deepspeed
import deepspeed.runtime.utils as ds_utils
from deepspeed.profiling.flops_profiler import FlopsProfiler, get_model_profile
from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict
from common import distributed_test
def test_flops_profiler_in_ds_training(tmpdir):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.001,
            }
        },
        "zero_optimization": {
            "stage": 0
        },
        "fp16": {
            "enabled": True,
        },
        "flops_profiler": {
            "enabled": True,
            "start_step": 2,
            "end_step": 3,
            "module_depth": -1,
            "top_modules": 3,
        },
    }
    args = args_from_dict(tmpdir, config_dict)
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)

    @distributed_test(world_size=[1])
    def _test_flops_profiler_in_ds_training(args, model, hidden_dim):
        model, _, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
        data_loader = random_dataloader(model=model,
                                        total_samples=50,
                                        hidden_dim=hidden_dim,
                                        device=model.device,
                                        dtype=torch.half)
        for n, batch in enumerate(data_loader):
            loss = model(batch[0], batch[1])
            model.backward(loss)
            model.step()
            if n == 3:
                break
        assert model.flops_profiler.flops == 100
        assert model.flops_profiler.params == 110

    _test_flops_profiler_in_ds_training(args, model, hidden_dim)
class LeNet5(torch.nn.Module):
    def __init__(self, n_classes):
        super(LeNet5, self).__init__()
        self.feature_extractor = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1,
                            out_channels=6,
                            kernel_size=5,
                            stride=1),
            torch.nn.Tanh(),
            torch.nn.AvgPool2d(kernel_size=2),
            torch.nn.Conv2d(in_channels=6,
                            out_channels=16,
                            kernel_size=5,
                            stride=1),
            torch.nn.Tanh(),
            torch.nn.AvgPool2d(kernel_size=2),
            torch.nn.Conv2d(in_channels=16,
                            out_channels=120,
                            kernel_size=5,
                            stride=1),
            torch.nn.Tanh(),
        )
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(in_features=120,
                            out_features=84),
            torch.nn.Tanh(),
            torch.nn.Linear(in_features=84,
                            out_features=n_classes),
        )

    def forward(self, x):
        x = self.feature_extractor(x)
        x = torch.flatten(x, 1)
        logits = self.classifier(x)
        probs = torch.nn.functional.softmax(logits, dim=1)
        return logits, probs
def test_flops_profiler_in_inference():
    mod = LeNet5(10)
    batch_size = 1024
    input = torch.randn(batch_size, 1, 32, 32)
    macs, params, steps = get_model_profile(
        mod,
        tuple(input.shape),
        print_profile=True,
        print_aggregated_profile=True,
        module_depth=-1,
        top_modules=3,
        warm_up=5,
        num_steps=10,
        as_strings=True,
        ignore_modules=None,
    )
    print(macs, params, steps)
    assert macs == "439.55 MMACs"
    assert params == "61.71 k"