"git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "3ab521fa2b26b6dbfdb87d36739f0e4191a500a1"
Unverified Commit 001abe23 authored by Jeff Rasley, committed by GitHub

Refactor simple model test, fix pythonpath issue (#96)

Also a fix for #94 
parent f2d75135
@@ -128,6 +128,10 @@ def _add_core_arguments(parser):
                        type=str,
                        help='DeepSpeed json configuration file.')
+    group.add_argument('--deepscale_config',
+                       default=None,
+                       type=str,
+                       help='Deprecated DeepSpeed json configuration file.')
     return parser
@@ -322,6 +322,14 @@ class DeepSpeedLight(Module):
     # Validate command line arguments
     def _do_args_sanity_check(self, args):
+        if hasattr(args, 'deepscale_config') and args.deepscale_config is not None:
+            logging.warning(
+                "************ --deepscale_config is deprecated, please use --deepspeed_config ************"
+            )
+            if hasattr(args, 'deepspeed_config'):
+                assert args.deepspeed_config is None, "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
+            args.deepspeed_config = args.deepscale_config
+
         assert hasattr(args, 'local_rank') and type(args.local_rank) == int, \
             'DeepSpeed requires integer command line parameter --local_rank'
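In isolation, the deprecation shim above behaves like the following minimal sketch; the helper name map_deprecated_config and the bare argparse.Namespace are illustrative assumptions, not DeepSpeed API:

import argparse
import logging

def map_deprecated_config(args):
    # Same logic as the sanity check: fall back to the deprecated flag,
    # but refuse to guess when both flags were supplied.
    if getattr(args, 'deepscale_config', None) is not None:
        logging.warning("--deepscale_config is deprecated, please use --deepspeed_config")
        assert getattr(args, 'deepspeed_config', None) is None, \
            "given both a deepscale_config and a deepspeed_config"
        args.deepspeed_config = args.deepscale_config
    return args

# Only the deprecated flag is set, so it is copied over.
ns = argparse.Namespace(deepscale_config='config.json', deepspeed_config=None)
assert map_deprecated_config(ns).deepspeed_config == 'config.json'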
@@ -306,7 +306,10 @@ def main(args=None):
         num_gpus_per_node = None

     curr_path = os.path.abspath('.')
-    env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
+    if 'PYTHONPATH' in env:
+        env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
+    else:
+        env['PYTHONPATH'] = curr_path

     exports = ""
     for var in env.keys():
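The guard matters because indexing a missing key in an environment mapping raises KeyError; a minimal sketch of the failure mode and the fixed pattern (working on a copy of os.environ is only for the demo):

import os

env = dict(os.environ)           # copy so the demo doesn't touch the real environment
env.pop('PYTHONPATH', None)      # simulate a shell where PYTHONPATH is unset

curr_path = os.path.abspath('.')
# Unguarded version: raises KeyError when PYTHONPATH is absent.
#     env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
# Guarded version: degrades to just the current directory.
if 'PYTHONPATH' in env:
    env['PYTHONPATH'] = curr_path + ":" + env['PYTHONPATH']
else:
    env['PYTHONPATH'] = curr_path
print(env['PYTHONPATH'])         # -> absolute path of the current directory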
@@ -109,16 +109,11 @@ if [ "$third_party_install" == "1" ]; then
     sudo -H pip install third_party/apex/dist/apex*.whl
 fi
 if [ "$deepspeed_install" == "1" ]; then
-    echo "Installing deepspeed"
+    echo "Building deepspeed wheel"
     python setup.py bdist_wheel
 fi

 if [ "$local_only" == "1" ]; then
-    if [ "$third_party_install" == "1" ]; then
-        echo "Installing apex locally"
-        sudo -H pip uninstall -y apex
-        sudo -H pip install third_party/apex/dist/apex*.whl
-    fi
     if [ "$deepspeed_install" == "1" ]; then
         echo "Installing deepspeed"
         sudo -H pip uninstall -y deepspeed
new file (imported below as simple_model):
+import os
+import json
+import argparse
+import torch
+
+
+class SimpleModel(torch.nn.Module):
+    def __init__(self, hidden_dim, empty_grad=False):
+        super(SimpleModel, self).__init__()
+        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
+        if empty_grad:
+            self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
+        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
+
+    def forward(self, x, y):
+        hidden_dim = x
+        hidden_dim = self.linear(hidden_dim)
+        return self.cross_entropy_loss(hidden_dim, y)
+
+
+def random_dataloader(model, total_samples, hidden_dim, device):
+    batch_size = model.train_micro_batch_size_per_gpu()
+    train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
+    train_label = torch.empty(total_samples,
+                              dtype=torch.long,
+                              device=device).random_(hidden_dim)
+    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
+    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
+    return train_loader
+
+
+def create_config_from_dict(tmpdir, config_dict):
+    config_path = os.path.join(tmpdir, 'temp_config.json')
+    with open(config_path, 'w') as fd:
+        json.dump(config_dict, fd)
+    return config_path
+
+
+def args_from_dict(tmpdir, config_dict):
+    config_path = create_config_from_dict(tmpdir, config_dict)
+    parser = argparse.ArgumentParser()
+    args = parser.parse_args(args='')
+    args.deepspeed = True
+    args.deepspeed_config = config_path
+    args.local_rank = 0
+    return args
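A quick sanity check of how these helpers compose, assuming only a writable temporary directory (the tests below get one from pytest's tmpdir fixture; tempfile here is just for the standalone sketch):

from simple_model import args_from_dict
import tempfile

tmpdir = tempfile.mkdtemp()
config_dict = {"train_batch_size": 1}
args = args_from_dict(tmpdir, config_dict)
# args is an argparse.Namespace pointing at the JSON config written to disk.
print(args.deepspeed_config)   # .../temp_config.json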
 # A test on its own
 import torch
 import pytest
+import json
+import argparse
 from common import distributed_test
+from simple_model import SimpleModel, create_config_from_dict, random_dataloader
 import torch.distributed as dist

 # A test on its own
@@ -100,3 +103,54 @@ def test_batch_config(num_ranks, batch, micro_batch, gas, success):
     """Run batch config test """
     _test_batch_config(num_ranks, batch, micro_batch, gas, success)
+
+
+def test_temp_config_json(tmpdir):
+    config_dict = {
+        "train_batch_size": 1,
+    }
+    config_path = create_config_from_dict(tmpdir, config_dict)
+    config_json = json.load(open(config_path, 'r'))
+    assert 'train_batch_size' in config_json
+
+
+def test_deprecated_deepscale_config(tmpdir):
+    config_dict = {
+        "train_batch_size": 1,
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 0.00015
+            }
+        },
+        "fp16": {
+            "enabled": True
+        }
+    }
+    config_path = create_config_from_dict(tmpdir, config_dict)
+    parser = argparse.ArgumentParser()
+    args = parser.parse_args(args='')
+    args.deepscale_config = config_path
+    args.local_rank = 0
+
+    hidden_dim = 10
+    model = SimpleModel(hidden_dim)
+
+    @distributed_test(world_size=[1])
+    def _test_deprecated_deepscale_config(args, model, hidden_dim):
+        model, _, _, _ = deepspeed.initialize(args=args,
+                                              model=model,
+                                              model_parameters=model.parameters(),
+                                              dist_init_required=False)
+        data_loader = random_dataloader(model=model,
+                                        total_samples=5,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
+        for n, batch in enumerate(data_loader):
+            loss = model(batch[0], batch[1])
+            model.backward(loss)
+            model.step()
+
+    _test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim)
@@ -5,67 +5,7 @@ import pytest
 import json
 import os
 from common import distributed_test
+from simple_model import SimpleModel, random_dataloader, args_from_dict
-
-
-def create_config_from_dict(tmpdir, config_dict):
-    config_path = os.path.join(tmpdir, 'temp_config.json')
-    with open(config_path, 'w') as fd:
-        json.dump(config_dict, fd)
-    return config_path
-
-
-class SimpleModel(torch.nn.Module):
-    def __init__(self, hidden_dim, empty_grad=False):
-        super(SimpleModel, self).__init__()
-        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
-        if empty_grad:
-            self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)])
-        self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
-
-    def forward(self, x, y):
-        hidden_dim = x
-        hidden_dim = self.linear(hidden_dim)
-        return self.cross_entropy_loss(hidden_dim, y)
-
-
-def test_temp_config_json(tmpdir):
-    config_dict = {
-        "train_batch_size": 1,
-    }
-    config_path = create_config_from_dict(tmpdir, config_dict)
-    config_json = json.load(open(config_path, 'r'))
-    assert 'train_batch_size' in config_json
-
-
-def prepare_optimizer_parameters(model):
-    param_optimizer = list(model.named_parameters())
-    optimizer_grouped_parameters = [{
-        'params': [p for n, p in param_optimizer],
-        'weight_decay': 0.0
-    }]
-    return optimizer_grouped_parameters
-
-
-def get_data_loader(model, total_samples, hidden_dim, device):
-    batch_size = model.train_micro_batch_size_per_gpu()
-    train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half)
-    train_label = torch.empty(total_samples,
-                              dtype=torch.long,
-                              device=device).random_(hidden_dim)
-    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
-    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
-    return train_loader
-
-
-def get_args(tmpdir, config_dict):
-    config_path = create_config_from_dict(tmpdir, config_dict)
-    parser = argparse.ArgumentParser()
-    args = parser.parse_args(args='')
-    args.deepspeed = True
-    args.deepspeed_config = config_path
-    args.local_rank = 0
-    return args
-
-
 def test_lamb_fp16_basic(tmpdir):
@@ -83,7 +23,7 @@ def test_lamb_fp16_basic(tmpdir):
             "enabled": True
         }
     }
-    args = get_args(tmpdir, config_dict)
+    args = args_from_dict(tmpdir, config_dict)

     hidden_dim = 10
     model = SimpleModel(hidden_dim, empty_grad=False)
@@ -94,10 +34,10 @@ def test_lamb_fp16_basic(tmpdir):
                                              model=model,
                                              model_parameters=model.parameters(),
                                              dist_init_required=False)
-        data_loader = get_data_loader(model=model,
-                                      total_samples=50,
-                                      hidden_dim=hidden_dim,
-                                      device=model.device)
+        data_loader = random_dataloader(model=model,
+                                        total_samples=50,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
         for n, batch in enumerate(data_loader):
             loss = model(batch[0], batch[1])
             model.backward(loss)
@@ -121,7 +61,7 @@ def test_lamb_fp16_empty_grad(tmpdir):
             "enabled": True
         }
     }
-    args = get_args(tmpdir, config_dict)
+    args = args_from_dict(tmpdir, config_dict)

     hidden_dim = 10
     model = SimpleModel(hidden_dim, empty_grad=True)
@@ -132,10 +72,10 @@ def test_lamb_fp16_empty_grad(tmpdir):
                                              model=model,
                                              model_parameters=model.parameters(),
                                              dist_init_required=False)
-        data_loader = get_data_loader(model=model,
-                                      total_samples=50,
-                                      hidden_dim=hidden_dim,
-                                      device=model.device)
+        data_loader = random_dataloader(model=model,
+                                        total_samples=50,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
         for n, batch in enumerate(data_loader):
             loss = model(batch[0], batch[1])
             model.backward(loss)
@@ -152,7 +92,7 @@ def test_adamw_fp16_basic(tmpdir):
             "enabled": True
         }
     }
-    args = get_args(tmpdir, config_dict)
+    args = args_from_dict(tmpdir, config_dict)

     hidden_dim = 10
     model = SimpleModel(hidden_dim, empty_grad=False)
@@ -164,10 +104,10 @@ def test_adamw_fp16_basic(tmpdir):
                                              model=model,
                                              optimizer=optimizer,
                                              dist_init_required=False)
-        data_loader = get_data_loader(model=model,
-                                      total_samples=50,
-                                      hidden_dim=hidden_dim,
-                                      device=model.device)
+        data_loader = random_dataloader(model=model,
+                                        total_samples=50,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
         for n, batch in enumerate(data_loader):
             loss = model(batch[0], batch[1])
             model.backward(loss)
@@ -184,7 +124,7 @@ def test_adamw_fp16_empty_grad(tmpdir):
             "enabled": True
         }
     }
-    args = get_args(tmpdir, config_dict)
+    args = args_from_dict(tmpdir, config_dict)

     hidden_dim = 10
     model = SimpleModel(hidden_dim, empty_grad=True)
@@ -196,10 +136,10 @@ def test_adamw_fp16_empty_grad(tmpdir):
                                              model=model,
                                              optimizer=optimizer,
                                              dist_init_required=False)
-        data_loader = get_data_loader(model=model,
-                                      total_samples=50,
-                                      hidden_dim=hidden_dim,
-                                      device=model.device)
+        data_loader = random_dataloader(model=model,
+                                        total_samples=50,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device)
         for n, batch in enumerate(data_loader):
             loss = model(batch[0], batch[1])
             model.backward(loss)