Unverified commit 7bdb9a7f authored by anj-s, committed by GitHub

remove examples dir (#712)

parent 370b8483
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
def dist_init(rank, world_size):
    # Prefer NCCL when CUDA is available, otherwise fall back to Gloo.
    backend = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO  # type: ignore
    print(f"Using backend: {backend}")
    dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)


def get_model():
    # A small toy model: 10 -> 10 -> 5.
    return nn.Sequential(torch.nn.Linear(10, 10), torch.nn.ReLU(), torch.nn.Linear(10, 5))


def get_data(n_batches=1):
    # Each batch is 20 samples of 10 features with binary targets.
    return [(torch.randn(20, 10), torch.randint(0, 2, size=(20, 1)).squeeze()) for _ in range(n_batches)]


def get_loss_fun():
    return F.nll_loss
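
# Editor's addition (smoke test, not in the original file): exercise the
# helpers once in a single process. get_loss_fun() returns F.nll_loss, which
# expects log-probabilities, so log_softmax is applied to the raw model output.
if __name__ == "__main__":
    model = get_model()
    data, target = get_data()[0]
    loss = get_loss_fun()(F.log_softmax(model(data), dim=1), target)
    loss.backward()
    print(f"Smoke test loss: {loss.item():.4f}")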
# adapted from https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function
import argparse
import time
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from fairscale.nn.data_parallel import ShardedDataParallel
WORLD_SIZE = 2
OPTIM = torch.optim.RMSprop
BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO
def dist_init(rank, world_size, backend):
    print(f"Using backend: {backend}")
    dist.init_process_group(backend=backend, init_method="tcp://localhost:29501", rank=rank, world_size=world_size)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        # For a 1x28x28 MNIST input: conv1 -> 26x26, conv2 -> 24x24,
        # max_pool2d(2) -> 12x12, so the flattened size is 64 * 12 * 12 = 9216.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
def train(rank, args, model, device, train_loader, num_epochs):
    ##############
    # SETUP
    dist_init(rank, WORLD_SIZE, BACKEND)

    # ShardedDataParallel builds the sharded optimizer internally from the
    # given optimizer class and parameters.
    ddp = ShardedDataParallel(
        module=model,
        optimizer=torch.optim.Adadelta,
        optimizer_params={"lr": 1e-4},
        world_size=WORLD_SIZE,
        broadcast_buffers=True,
    )
    ddp.train()
    optimizer = ddp.optimizer

    # Reset the memory use counter (CUDA only; the Gloo/CPU path would
    # otherwise crash on these calls)
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats(rank)
        torch.cuda.synchronize(rank)

    # Training loop
    training_start = time.monotonic()
    # Net already outputs log-probabilities (log_softmax), so use NLLLoss;
    # CrossEntropyLoss would apply log_softmax a second time.
    loss_fn = nn.NLLLoss()
    ##############

    model.train()
    for epoch in range(num_epochs):
        epoch_start = time.monotonic()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)

            def closure():
                model.zero_grad()
                outputs = model(data)
                loss = loss_fn(outputs, target)
                loss.backward()
                ddp.reduce()  # Send the gradients to the appropriate shards
                return loss

            optimizer.step(closure)
        epoch_end = time.monotonic()
        print(f"[{rank}] Epoch {epoch}: {epoch_end - epoch_start:.2f} sec")

    if torch.cuda.is_available():
        torch.cuda.synchronize(rank)
    training_stop = time.monotonic()
    print("Total Time:", training_stop - training_start)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
    )
    parser.add_argument(
        "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)"
    )
    parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training")
    parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model")
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"batch_size": args.batch_size}
    if use_cuda:
        kwargs.update({"num_workers": 1, "pin_memory": True, "shuffle": True})

    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)

    model = Net().to(device)
    mp.spawn(
        train, args=(args, model, device, train_loader, args.epochs), nprocs=WORLD_SIZE, join=True,
    )


if __name__ == "__main__":
    main()
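
# Editor's note (the launch command is an assumption; the original filename is
# not shown in this diff): the script spawns its own WORLD_SIZE processes via
# mp.spawn, so it runs as a plain Python program, e.g.
#
#   python <this_file>.py --epochs 1 --batch-size 64
#
# With the NCCL backend each rank drives its own GPU, so WORLD_SIZE = 2
# assumes two visible CUDA devices.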
# adapted from https://github.com/pytorch/examples/blob/master/mnist/main.py
from __future__ import print_function
import argparse
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms
from fairscale.nn import Pipe
net = nn.Sequential(
    nn.Conv2d(1, 32, 3, 1),
    nn.ReLU(),
    nn.Conv2d(32, 64, 3, 1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2),
    nn.Dropout2d(0.25),
    nn.Flatten(1),
    nn.Linear(9216, 128),
    nn.ReLU(),
    nn.Dropout2d(0.5),
    nn.Linear(128, 10),
    nn.LogSoftmax(dim=1),
)
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        # With Pipe, the output lives on the last partition's device, so move
        # it back to `device` (the first device) before computing the loss.
        loss = F.nll_loss(output.to(device), target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )
            if args.dry_run:
                break
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # The pipeline output lives on the last device; move it back before
            # accumulating the loss and accuracy.
            test_loss += F.nll_loss(output.to(device), target, reduction="sum").item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True).to(device)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
    )
    parser.add_argument(
        "--test-batch-size", type=int, default=1000, metavar="N", help="input batch size for testing (default: 1000)"
    )
    parser.add_argument("--epochs", type=int, default=14, metavar="N", help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M", help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    loader_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, batch_size=args.batch_size, **loader_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, batch_size=args.test_batch_size, **loader_kwargs)

    model = net
    # Split the 12-layer Sequential across two GPUs (first six layers on
    # device 0, the rest on device 1), with each batch cut into two chunks.
    model = Pipe(model, balance=[6, 6], devices=[0, 1], chunks=2)
    device = model.devices[0]
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        tic = time.perf_counter()
        train(args, model, device, train_loader, optimizer, epoch)
        toc = time.perf_counter()
        print(f">>> TRAINING Time {toc - tic:0.4f} seconds")
        tic = time.perf_counter()
        test(model, device, test_loader)
        toc = time.perf_counter()
        print(f">>> TESTING Time {toc - tic:0.4f} seconds")
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == "__main__":
    main()
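
# Editor's sketch (assumption: fairscale's Pipe accepts CPU devices, as in
# torchgpipe): to try the pipeline without two GPUs, the same split can be
# placed on the CPU, e.g.
#
#   model = Pipe(net, balance=[6, 6], devices=["cpu", "cpu"], chunks=2)
#
# This still exercises the partitioning and micro-batch scheduling, just
# without any cross-GPU transfers.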
import time
from typing import Optional, Union, cast
from helpers import dist_init, get_data, get_loss_fun, get_model
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from fairscale.optim.oss import OSS
WORLD_SIZE = 2
EPOCHS = 3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def train(rank: int, world_size: int, epochs: int, use_oss: bool):
    # DDP
    dist_init(rank, world_size)
    device = torch.device("cpu") if DEVICE == "cpu" else rank  # type:ignore

    # Problem statement
    model = get_model().to(device)
    dataloader = get_data(n_batches=1)
    loss_fn = get_loss_fun()

    optimizer: Optional[Union[OSS, torch.optim.SGD]] = None

    if not use_oss:
        optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
    else:
        base_optimizer = torch.optim.SGD
        base_optimizer_arguments = {"lr": 1e-4}  # any optimizer specific arguments, LR, momentum, etc...
        optimizer = OSS(
            params=model.parameters(), optim=base_optimizer, broadcast_buffer_size=2 ** 17, **base_optimizer_arguments
        )

    training_start = time.monotonic()
    # Any relevant training loop, nothing specific to OSS. For example:
    model.train()
    for _ in range(epochs):
        for (data, target) in dataloader:
            data, target = data.to(device), target.to(device)

            # Train
            model.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, target)
            loss.backward()

            # if you want to clip the gradients / get the current max:
            max_norm = 1000.0
            norm_type = 1
            if not use_oss:
                _total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm, norm_type=norm_type)  # type: ignore
            else:
                # OSS only holds a shard of the optimizer state, so the norm
                # must be computed across all ranks via its own helper.
                optimizer = cast(OSS, optimizer)
                _total_norm = optimizer.clip_grad_norm(max_norm, norm_type=norm_type)

            optimizer.step()
            print(f"Loss: {loss.item()}")

    training_end = time.monotonic()
    print(f"[{dist.get_rank()}] : Training done. {training_end - training_start:.2f} sec")

    if DEVICE == "cuda":
        # max_memory_allocated() returns bytes; convert to MiB for printing.
        max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
        print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")
if __name__ == "__main__":
    # Run the same toy training twice: without OSS, then with OSS sharding.
    training_start1 = time.monotonic()
    mp.spawn(train, args=(WORLD_SIZE, EPOCHS, False), nprocs=WORLD_SIZE, join=True)
    training_end1 = time.monotonic()

    training_start2 = time.monotonic()
    mp.spawn(train, args=(WORLD_SIZE, EPOCHS, True), nprocs=WORLD_SIZE, join=True)
    training_end2 = time.monotonic()

    print("Total Time without OSS:", training_end1 - training_start1)
    print("Total Time with OSS:", training_end2 - training_start2)
from helpers import get_data, get_loss_fun, get_model
import torch
import torch.optim as optim
import fairscale
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RANK = 0 # example
model = get_model()
data, target = get_data()[0]
loss_fn = get_loss_fun()
model = fairscale.nn.Pipe(model, balance=[2, 1])
# define optimizer and loss function
optimizer = optim.SGD(model.parameters(), lr=0.001)
# zero the parameter gradients
optimizer.zero_grad()
device = torch.device("cuda", RANK) if DEVICE == "cuda" else torch.device("cpu")
# outputs and target need to be on the same device
# forward step (the input must require grad so that Pipe's micro-batch
# checkpointing can backpropagate through the first partition)
outputs = model(data.to(device).requires_grad_())
# compute loss
loss = loss_fn(outputs.to(device), target.to(device))
# backward + optimize
loss.backward()
optimizer.step()
print("Finished Training Step")
del model
# run with:
# mpirun -np 2 --host localhost:2 -x PYTHONPATH=$PWD python examples/tutorial_pipe_rpc.py
import os
from helpers import dist_init, get_data, get_loss_fun, get_model
import torch
import torch.optim as optim
import torch_pg
import fairscale
from fairscale.nn.model_parallel import initialize_model_parallel
def register_optimizer(ctx, model):
    # Set the optimizer as an attribute on the model so we can access it later
    model.optimizer = optim.SGD(model.parameters(), **ctx)
    # zero the parameter gradients
    model.optimizer.zero_grad()


def run_optimizer(ctx, model):
    model.optimizer.step()
def run(rank, world_size):
    torch_pg.init_mpi()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)  # FIXME (supports gloo)
    os.environ["MASTER_PORT"] = "10639"
    torch.distributed.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size, pipeline_backend="mpi")

    if rank == 1:
        # For RPC, all ranks other than 0 just need to call rpc.shutdown()
        torch.distributed.rpc.shutdown()
        return

    model = get_model()
    data, target = get_data()[0]
    loss_fn = get_loss_fun()

    device = torch.device("cuda", rank)

    model = fairscale.nn.PipeRPCWrapper(
        model,
        balance=[2, 1],
        worker_map={0: "worker0", 1: "worker1"},  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # We can't directly access the model on each worker, so we need to call
    # foreach_worker with a callback to set up the optimizer
    model.foreach_worker(register_optimizer, {"lr": 0.001}, include_self=True)

    outputs = model(data.to(device))
    loss = loss_fn(outputs.to(device), target.to(device))
    loss.backward()

    # Same as earlier, use foreach_worker to step the optimizer on each rank
    model.foreach_worker(run_optimizer, include_self=True)

    print(f"Finished Training Step on {rank}")

    torch.distributed.rpc.shutdown()

    del model


if __name__ == "__main__":
    rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])

    run(rank, world_size)
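
# Editor's sketch (hypothetical zero_grad helper, mirroring run_optimizer
# above): to run more than one step, zero the gradients on every worker
# between iterations, e.g.
#
#   def zero_grad(ctx, model):
#       model.optimizer.zero_grad()
#
#   model.foreach_worker(zero_grad, include_self=True)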