Commit 0024a5c6 authored by zhuwenwen

Merge branch 'main' of https://github.com/NVIDIA/Megatron-LM

parents b004456b 3db2063b
# Copyright (c) 2020 The MMSegmentation Authors.
#
# This source code is licensed under the Apache license found in the
# LICENSE file in the root directory of this source tree.
import random
import os
import math
import mmcv
import torch
import numpy as np
import torchvision.transforms as T
from torchvision import datasets
from torch.utils.data import Dataset
from megatron import print_rank_0
from megatron import get_args
from PIL import Image, ImageOps, ImageEnhance
import torchvision.transforms as torch_tr
def _is_pil_image(img):
return isinstance(img, Image.Image)
class PhotoMetricDistortion(object):
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def convert(self, img, alpha=1, beta=0):
"""Multiple with alpha and add beat with clip."""
img = img.astype(np.float32) * alpha + beta
img = np.clip(img, 0, 255)
return img.astype(np.uint8)
def brightness(self, img):
"""Brightness distortion."""
if random.randint(0, 1):
return self.convert(
img,
beta=random.uniform(-self.brightness_delta,
self.brightness_delta))
return img
def contrast(self, img):
"""Contrast distortion."""
if random.randint(0, 1):
return self.convert(
img,
alpha=random.uniform(self.contrast_lower, self.contrast_upper))
return img
def saturation(self, img):
"""Saturation distortion."""
if random.randint(0, 1):
img = mmcv.bgr2hsv(img)
img[:, :, 1] = self.convert(
img[:, :, 1],
alpha=random.uniform(self.saturation_lower,
self.saturation_upper))
img = mmcv.hsv2bgr(img)
return img
def hue(self, img):
"""Hue distortion."""
if random.randint(0, 1):
img = mmcv.bgr2hsv(img)
img[:, :,
0] = (img[:, :, 0].astype(int) +
random.randint(-self.hue_delta, self.hue_delta)) % 180
img = mmcv.hsv2bgr(img)
return img
    def __call__(self, img):
        """Apply the photometric distortions to an image.
        Args:
            img (PIL Image): Image to distort.
        Returns:
            PIL Image: Distorted image, converted to RGB.
        """
img = np.array(img)
# random brightness
img = self.brightness(img)
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(0, 1)
if mode == 1:
img = self.contrast(img)
# random saturation
img = self.saturation(img)
# random hue
img = self.hue(img)
# random contrast
if mode == 0:
img = self.contrast(img)
img = Image.fromarray(img.astype(np.uint8)).convert('RGB')
return img
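# Usage sketch (illustrative only; the file name is a placeholder):
# >>> distort = PhotoMetricDistortion()
# >>> out = distort(Image.open('example.jpg').convert('RGB'))
# 'out' is a PIL Image of the same size whose brightness, contrast, saturation
# and hue have each been perturbed with probability 0.5.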
class RandomCrop(object):
    """
    Take a random crop from the image.
    If the incoming image is smaller than the crop size, it is first padded up
    to the crop size; the mask is padded with ignore_index. A random crop is
    then taken such that the crop fits within the (possibly padded) image.
    When cat_max_ratio < 1, the crop is re-drawn up to 10 times to avoid crops
    in which a single category occupies more than that fraction of the mask.
    """
def __init__(self, crop_size):
args = get_args()
self.size = crop_size
self.cat_max_ratio = 0.75
self.ignore_index = args.ignore_index
self.pad_color = (0, 0, 0)
def get_crop_bbox(self, img):
"""Randomly get a crop bounding box."""
img_w, img_h = img.size
target_h, target_w = self.size #[H W]
margin_h = max(img_h - target_h, 0)
margin_w = max(img_w - target_w, 0)
offset_h = random.randint(0, margin_h)
offset_w = random.randint(0, margin_w)
crop_y1, crop_y2 = offset_h, offset_h + target_h
crop_x1, crop_x2 = offset_w, offset_w + target_w
return crop_y1, crop_y2, crop_x1, crop_x2
def crop(self, img, crop_bbox):
"""Crop from ``img``"""
crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox
img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
return img
@staticmethod
def crop_in_image(target_w, target_h, w, h, img, mask):
if w == target_w:
x1 = 0
else:
x1 = random.randint(0, w - target_w)
if h == target_h:
y1 = 0
else:
y1 = random.randint(0, h - target_h)
return [img.crop((x1, y1, x1 + target_w, y1 + target_h)),
mask.crop((x1, y1, x1 + target_w, y1 + target_h))]
def __call__(self, img, mask):
w, h = img.size
target_h, target_w = self.size # ASSUME H, W
if w == target_w and h == target_h:
return img, mask
# Pad image if image < crop
if target_h > h:
pad_h = (target_h - h) // 2 + 1
else:
pad_h = 0
if target_w > w:
pad_w = (target_w - w) // 2 + 1
else:
pad_w = 0
border = (pad_w, pad_h, pad_w, pad_h)
if pad_h or pad_w:
img = ImageOps.expand(img, border=border, fill=(0, 0, 0))
mask = ImageOps.expand(mask, border=border, fill=self.ignore_index)
w, h = img.size
crop_bbox = self.get_crop_bbox(img)
if self.cat_max_ratio < 1.:
# Repeat 10 times
for _ in range(10):
seg_temp = self.crop(mask, crop_bbox)
labels, cnt = np.unique(seg_temp, return_counts=True)
cnt = cnt[labels != self.ignore_index]
if len(cnt) > 1 and np.max(cnt) / np.sum(
cnt) < self.cat_max_ratio:
break
crop_bbox = self.get_crop_bbox(img)
# crop the image
img = self.crop(img, crop_bbox)
# crop semantic seg
mask = self.crop(mask, crop_bbox)
assert(img.size[0] == self.size[1] and img.size[1] == self.size[0])
return img, mask
class RandomSizeAndCrop(object):
def __init__(self,
crop_size,
scale_min=0.5,
scale_max=2.0):
self.crop = RandomCrop(crop_size)
self.scale_min = scale_min
self.scale_max = scale_max
def __call__(self, img, mask):
scale_amt = random.uniform(self.scale_min, self.scale_max)
w, h = [int(i * scale_amt) for i in img.size]
resized_img = img.resize((w, h), Image.BICUBIC)
resized_mask = mask.resize((w, h), Image.NEAREST)
img, mask = self.crop(resized_img, resized_mask)
return img, mask
class RandomHorizontallyFlip(object):
def __call__(self, img, mask):
if random.random() < 0.5:
return img.transpose(Image.FLIP_LEFT_RIGHT), mask.transpose(
Image.FLIP_LEFT_RIGHT)
return img, mask
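# Composition sketch (hypothetical crop size; assumes Megatron args are
# initialized, since RandomCrop reads ignore_index via get_args()):
# >>> scale_crop = RandomSizeAndCrop(crop_size=(512, 512))
# >>> flip = RandomHorizontallyFlip()
# >>> img, mask = scale_crop(img, mask)  # random scale in [0.5, 2.0], then crop
# >>> img, mask = flip(img, mask)        # horizontal flip with probability 0.5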
def adjust_brightness(img, brightness_factor):
"""Adjust brightness of an Image.
Args:
img (PIL Image): PIL Image to be adjusted.
brightness_factor (float): How much to adjust the brightness. Can be
any non negative number. 0 gives a black image, 1 gives the
original image while 2 increases the brightness by a factor of 2.
Returns:
PIL Image: Brightness adjusted image.
"""
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
enhancer = ImageEnhance.Brightness(img)
img = enhancer.enhance(brightness_factor)
return img
def adjust_contrast(img, contrast_factor):
"""Adjust contrast of an Image.
Args:
img (PIL Image): PIL Image to be adjusted.
contrast_factor (float): How much to adjust the contrast. Can be any
non negative number. 0 gives a solid gray image, 1 gives the
original image while 2 increases the contrast by a factor of 2.
Returns:
PIL Image: Contrast adjusted image.
"""
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
enhancer = ImageEnhance.Contrast(img)
img = enhancer.enhance(contrast_factor)
return img
def adjust_saturation(img, saturation_factor):
"""Adjust color saturation of an image.
Args:
img (PIL Image): PIL Image to be adjusted.
saturation_factor (float): How much to adjust the saturation. 0 will
give a black and white image, 1 will give the original image while
2 will enhance the saturation by a factor of 2.
Returns:
PIL Image: Saturation adjusted image.
"""
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
enhancer = ImageEnhance.Color(img)
img = enhancer.enhance(saturation_factor)
return img
def adjust_hue(img, hue_factor):
"""Adjust hue of an image.
The image hue is adjusted by converting the image to HSV and
cyclically shifting the intensities in the hue channel (H).
The image is then converted back to original image mode.
`hue_factor` is the amount of shift in H channel and must be in the
interval `[-0.5, 0.5]`.
See https://en.wikipedia.org/wiki/Hue for more details on Hue.
Args:
img (PIL Image): PIL Image to be adjusted.
hue_factor (float): How much to shift the hue channel. Should be in
[-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
HSV space in positive and negative direction respectively.
0 means no shift. Therefore, both -0.5 and 0.5 will give an image
with complementary colors while 0 gives the original image.
Returns:
PIL Image: Hue adjusted image.
"""
if not(-0.5 <= hue_factor <= 0.5):
        raise ValueError('hue_factor ({}) is not in [-0.5, 0.5].'.format(hue_factor))
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
input_mode = img.mode
if input_mode in {'L', '1', 'I', 'F'}:
return img
h, s, v = img.convert('HSV').split()
np_h = np.array(h, dtype=np.uint8)
    # uint8 addition takes care of wrapping around the hue boundary
with np.errstate(over='ignore'):
np_h += np.uint8(hue_factor * 255)
h = Image.fromarray(np_h, 'L')
img = Image.merge('HSV', (h, s, v)).convert(input_mode)
return img
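# Worked example of the wrap-around above (uint8 arithmetic is modulo 256, so
# a hue value near the top of the range rotates back through zero):
# >>> with np.errstate(over='ignore'):
# ...     int(np.uint8(250) + np.uint8(20))
# 14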
class ColorJitter(object):
"""Randomly change the brightness, contrast and saturation of an image.
Args:
brightness (float): How much to jitter brightness. brightness_factor
is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
contrast (float): How much to jitter contrast. contrast_factor
is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
saturation (float): How much to jitter saturation. saturation_factor
is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
        hue (float): How much to jitter hue. hue_factor is chosen uniformly
            from [-hue, hue]. Should be >= 0 and <= 0.5.
"""
def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
self.brightness = brightness
self.contrast = contrast
self.saturation = saturation
self.hue = hue
@staticmethod
def get_params(brightness, contrast, saturation, hue):
"""Get a randomized transform to be applied on image.
Arguments are same as that of __init__.
Returns:
Transform which randomly adjusts brightness, contrast and
saturation in a random order.
"""
transforms = []
if brightness > 0:
brightness_factor = np.random.uniform(max(0, 1 - brightness), 1 + brightness)
transforms.append(
torch_tr.Lambda(lambda img: adjust_brightness(img, brightness_factor)))
if contrast > 0:
contrast_factor = np.random.uniform(max(0, 1 - contrast), 1 + contrast)
transforms.append(
torch_tr.Lambda(lambda img: adjust_contrast(img, contrast_factor)))
if saturation > 0:
saturation_factor = np.random.uniform(max(0, 1 - saturation), 1 + saturation)
transforms.append(
torch_tr.Lambda(lambda img: adjust_saturation(img, saturation_factor)))
if hue > 0:
hue_factor = np.random.uniform(-hue, hue)
transforms.append(
torch_tr.Lambda(lambda img: adjust_hue(img, hue_factor)))
np.random.shuffle(transforms)
transform = torch_tr.Compose(transforms)
return transform
def __call__(self, img):
"""
Args:
img (PIL Image): Input image.
Returns:
PIL Image: Color jittered image.
"""
transform = self.get_params(self.brightness, self.contrast,
self.saturation, self.hue)
return transform(img)
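# Usage sketch: each jitter strength is a fraction of the identity factor 1.0,
# and the enabled sub-transforms run in a random order on every call.
# >>> jitter = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
# >>> out = jitter(img)  # img: PIL Image in; color-jittered PIL Image out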
import math
import torch
import numpy as np
from megatron import get_args
def slidingcrops(img, mask):
# img: [b c h w]
# mask: [b h w]
args = get_args()
assert args.img_h == args.img_w
crop_size = args.img_h
stride = args.seg_stride
ignore_index = args.ignore_index
n, c, h, w = img.shape
assert h >= crop_size
assert w >= crop_size
long_size = max(h, w)
img_slices, mask_slices, slices_info = [], [], []
if long_size > crop_size:
assert stride <= crop_size
h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1
w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1
for yy in range(h_step_num):
for xx in range(w_step_num):
sy, sx = yy * stride, xx * stride
ey, ex = sy + crop_size, sx + crop_size
img_sub = img[:, :, sy: ey, sx: ex]
mask_sub = mask[:, sy: ey, sx: ex]
# padding
sub_h, sub_w = img_sub.shape[2:]
pad_h = max(crop_size - sub_h, 0)
pad_w = max(crop_size - sub_w, 0)
img_sub = torch.nn.functional.pad(img_sub, pad=(0, pad_w, 0, pad_h), value=ignore_index)
mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h))
img_slices.append(img_sub)
mask_slices.append(mask_sub)
slices_info.append([sy, ey, sx, ex, sub_h, sub_w])
return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w)
else:
return img, mask, [[0, h, 0, w, h, w]], (h, w)
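# Worked example of the tiling above (illustrative numbers): a 1024 x 2048
# input with crop_size=768 and stride=512 yields 2 x 4 = 8 overlapping crops.
# >>> h, w, crop_size, stride = 1024, 2048, 768, 512
# >>> (math.ceil((h - crop_size) / stride) + 1) * (math.ceil((w - crop_size) / stride) + 1)
# 8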
def slidingjoins(preds, probs, labels, slices_info, img_size):
args = get_args()
num_slices = len(slices_info)
if num_slices == 1:
return preds, labels
h, w = img_size
split_size = args.micro_batch_size
preds_split = torch.split(preds, split_size)
probs_split = torch.split(probs, split_size)
labels_split = torch.split(labels, split_size)
assert(len(preds_split) == num_slices)
total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda')
total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
for i in range(num_slices):
sy, ey, sx, ex, sub_h, sub_w = slices_info[i]
assert sy + sub_h <= h
assert sx + sub_w <= w
curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w]
curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w]
local_max_probs = probs_split[i][:, :sub_h, : sub_w]
local_preds = preds_split[i][:, :sub_h, :sub_w]
result_max_probs = torch.maximum(curr_max_probs, local_max_probs)
result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds)
total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs
total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds
total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w]
return total_preds, total_labels
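# Minimal sketch of the merge rule above (hypothetical per-pixel tensors): each
# pixel keeps the prediction whose probability is highest across the tiles
# that cover it.
# >>> cur_p, cur_y = torch.tensor([0.9, 0.2]), torch.tensor([1, 1])
# >>> new_p, new_y = torch.tensor([0.5, 0.7]), torch.tensor([2, 2])
# >>> torch.where(cur_p >= new_p, cur_y, new_y)
# tensor([1, 2])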
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Zero-shot datasets."""
import json
import math
import numpy as np
import torch
from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from .detokenizer import get_detokenizer
def build_dataset(task):
"""Helper function to select and build dataset."""
if task == 'LAMBADA':
return _build_lambada_dataset()
if task == 'WIKITEXT103':
return _build_wikitext103_dataset()
raise NotImplementedError('dataset for {} task is not '
'implemented.'.format(task))
class _LMDataset(torch.utils.data.Dataset):
def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
num_tokenized_tokens, overalapping_eval=None):
self.tokens = tokens
self.seq_len = seq_len
self.pad_idx = pad_idx
self.overalapping_eval = overalapping_eval
if self.overalapping_eval is None:
self.overalapping_eval = self.seq_len
self.overalapping_eval = max(1, self.overalapping_eval)
self.num_original_tokens = num_original_tokens
self.num_tokenized_tokens = num_tokenized_tokens
self.total_targets = len(self.tokens) - 1
# remove first sequence tokens
targets = max(self.total_targets - self.overalapping_eval, 0)
self.total_sequences = max(
math.ceil(targets / self.overalapping_eval) + 1, 1)
def __len__(self):
return self.total_sequences
def __getitem__(self, idx):
start_idx = idx * self.overalapping_eval
end_idx = start_idx + self.seq_len
tokens = self.tokens[start_idx:end_idx + 1]
num_tokens = len(tokens)
pad_mask = [1] * num_tokens
if num_tokens < self.seq_len + 1:
num_pad = (self.seq_len + 1 - num_tokens)
pad_mask += [0] * (num_pad)
tokens += [self.pad_idx] * num_pad
pad_mask = np.array(pad_mask[1:])
if self.overalapping_eval != self.seq_len and idx != 0:
pad_mask[:-self.overalapping_eval] *= 0
return {'text': np.array(tokens), 'pad_mask': pad_mask}
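# Illustrative windowing (hypothetical sizes): with seq_len=4 and
# overalapping_eval=2, consecutive samples read overlapping token windows, and
# the pad_mask zeroing above ensures each target position is scored once.
# >>> seq_len, overlap = 4, 2
# >>> [(idx * overlap, idx * overlap + seq_len + 1) for idx in range(3)]
# [(0, 5), (2, 7), (4, 9)]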
class _LambadaDataset(torch.utils.data.Dataset):
def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
print_rank_0('> building lambada dataset from {} ...'.format(path))
self.seq_len = seq_len
self.pad_idx = pad_idx
self.tokenizer = tokenizer
self.strict = strict
self.tokens = []
self.labels = []
with open(path, 'r') as f:
for line in f.readlines():
text = json.loads(line)['text']
tokens, labels = self.get_tokens(text)
self.tokens.append(tokens)
self.labels.append(labels)
def get_tokens(self, text):
if not self.strict:
tokens = self.tokenizer.tokenize(text)
return tokens[:-1], [tokens[-1]]
last_token = text.split()[-1]
start_idx = text.rfind(last_token)
beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip())
last_token = self.tokenizer.tokenize(' ' + last_token)
return beginning_tokens, last_token
def __len__(self):
return len(self.tokens)
def __getitem__(self, idx):
tokens = self.tokens[idx]
num_tokens = len(tokens)
pad_mask = [0] * num_tokens
labels = self.labels[idx]
pad_mask += [1] * len(labels)
tokens = tokens + labels
num_tokens = len(tokens)
if num_tokens < self.seq_len + 1:
num_pad = (self.seq_len + 1 - num_tokens)
pad_mask += [0] * (num_pad)
tokens += [self.pad_idx] * num_pad
pad_mask = np.array(pad_mask[1:])
return {'text': np.array(tokens), 'pad_mask': pad_mask}
def _build_lambada_dataset():
"""Build lambada dataset."""
args = get_args()
tokenizer = get_tokenizer()
assert len(args.valid_data) == 1
val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer,
args.seq_length, args.strict_lambada)
print_rank_0(' > found {} samples.'.format(len(val_dataset)))
return val_dataset
def _build_wikitext103_dataset():
""""""
args = get_args()
tokenizer = get_tokenizer()
assert len(args.valid_data) == 1
with open(args.valid_data[0], "rb") as reader:
entire_data = reader.read().decode('utf-8')
num_original_tokens = len(entire_data.strip().split(" "))
entire_data = get_detokenizer(args.valid_data[0])(entire_data)
tokenized_data = tokenizer.tokenize(entire_data)
num_tokenized_tokens = len(tokenized_data)
val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
num_original_tokens, num_tokenized_tokens,
args.overlapping_eval)
    print_rank_0(' > number of original tokens: {}, number of tokenized '
                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))
return val_dataset
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Detokenization."""
import re
def ptb_detokenizer(string):
string = string.replace(" '", "'")
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" n't", "n't")
string = string.replace(" N ", "1 ")
string = string.replace("$ 1", "$1")
string = string.replace("# 1", "#1")
return string
def wikitext_detokenizer(string):
# contractions
string = string.replace("s '", "s'")
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
# number separators
string = string.replace(" @-@ ", "-")
string = string.replace(" @,@ ", ",")
string = string.replace(" @.@ ", ".")
# punctuation
string = string.replace(" : ", ": ")
string = string.replace(" ; ", "; ")
string = string.replace(" . ", ". ")
string = string.replace(" ! ", "! ")
string = string.replace(" ? ", "? ")
string = string.replace(" , ", ", ")
# double brackets
string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
# miscellaneous
string = string.replace("= = = =", "====")
string = string.replace("= = =", "===")
string = string.replace("= =", "==")
string = string.replace(" " + chr(176) + " ", chr(176))
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" N ", " 1 ")
string = string.replace(" 's", "'s")
return string
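# Example of the rules above on a made-up WikiText-style fragment:
# >>> wikitext_detokenizer("the 5 @,@ 000 @-@ strong crowd , cheering")
# 'the 5,000-strong crowd, cheering'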
def lambada_detokenizer(string):
return string
_DETOKENIZERS = {
'ptb': ptb_detokenizer,
'wiki': wikitext_detokenizer,
'lambada': lambada_detokenizer,
}
def get_detokenizer(path):
for key in _DETOKENIZERS.keys():
if key in path:
return _DETOKENIZERS[key]
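# Usage sketch (hypothetical path): the detokenizer is selected by substring
# match on the data path, so a WikiText validation file picks
# wikitext_detokenizer; a path matching no key falls through and returns None.
# >>> get_detokenizer('/data/wikitext-103/wiki.valid.tokens').__name__
# 'wikitext_detokenizer'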
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""GPT zero-shot evaluation."""
import math
import torch
from megatron import get_args
from megatron import print_rank_0, is_last_rank
from megatron import get_tokenizer
from megatron.core import parallel_state, tensor_parallel
from megatron.checkpointing import load_checkpoint
from megatron.model import GPTModel
from megatron.training import get_model
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
from tasks.finetune_utils import build_data_loader
from .datasets import build_dataset
# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible?
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
def get_model_provider(eval_metric):
"""Based on evaluation metric set the parallel-output flag and
return the model provider."""
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
if eval_metric == 'loss':
parallel_output = True
elif eval_metric == 'accuracy':
parallel_output = False
else:
raise NotImplementedError('output type for {} evaluation metric '
'is not supported.'.format(eval_metric))
print_rank_0('building GPT model ...')
model = GPTModel(num_tokentypes=0, parallel_output=parallel_output,
pre_process=pre_process, post_process=post_process)
return model
return model_provider
def process_batch(batch):
"""Process batch and produce inputs for the model."""
args = get_args()
tokenizer = get_tokenizer()
loss_mask = batch['pad_mask'].long().cuda().contiguous().byte()
tokens_ = batch['text'].long().cuda().contiguous()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
    # Get the masks and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
"""Forward step."""
# Get the batch.
tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
batch)
# Tell the model what our actual batch size will be
args = get_args()
args.micro_batch_size = len(labels)
input_tensor = recv_forward()
# Forward pass through the model.
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
unwrapped_model.set_input_tensor(input_tensor)
output = model(tokens, position_ids, attention_mask)
send_forward(output)
if parallel_state.is_pipeline_last_stage():
# For loss, return the unreduced loss.
if eval_metric == 'loss':
losses = tensor_parallel.vocab_parallel_cross_entropy(
output.contiguous().float(), labels.contiguous())
loss = torch.sum(
losses.view(-1) * loss_mask.contiguous().view(-1).float())
return loss
# For accuracy, return the number of correctly predicted samples.
if eval_metric == 'accuracy':
outputs = torch.argmax(output, -1)
correct = (outputs == labels).float()
correct[(1 - loss_mask).bool()] = 1
correct = correct.prod(-1)
return correct.sum()
raise NotImplementedError('forward method for evaluation metric {} '
'is not implemented.'.format(eval_metric))
return None
def evaluate(data_loader, model, eval_metric):
"""Evaluation."""
args = get_args()
# Turn on evaluation mode which disables dropout.
model.eval()
total_output = 0.0
with torch.no_grad():
# For all the batches in the dataset.
for iteration, batch in enumerate(data_loader):
if iteration % args.log_interval == 0:
print_rank_0('> working on iteration: {}'.format(iteration))
# Forward evaluation.
output = forward_step(batch, model, eval_metric)
# Reduce across processes.
if parallel_state.is_pipeline_last_stage():
torch.distributed.all_reduce(output,
group=parallel_state.get_data_parallel_group())
total_output += output
return total_output
def evaluate_and_print_results(task, data_loader, model, eval_metric):
"""Evaluate and print results on screen."""
# Evaluate and get results.
output = evaluate(data_loader, model, eval_metric)
string = ' validation results on {} | '.format(task)
if is_last_rank():
if eval_metric == 'loss':
num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
num_original_tokens = data_loader.dataset.num_original_tokens
val_loss = output / (num_tokenized_tokens - 1)
ppl = math.exp(min(20, val_loss))
token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
string += 'avg loss: {:.4E} | '.format(val_loss)
string += 'ppl: {:.4E} | '.format(ppl)
string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
string += 'token ratio: {} |'.format(token_ratio)
elif eval_metric == 'accuracy':
num_examples = len(data_loader.dataset)
acc = output / num_examples
string += 'number correct: {:.4E} | '.format(output)
string += 'total examples: {:.4E} | '.format(num_examples)
string += 'avg accuracy: {:.4E}'.format(acc)
else:
raise NotImplementedError('evaluation method for {} metric is not '
'implemented yet.'.format(eval_metric))
length = len(string) + 1
print('-' * length)
print(string)
print('-' * length)
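# Worked example of the perplexity arithmetic above (made-up numbers): the
# adjusted perplexity rescales the loss by the tokenized-to-original token
# ratio so that results are comparable across tokenizers.
# >>> val_loss, token_ratio = 3.0, 1.1
# >>> round(math.exp(min(20, val_loss)), 3)
# 20.086
# >>> round(math.exp(min(20, val_loss * token_ratio)), 3)
# 27.113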
def main():
"""Main program."""
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
if args.task == 'LAMBADA':
eval_metric = 'accuracy'
elif args.task == 'WIKITEXT103':
eval_metric = 'loss'
else:
raise NotImplementedError('{} task is not implemented.'.format(
args.task))
# Set up model and load checkpoint.
model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
assert len(model) == 1, "Above condition should have caught this"
model = model[0]
# Data stuff.
dataset = build_dataset(args.task)
dataloader = build_data_loader(dataset, args.micro_batch_size,
args.num_workers, drop_last=False)
# Run evaluation.
evaluate_and_print_results(args.task, dataloader, model, eval_metric)
print_rank_0('done :-)')
import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator
def read_tb_logs_as_list(path, summary_name):
"""Reads a TensorBoard Events file from the input path, and returns the
summary specified as input as a list.
Arguments:
path: str, path to the dir where the events file is located.
summary_name: str, name of the summary to read from the TB logs.
Output:
        summary_list: list of the scalar values recorded for the requested summary.
"""
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)
if files:
event_file = files[0]
ea = event_accumulator.EventAccumulator(event_file)
ea.Reload()
summary = ea.Scalars(summary_name)
summary_list = [round(x.value, 5) for x in summary]
print(f'\nObtained the following list for {summary_name} ------------------')
print(summary_list)
return summary_list
raise FileNotFoundError(f"File not found matching: {path}/events*")
def collect_train_test_metrics(logs_dir, run_name):
# TODO: Fetch current baseline
# train loss
train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss")
# num zeros
num_zeros = read_tb_logs_as_list(logs_dir, "num-zeros")
iteration_time = read_tb_logs_as_list(logs_dir, "iteration-time")
    # The first few iterations take longer, so average only the last two
    # thirds of the timings.
    idx = len(iteration_time)//3
iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
train_metrics = {
"lm loss": {
"start_step": 0,
"end_step": len(train_loss_list),
"step_interval": 5,
"values": train_loss_list[0:len(train_loss_list):5],
},
"num-zeros": {
"start_step": 0,
"end_step": len(num_zeros),
"step_interval": 5,
"values": num_zeros[0:len(num_zeros):5],
},
"iteration_timing_avg": iteration_time_avg,
}
str_train_metrics = str(train_metrics).replace("'", "\"")
print(f"\n ----------- Store the following metrics in {run_name}.json ----------")
print(f"\n {str_train_metrics}", flush=True)
if __name__ == '__main__':
args = sys.argv[1:]
logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/
run_name = args[1]
collect_train_test_metrics(logs_dir, run_name)
import os
import json
import pytest
import sys
import glob
from tensorboard.backend.event_processing import event_accumulator
LOGS_DIR = os.getenv('LOGS_DIR')
EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE')
import enum
class TypeOfTest(enum.Enum):
APPROX = 1
DETERMINISTIC = 2
def read_tb_logs_as_list(path, summary_name):
"""Reads a TensorBoard Events file from the input path, and returns the
summary specified as input as a list.
Arguments:
path: str, path to the dir where the events file is located.
summary_name: str, name of the summary to read from the TB logs.
Output:
        summary_list: list of the scalar values recorded for the requested summary.
"""
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)
if files:
event_file = files[0]
ea = event_accumulator.EventAccumulator(event_file)
ea.Reload()
summary = ea.Scalars(summary_name)
summary_list = [round(x.value, 5) for x in summary]
print(f'\nObtained the following list for {summary_name} ------------------')
print(summary_list)
return summary_list
raise FileNotFoundError(f"File not found matching: {path}/events*")
# If we require a variation of tests for any of the other pipelines we can just inherit this class.
class TestCIPipeline:
margin_loss, margin_time = 0.05, 0.1
expected = None
    if EXPECTED_METRICS_FILE and os.path.exists(EXPECTED_METRICS_FILE):
with open(EXPECTED_METRICS_FILE) as f:
expected = json.load(f)
def _test_helper(self, loss_type, test_type):
        if self.expected is None:
            raise FileNotFoundError("Expected metrics could not be loaded from EXPECTED_METRICS_FILE.")
expected = self.expected[loss_type]
expected_list = expected["values"]
print(expected_list)
actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
print(f"Checking step {step} against expected {i}")
if test_type == TypeOfTest.APPROX:
                assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
else:
assert actual_list[step] == expected_list[i], f"The value at step {step} should be {expected_list[i]} but it is {actual_list[step]}."
@pytest.mark.xfail
def test_lm_loss_deterministic(self):
# Expected training loss curve at different global steps.
self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)
def test_lm_loss_approx(self):
# Expected training loss curve at different global steps.
self._test_helper("lm loss", TypeOfTest.APPROX)
def test_num_zeros_deterministic(self):
        # Expected num-zeros curve at different global steps.
self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC)
def iteration_timing_node(self):
expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time")
idx = len(iteration_time)//3
iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}."
import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator
LOGS_DIR = os.getenv('LOGS_DIR')
def read_tb_logs_as_list(path, summary_name, index):
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)
if files:
event_file = files[index]
ea = event_accumulator.EventAccumulator(event_file)
ea.Reload()
summary = ea.Scalars(summary_name)
summary_list = [round(x.value, 5) for x in summary]
print(summary_list)
return summary_list
raise FileNotFoundError(f"File not found matching: {path}/events*")
def collect_train_test_metrics(logs_dir, index):
train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index)
train_loss_list = [round(elem,3) for elem in train_loss_list]
train_metrics = {
"lm loss": train_loss_list[0:len(train_loss_list):5],
}
str_train_metrics = str(train_metrics).replace("'", "\"")
print(f"\n ----------- The following are the metrics for ----------")
print(f"\n {str_train_metrics}", flush=True)
return train_metrics
class TestCIPipeline:
train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)
def _test_helper(self, loss_type):
expected = self.train_metrics_100[loss_type]
print('expected : ' + str(expected))
actual = self.train_metrics_50_to_100[loss_type]
print('actual : ' + str(actual))
        # NOTE: When the gpt3 model is run straight through from step 0 to 100, the
        # expected list comes out one element longer than the resumed (50 to 100) run:
        # expected: [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368, 10.62319, 10.53908, 10.25005, 10.20907, 9.96542, 9.96802, 9.92436, 9.79086, 9.26718, 9.61784, 9.19018, 9.45986, 9.62168, 9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22436, 9.19436, 9.11323, 9.09711, 9.04421, 9.36795]
        # actual:   [9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22435, 9.19435, 9.11322, 9.09711, 9.04422]
        # That extra element would break a positional comparison, so we align the two
        # lists on the first element of 'actual' and compare only the overlapping
        # (50-100) tail. The root cause still needs to be tracked down.
start_idx_expected = expected.index(actual[0]) # First element of actual
# Here we will just be comparing values of actual and second half (50-100) of expected
for i in range(len(actual)):
assert actual[i] == expected[start_idx_expected + i], f"The value at step {i} should be {expected[start_idx_expected + i]} but it is {actual[i]}."
def test_lm_loss_deterministic(self):
self._test_helper("lm loss")
#! /bin/bash
JOBID=$1
echo "Job id : $JOBID"
if [[ -z "$JOBID" ]]; then
exit 1
fi
sleep 10s
while true; do
export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1`
case "${STATE}" in
PENDING|RUNNING|REQUEUED)
echo "Job is still in $STATE"
sleep 15s
;;
*)
sleep 30s
echo "Exiting with SLURM job status '${STATE}'"
exit 0
;;
esac
done
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 1.1249785294117647}
{"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846}
{"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091}
{"lm loss": {"start_step": 0, "end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707}