Commit 0024a5c6 authored by zhuwenwen

Merge branch 'main' of https://github.com/NVIDIA/Megatron-LM

parents b004456b 3db2063b
# Copyright (c) 2020 The MMSegmentation Authors.
#
# This source code is licensed under the Apache license found in the
# LICENSE file in the root directory of this source tree.
import random
import os
import math
import mmcv
import torch
import numpy as np
import torchvision.transforms as T
from torchvision import datasets
from torch.utils.data import Dataset
from megatron import print_rank_0
from megatron import get_args
from PIL import Image, ImageOps, ImageEnhance
import torchvision.transforms as torch_tr
def _is_pil_image(img):
return isinstance(img, Image.Image)
class PhotoMetricDistortion(object):
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def convert(self, img, alpha=1, beta=0):
"""Multiple with alpha and add beat with clip."""
img = img.astype(np.float32) * alpha + beta
img = np.clip(img, 0, 255)
return img.astype(np.uint8)
def brightness(self, img):
"""Brightness distortion."""
if random.randint(0, 1):
return self.convert(
img,
beta=random.uniform(-self.brightness_delta,
self.brightness_delta))
return img
def contrast(self, img):
"""Contrast distortion."""
if random.randint(0, 1):
return self.convert(
img,
alpha=random.uniform(self.contrast_lower, self.contrast_upper))
return img
def saturation(self, img):
"""Saturation distortion."""
if random.randint(0, 1):
img = mmcv.bgr2hsv(img)
img[:, :, 1] = self.convert(
img[:, :, 1],
alpha=random.uniform(self.saturation_lower,
self.saturation_upper))
img = mmcv.hsv2bgr(img)
return img
def hue(self, img):
"""Hue distortion."""
if random.randint(0, 1):
img = mmcv.bgr2hsv(img)
img[:, :,
0] = (img[:, :, 0].astype(int) +
random.randint(-self.hue_delta, self.hue_delta)) % 180
img = mmcv.hsv2bgr(img)
return img
    def __call__(self, img):
        """Apply the photometric distortions to an image.
        Args:
            img (PIL Image): Image to distort.
        Returns:
            PIL Image: Distorted image, converted to RGB.
        """
img = np.array(img)
# random brightness
img = self.brightness(img)
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(0, 1)
if mode == 1:
img = self.contrast(img)
# random saturation
img = self.saturation(img)
# random hue
img = self.hue(img)
# random contrast
if mode == 0:
img = self.contrast(img)
img = Image.fromarray(img.astype(np.uint8)).convert('RGB')
return img
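# Usage sketch (illustrative only; the file name is a placeholder):
# >>> distort = PhotoMetricDistortion()
# >>> out = distort(Image.open('example.jpg').convert('RGB'))
# 'out' is a PIL Image of the same size whose brightness, contrast, saturation
# and hue have each been perturbed with probability 0.5.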
class RandomCrop(object):
    """
    Take a random crop from the image.
    If the incoming image is smaller than the crop size, it is first padded up
    to the crop size; the mask is padded with ignore_index. A random crop is
    then taken such that the crop fits within the (possibly padded) image.
    When cat_max_ratio < 1, the crop is re-drawn up to 10 times to avoid crops
    in which a single category occupies more than that fraction of the mask.
    """
def __init__(self, crop_size):
args = get_args()
self.size = crop_size
self.cat_max_ratio = 0.75
self.ignore_index = args.ignore_index
self.pad_color = (0, 0, 0)
def get_crop_bbox(self, img):
"""Randomly get a crop bounding box."""
img_w, img_h = img.size
target_h, target_w = self.size #[H W]
margin_h = max(img_h - target_h, 0)
margin_w = max(img_w - target_w, 0)
offset_h = random.randint(0, margin_h)
offset_w = random.randint(0, margin_w)
crop_y1, crop_y2 = offset_h, offset_h + target_h
crop_x1, crop_x2 = offset_w, offset_w + target_w
return crop_y1, crop_y2, crop_x1, crop_x2
def crop(self, img, crop_bbox):
"""Crop from ``img``"""
crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox
img = img.crop((crop_x1, crop_y1, crop_x2, crop_y2))
return img
@staticmethod
def crop_in_image(target_w, target_h, w, h, img, mask):
if w == target_w:
x1 = 0
else:
x1 = random.randint(0, w - target_w)
if h == target_h:
y1 = 0
else:
y1 = random.randint(0, h - target_h)
return [img.crop((x1, y1, x1 + target_w, y1 + target_h)),
mask.crop((x1, y1, x1 + target_w, y1 + target_h))]
def __call__(self, img, mask):
w, h = img.size
target_h, target_w = self.size # ASSUME H, W
if w == target_w and h == target_h:
return img, mask
# Pad image if image < crop
if target_h > h:
pad_h = (target_h - h) // 2 + 1
else:
pad_h = 0
if target_w > w:
pad_w = (target_w - w) // 2 + 1
else:
pad_w = 0
border = (pad_w, pad_h, pad_w, pad_h)
if pad_h or pad_w:
img = ImageOps.expand(img, border=border, fill=(0, 0, 0))
mask = ImageOps.expand(mask, border=border, fill=self.ignore_index)
w, h = img.size
crop_bbox = self.get_crop_bbox(img)
if self.cat_max_ratio < 1.:
# Repeat 10 times
for _ in range(10):
seg_temp = self.crop(mask, crop_bbox)
labels, cnt = np.unique(seg_temp, return_counts=True)
cnt = cnt[labels != self.ignore_index]
if len(cnt) > 1 and np.max(cnt) / np.sum(
cnt) < self.cat_max_ratio:
break
crop_bbox = self.get_crop_bbox(img)
# crop the image
img = self.crop(img, crop_bbox)
# crop semantic seg
mask = self.crop(mask, crop_bbox)
assert(img.size[0] == self.size[1] and img.size[1] == self.size[0])
return img, mask
class RandomSizeAndCrop(object):
def __init__(self,
crop_size,
scale_min=0.5,
scale_max=2.0):
self.crop = RandomCrop(crop_size)
self.scale_min = scale_min
self.scale_max = scale_max
def __call__(self, img, mask):
scale_amt = random.uniform(self.scale_min, self.scale_max)
w, h = [int(i * scale_amt) for i in img.size]
resized_img = img.resize((w, h), Image.BICUBIC)
resized_mask = mask.resize((w, h), Image.NEAREST)
img, mask = self.crop(resized_img, resized_mask)
return img, mask
class RandomHorizontallyFlip(object):
def __call__(self, img, mask):
if random.random() < 0.5:
return img.transpose(Image.FLIP_LEFT_RIGHT), mask.transpose(
Image.FLIP_LEFT_RIGHT)
return img, mask
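# Composition sketch (hypothetical crop size; assumes Megatron args are
# initialized, since RandomCrop reads ignore_index via get_args()):
# >>> scale_crop = RandomSizeAndCrop(crop_size=(512, 512))
# >>> flip = RandomHorizontallyFlip()
# >>> img, mask = scale_crop(img, mask)  # random scale in [0.5, 2.0], then crop
# >>> img, mask = flip(img, mask)        # horizontal flip with probability 0.5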
def adjust_brightness(img, brightness_factor):
"""Adjust brightness of an Image.
Args:
img (PIL Image): PIL Image to be adjusted.
brightness_factor (float): How much to adjust the brightness. Can be
any non negative number. 0 gives a black image, 1 gives the
original image while 2 increases the brightness by a factor of 2.
Returns:
PIL Image: Brightness adjusted image.
"""
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
enhancer = ImageEnhance.Brightness(img)
img = enhancer.enhance(brightness_factor)
return img
def adjust_contrast(img, contrast_factor):
"""Adjust contrast of an Image.
Args:
img (PIL Image): PIL Image to be adjusted.
contrast_factor (float): How much to adjust the contrast. Can be any
non negative number. 0 gives a solid gray image, 1 gives the
original image while 2 increases the contrast by a factor of 2.
Returns:
PIL Image: Contrast adjusted image.
"""
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
enhancer = ImageEnhance.Contrast(img)
img = enhancer.enhance(contrast_factor)
return img
def adjust_saturation(img, saturation_factor):
"""Adjust color saturation of an image.
Args:
img (PIL Image): PIL Image to be adjusted.
saturation_factor (float): How much to adjust the saturation. 0 will
give a black and white image, 1 will give the original image while
2 will enhance the saturation by a factor of 2.
Returns:
PIL Image: Saturation adjusted image.
"""
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
enhancer = ImageEnhance.Color(img)
img = enhancer.enhance(saturation_factor)
return img
def adjust_hue(img, hue_factor):
"""Adjust hue of an image.
The image hue is adjusted by converting the image to HSV and
cyclically shifting the intensities in the hue channel (H).
The image is then converted back to original image mode.
`hue_factor` is the amount of shift in H channel and must be in the
interval `[-0.5, 0.5]`.
See https://en.wikipedia.org/wiki/Hue for more details on Hue.
Args:
img (PIL Image): PIL Image to be adjusted.
hue_factor (float): How much to shift the hue channel. Should be in
[-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
HSV space in positive and negative direction respectively.
0 means no shift. Therefore, both -0.5 and 0.5 will give an image
with complementary colors while 0 gives the original image.
Returns:
PIL Image: Hue adjusted image.
"""
if not(-0.5 <= hue_factor <= 0.5):
        raise ValueError('hue_factor ({}) is not in [-0.5, 0.5].'.format(hue_factor))
if not _is_pil_image(img):
raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
input_mode = img.mode
if input_mode in {'L', '1', 'I', 'F'}:
return img
h, s, v = img.convert('HSV').split()
np_h = np.array(h, dtype=np.uint8)
    # uint8 addition takes care of wrapping around the hue boundary
with np.errstate(over='ignore'):
np_h += np.uint8(hue_factor * 255)
h = Image.fromarray(np_h, 'L')
img = Image.merge('HSV', (h, s, v)).convert(input_mode)
return img
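# Worked example of the wrap-around above (uint8 arithmetic is modulo 256, so
# a hue value near the top of the range rotates back through zero):
# >>> with np.errstate(over='ignore'):
# ...     int(np.uint8(250) + np.uint8(20))
# 14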
class ColorJitter(object):
"""Randomly change the brightness, contrast and saturation of an image.
Args:
brightness (float): How much to jitter brightness. brightness_factor
is chosen uniformly from [max(0, 1 - brightness), 1 + brightness].
contrast (float): How much to jitter contrast. contrast_factor
is chosen uniformly from [max(0, 1 - contrast), 1 + contrast].
saturation (float): How much to jitter saturation. saturation_factor
is chosen uniformly from [max(0, 1 - saturation), 1 + saturation].
        hue (float): How much to jitter hue. hue_factor is chosen uniformly
            from [-hue, hue]. Should be >= 0 and <= 0.5.
"""
def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
self.brightness = brightness
self.contrast = contrast
self.saturation = saturation
self.hue = hue
@staticmethod
def get_params(brightness, contrast, saturation, hue):
"""Get a randomized transform to be applied on image.
Arguments are same as that of __init__.
Returns:
Transform which randomly adjusts brightness, contrast and
saturation in a random order.
"""
transforms = []
if brightness > 0:
brightness_factor = np.random.uniform(max(0, 1 - brightness), 1 + brightness)
transforms.append(
torch_tr.Lambda(lambda img: adjust_brightness(img, brightness_factor)))
if contrast > 0:
contrast_factor = np.random.uniform(max(0, 1 - contrast), 1 + contrast)
transforms.append(
torch_tr.Lambda(lambda img: adjust_contrast(img, contrast_factor)))
if saturation > 0:
saturation_factor = np.random.uniform(max(0, 1 - saturation), 1 + saturation)
transforms.append(
torch_tr.Lambda(lambda img: adjust_saturation(img, saturation_factor)))
if hue > 0:
hue_factor = np.random.uniform(-hue, hue)
transforms.append(
torch_tr.Lambda(lambda img: adjust_hue(img, hue_factor)))
np.random.shuffle(transforms)
transform = torch_tr.Compose(transforms)
return transform
def __call__(self, img):
"""
Args:
img (PIL Image): Input image.
Returns:
PIL Image: Color jittered image.
"""
transform = self.get_params(self.brightness, self.contrast,
self.saturation, self.hue)
return transform(img)
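# Usage sketch: each jitter strength is a fraction of the identity factor 1.0,
# and the enabled sub-transforms run in a random order on every call.
# >>> jitter = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
# >>> out = jitter(img)  # img: PIL Image in; color-jittered PIL Image out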
import math
import torch
import numpy as np
from megatron import get_args
def slidingcrops(img, mask):
# img: [b c h w]
# mask: [b h w]
args = get_args()
assert args.img_h == args.img_w
crop_size = args.img_h
stride = args.seg_stride
ignore_index = args.ignore_index
n, c, h, w = img.shape
assert h >= crop_size
assert w >= crop_size
long_size = max(h, w)
img_slices, mask_slices, slices_info = [], [], []
if long_size > crop_size:
assert stride <= crop_size
h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1
w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1
for yy in range(h_step_num):
for xx in range(w_step_num):
sy, sx = yy * stride, xx * stride
ey, ex = sy + crop_size, sx + crop_size
img_sub = img[:, :, sy: ey, sx: ex]
mask_sub = mask[:, sy: ey, sx: ex]
# padding
sub_h, sub_w = img_sub.shape[2:]
pad_h = max(crop_size - sub_h, 0)
pad_w = max(crop_size - sub_w, 0)
img_sub = torch.nn.functional.pad(img_sub, pad=(0, pad_w, 0, pad_h), value=ignore_index)
mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h))
img_slices.append(img_sub)
mask_slices.append(mask_sub)
slices_info.append([sy, ey, sx, ex, sub_h, sub_w])
return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w)
else:
return img, mask, [[0, h, 0, w, h, w]], (h, w)
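# Worked example of the tiling above (illustrative numbers): a 1024 x 2048
# input with crop_size=768 and stride=512 yields 2 x 4 = 8 overlapping crops.
# >>> h, w, crop_size, stride = 1024, 2048, 768, 512
# >>> (math.ceil((h - crop_size) / stride) + 1) * (math.ceil((w - crop_size) / stride) + 1)
# 8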
def slidingjoins(preds, probs, labels, slices_info, img_size):
args = get_args()
num_slices = len(slices_info)
if num_slices == 1:
return preds, labels
h, w = img_size
split_size = args.micro_batch_size
preds_split = torch.split(preds, split_size)
probs_split = torch.split(probs, split_size)
labels_split = torch.split(labels, split_size)
assert(len(preds_split) == num_slices)
total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda')
total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda')
for i in range(num_slices):
sy, ey, sx, ex, sub_h, sub_w = slices_info[i]
assert sy + sub_h <= h
assert sx + sub_w <= w
curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w]
curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w]
local_max_probs = probs_split[i][:, :sub_h, : sub_w]
local_preds = preds_split[i][:, :sub_h, :sub_w]
result_max_probs = torch.maximum(curr_max_probs, local_max_probs)
result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds)
total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs
total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds
total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w]
return total_preds, total_labels
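# Minimal sketch of the merge rule above (hypothetical per-pixel tensors): each
# pixel keeps the prediction whose probability is highest across the tiles
# that cover it.
# >>> cur_p, cur_y = torch.tensor([0.9, 0.2]), torch.tensor([1, 1])
# >>> new_p, new_y = torch.tensor([0.5, 0.7]), torch.tensor([2, 2])
# >>> torch.where(cur_p >= new_p, cur_y, new_y)
# tensor([1, 2])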
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Zero-shot datasets."""
import json
import math
import numpy as np
import torch
from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from .detokenizer import get_detokenizer
def build_dataset(task):
"""Helper function to select and build dataset."""
if task == 'LAMBADA':
return _build_lambada_dataset()
if task == 'WIKITEXT103':
return _build_wikitext103_dataset()
raise NotImplementedError('dataset for {} task is not '
'implemented.'.format(task))
class _LMDataset(torch.utils.data.Dataset):
def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
num_tokenized_tokens, overalapping_eval=None):
self.tokens = tokens
self.seq_len = seq_len
self.pad_idx = pad_idx
self.overalapping_eval = overalapping_eval
if self.overalapping_eval is None:
self.overalapping_eval = self.seq_len
self.overalapping_eval = max(1, self.overalapping_eval)
self.num_original_tokens = num_original_tokens
self.num_tokenized_tokens = num_tokenized_tokens
self.total_targets = len(self.tokens) - 1
# remove first sequence tokens
targets = max(self.total_targets - self.overalapping_eval, 0)
self.total_sequences = max(
math.ceil(targets / self.overalapping_eval) + 1, 1)
def __len__(self):
return self.total_sequences
def __getitem__(self, idx):
start_idx = idx * self.overalapping_eval
end_idx = start_idx + self.seq_len
tokens = self.tokens[start_idx:end_idx + 1]
num_tokens = len(tokens)
pad_mask = [1] * num_tokens
if num_tokens < self.seq_len + 1:
num_pad = (self.seq_len + 1 - num_tokens)
pad_mask += [0] * (num_pad)
tokens += [self.pad_idx] * num_pad
pad_mask = np.array(pad_mask[1:])
if self.overalapping_eval != self.seq_len and idx != 0:
pad_mask[:-self.overalapping_eval] *= 0
return {'text': np.array(tokens), 'pad_mask': pad_mask}
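# Illustrative windowing (hypothetical sizes): with seq_len=4 and
# overalapping_eval=2, consecutive samples read overlapping token windows, and
# the pad_mask zeroing above ensures each target position is scored once.
# >>> seq_len, overlap = 4, 2
# >>> [(idx * overlap, idx * overlap + seq_len + 1) for idx in range(3)]
# [(0, 5), (2, 7), (4, 9)]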
class _LambadaDataset(torch.utils.data.Dataset):
def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
print_rank_0('> building lambada dataset from {} ...'.format(path))
self.seq_len = seq_len
self.pad_idx = pad_idx
self.tokenizer = tokenizer
self.strict = strict
self.tokens = []
self.labels = []
with open(path, 'r') as f:
for line in f.readlines():
text = json.loads(line)['text']
tokens, labels = self.get_tokens(text)
self.tokens.append(tokens)
self.labels.append(labels)
def get_tokens(self, text):
if not self.strict:
tokens = self.tokenizer.tokenize(text)
return tokens[:-1], [tokens[-1]]
last_token = text.split()[-1]
start_idx = text.rfind(last_token)
beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip())
last_token = self.tokenizer.tokenize(' ' + last_token)
return beginning_tokens, last_token
def __len__(self):
return len(self.tokens)
def __getitem__(self, idx):
tokens = self.tokens[idx]
num_tokens = len(tokens)
pad_mask = [0] * num_tokens
labels = self.labels[idx]
pad_mask += [1] * len(labels)
tokens = tokens + labels
num_tokens = len(tokens)
if num_tokens < self.seq_len + 1:
num_pad = (self.seq_len + 1 - num_tokens)
pad_mask += [0] * (num_pad)
tokens += [self.pad_idx] * num_pad
pad_mask = np.array(pad_mask[1:])
return {'text': np.array(tokens), 'pad_mask': pad_mask}
def _build_lambada_dataset():
"""Build lambada dataset."""
args = get_args()
tokenizer = get_tokenizer()
assert len(args.valid_data) == 1
val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer,
args.seq_length, args.strict_lambada)
print_rank_0(' > found {} samples.'.format(len(val_dataset)))
return val_dataset
def _build_wikitext103_dataset():
""""""
args = get_args()
tokenizer = get_tokenizer()
assert len(args.valid_data) == 1
with open(args.valid_data[0], "rb") as reader:
entire_data = reader.read().decode('utf-8')
num_original_tokens = len(entire_data.strip().split(" "))
entire_data = get_detokenizer(args.valid_data[0])(entire_data)
tokenized_data = tokenizer.tokenize(entire_data)
num_tokenized_tokens = len(tokenized_data)
val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
num_original_tokens, num_tokenized_tokens,
args.overlapping_eval)
    print_rank_0(' > number of original tokens: {}, number of tokenized '
                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))
return val_dataset
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Detokenization."""
import re
def ptb_detokenizer(string):
string = string.replace(" '", "'")
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" n't", "n't")
string = string.replace(" N ", "1 ")
string = string.replace("$ 1", "$1")
string = string.replace("# 1", "#1")
return string
def wikitext_detokenizer(string):
# contractions
string = string.replace("s '", "s'")
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
# number separators
string = string.replace(" @-@ ", "-")
string = string.replace(" @,@ ", ",")
string = string.replace(" @.@ ", ".")
# punctuation
string = string.replace(" : ", ": ")
string = string.replace(" ; ", "; ")
string = string.replace(" . ", ". ")
string = string.replace(" ! ", "! ")
string = string.replace(" ? ", "? ")
string = string.replace(" , ", ", ")
# double brackets
string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
# miscellaneous
string = string.replace("= = = =", "====")
string = string.replace("= = =", "===")
string = string.replace("= =", "==")
string = string.replace(" " + chr(176) + " ", chr(176))
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" N ", " 1 ")
string = string.replace(" 's", "'s")
return string
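# Example of the rules above on a made-up WikiText-style fragment:
# >>> wikitext_detokenizer("the 5 @,@ 000 @-@ strong crowd , cheering")
# 'the 5,000-strong crowd, cheering'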
def lambada_detokenizer(string):
return string
_DETOKENIZERS = {
'ptb': ptb_detokenizer,
'wiki': wikitext_detokenizer,
'lambada': lambada_detokenizer,
}
def get_detokenizer(path):
for key in _DETOKENIZERS.keys():
if key in path:
return _DETOKENIZERS[key]
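# Usage sketch (hypothetical path): the detokenizer is selected by substring
# match on the data path, so a WikiText validation file picks
# wikitext_detokenizer; a path matching no key falls through and returns None.
# >>> get_detokenizer('/data/wikitext-103/wiki.valid.tokens').__name__
# 'wikitext_detokenizer'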
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""GPT zero-shot evaluation."""
import math
import torch
from megatron import get_args
from megatron import print_rank_0, is_last_rank
from megatron import get_tokenizer
from megatron.core import parallel_state, tensor_parallel
from megatron.checkpointing import load_checkpoint
from megatron.model import GPTModel
from megatron.training import get_model
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
from tasks.finetune_utils import build_data_loader
from .datasets import build_dataset
# These are needed to unwrap the model, would be nice to put these in megatron.utils if possible?
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
def get_model_provider(eval_metric):
"""Based on evaluation metric set the parallel-output flag and
return the model provider."""
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
if eval_metric == 'loss':
parallel_output = True
elif eval_metric == 'accuracy':
parallel_output = False
else:
raise NotImplementedError('output type for {} evaluation metric '
'is not supported.'.format(eval_metric))
print_rank_0('building GPT model ...')
model = GPTModel(num_tokentypes=0, parallel_output=parallel_output,
pre_process=pre_process, post_process=post_process)
return model
return model_provider
def process_batch(batch):
"""Process batch and produce inputs for the model."""
args = get_args()
tokenizer = get_tokenizer()
loss_mask = batch['pad_mask'].long().cuda().contiguous().byte()
tokens_ = batch['text'].long().cuda().contiguous()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
    # Get the masks and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
"""Forward step."""
# Get the batch.
tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
batch)
# Tell the model what our actual batch size will be
args = get_args()
args.micro_batch_size = len(labels)
input_tensor = recv_forward()
# Forward pass through the model.
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
unwrapped_model.set_input_tensor(input_tensor)
output = model(tokens, position_ids, attention_mask)
send_forward(output)
if parallel_state.is_pipeline_last_stage():
# For loss, return the unreduced loss.
if eval_metric == 'loss':
losses = tensor_parallel.vocab_parallel_cross_entropy(
output.contiguous().float(), labels.contiguous())
loss = torch.sum(
losses.view(-1) * loss_mask.contiguous().view(-1).float())
return loss
# For accuracy, return the number of correctly predicted samples.
if eval_metric == 'accuracy':
outputs = torch.argmax(output, -1)
correct = (outputs == labels).float()
correct[(1 - loss_mask).bool()] = 1
correct = correct.prod(-1)
return correct.sum()
raise NotImplementedError('forward method for evaluation metric {} '
'is not implemented.'.format(eval_metric))
return None
def evaluate(data_loader, model, eval_metric):
"""Evaluation."""
args = get_args()
# Turn on evaluation mode which disables dropout.
model.eval()
total_output = 0.0
with torch.no_grad():
# For all the batches in the dataset.
for iteration, batch in enumerate(data_loader):
if iteration % args.log_interval == 0:
print_rank_0('> working on iteration: {}'.format(iteration))
# Forward evaluation.
output = forward_step(batch, model, eval_metric)
# Reduce across processes.
if parallel_state.is_pipeline_last_stage():
torch.distributed.all_reduce(output,
group=parallel_state.get_data_parallel_group())
total_output += output
return total_output
def evaluate_and_print_results(task, data_loader, model, eval_metric):
"""Evaluate and print results on screen."""
# Evaluate and get results.
output = evaluate(data_loader, model, eval_metric)
string = ' validation results on {} | '.format(task)
if is_last_rank():
if eval_metric == 'loss':
num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
num_original_tokens = data_loader.dataset.num_original_tokens
val_loss = output / (num_tokenized_tokens - 1)
ppl = math.exp(min(20, val_loss))
token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
string += 'avg loss: {:.4E} | '.format(val_loss)
string += 'ppl: {:.4E} | '.format(ppl)
string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
string += 'token ratio: {} |'.format(token_ratio)
elif eval_metric == 'accuracy':
num_examples = len(data_loader.dataset)
acc = output / num_examples
string += 'number correct: {:.4E} | '.format(output)
string += 'total examples: {:.4E} | '.format(num_examples)
string += 'avg accuracy: {:.4E}'.format(acc)
else:
raise NotImplementedError('evaluation method for {} metric is not '
'implemented yet.'.format(eval_metric))
length = len(string) + 1
print('-' * length)
print(string)
print('-' * length)
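# Worked example of the perplexity arithmetic above (made-up numbers): the
# adjusted perplexity rescales the loss by the tokenized-to-original token
# ratio so that results are comparable across tokenizers.
# >>> val_loss, token_ratio = 3.0, 1.1
# >>> round(math.exp(min(20, val_loss)), 3)
# 20.086
# >>> round(math.exp(min(20, val_loss * token_ratio)), 3)
# 27.113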
def main():
"""Main program."""
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
if args.task == 'LAMBADA':
eval_metric = 'accuracy'
elif args.task == 'WIKITEXT103':
eval_metric = 'loss'
else:
raise NotImplementedError('{} task is not implemented.'.format(
args.task))
# Set up model and load checkpoint.
model = get_model(get_model_provider(eval_metric), wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
assert len(model) == 1, "Above condition should have caught this"
model = model[0]
# Data stuff.
dataset = build_dataset(args.task)
dataloader = build_data_loader(dataset, args.micro_batch_size,
args.num_workers, drop_last=False)
# Run evaluation.
evaluate_and_print_results(args.task, dataloader, model, eval_metric)
print_rank_0('done :-)')
import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator
def read_tb_logs_as_list(path, summary_name):
"""Reads a TensorBoard Events file from the input path, and returns the
summary specified as input as a list.
Arguments:
path: str, path to the dir where the events file is located.
summary_name: str, name of the summary to read from the TB logs.
Output:
        summary_list: list of the scalar values recorded for the requested summary.
"""
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)
if files:
event_file = files[0]
ea = event_accumulator.EventAccumulator(event_file)
ea.Reload()
summary = ea.Scalars(summary_name)
summary_list = [round(x.value, 5) for x in summary]
print(f'\nObtained the following list for {summary_name} ------------------')
print(summary_list)
return summary_list
raise FileNotFoundError(f"File not found matching: {path}/events*")
def collect_train_test_metrics(logs_dir, run_name):
# TODO: Fetch current baseline
# train loss
train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss")
# num zeros
num_zeros = read_tb_logs_as_list(logs_dir, "num-zeros")
iteration_time = read_tb_logs_as_list(logs_dir, "iteration-time")
    # The first few iterations take longer, so average only the last two
    # thirds of the timings.
    idx = len(iteration_time)//3
iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
train_metrics = {
"lm loss": {
"start_step": 0,
"end_step": len(train_loss_list),
"step_interval": 5,
"values": train_loss_list[0:len(train_loss_list):5],
},
"num-zeros": {
"start_step": 0,
"end_step": len(num_zeros),
"step_interval": 5,
"values": num_zeros[0:len(num_zeros):5],
},
"iteration_timing_avg": iteration_time_avg,
}
str_train_metrics = str(train_metrics).replace("'", "\"")
print(f"\n ----------- Store the following metrics in {run_name}.json ----------")
print(f"\n {str_train_metrics}", flush=True)
if __name__ == '__main__':
args = sys.argv[1:]
logs_dir = args[0] # eg /lustre/fsw/joc/shanmugamr/megatron/logs/
run_name = args[1]
collect_train_test_metrics(logs_dir, run_name)
import os
import json
import pytest
import sys
import glob
from tensorboard.backend.event_processing import event_accumulator
LOGS_DIR = os.getenv('LOGS_DIR')
EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE')
import enum
class TypeOfTest(enum.Enum):
APPROX = 1
DETERMINISTIC = 2
def read_tb_logs_as_list(path, summary_name):
"""Reads a TensorBoard Events file from the input path, and returns the
summary specified as input as a list.
Arguments:
path: str, path to the dir where the events file is located.
summary_name: str, name of the summary to read from the TB logs.
Output:
        summary_list: list of the scalar values recorded for the requested summary.
"""
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)
if files:
event_file = files[0]
ea = event_accumulator.EventAccumulator(event_file)
ea.Reload()
summary = ea.Scalars(summary_name)
summary_list = [round(x.value, 5) for x in summary]
print(f'\nObtained the following list for {summary_name} ------------------')
print(summary_list)
return summary_list
raise FileNotFoundError(f"File not found matching: {path}/events*")
# If we require a variation of tests for any of the other pipelines we can just inherit this class.
class TestCIPipeline:
margin_loss, margin_time = 0.05, 0.1
expected = None
    if EXPECTED_METRICS_FILE and os.path.exists(EXPECTED_METRICS_FILE):
with open(EXPECTED_METRICS_FILE) as f:
expected = json.load(f)
def _test_helper(self, loss_type, test_type):
        if self.expected is None:
            raise FileNotFoundError("Expected metrics could not be loaded from EXPECTED_METRICS_FILE.")
expected = self.expected[loss_type]
expected_list = expected["values"]
print(expected_list)
actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
print(f"Checking step {step} against expected {i}")
if test_type == TypeOfTest.APPROX:
                assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
else:
assert actual_list[step] == expected_list[i], f"The value at step {step} should be {expected_list[i]} but it is {actual_list[step]}."
@pytest.mark.xfail
def test_lm_loss_deterministic(self):
# Expected training loss curve at different global steps.
self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)
def test_lm_loss_approx(self):
# Expected training loss curve at different global steps.
self._test_helper("lm loss", TypeOfTest.APPROX)
def test_num_zeros_deterministic(self):
        # Expected num-zeros curve at different global steps.
self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC)
def iteration_timing_node(self):
expected_iteration_timing_avg = self.expected["train_step_timing_avg"]
iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time")
idx = len(iteration_time)//3
iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}."
import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator
LOGS_DIR = os.getenv('LOGS_DIR')
def read_tb_logs_as_list(path, summary_name, index):
files = glob.glob(f"{path}/events*tfevents*")
files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)
if files:
event_file = files[index]
ea = event_accumulator.EventAccumulator(event_file)
ea.Reload()
summary = ea.Scalars(summary_name)
summary_list = [round(x.value, 5) for x in summary]
print(summary_list)
return summary_list
raise FileNotFoundError(f"File not found matching: {path}/events*")
def collect_train_test_metrics(logs_dir, index):
train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index)
train_loss_list = [round(elem,3) for elem in train_loss_list]
train_metrics = {
"lm loss": train_loss_list[0:len(train_loss_list):5],
}
str_train_metrics = str(train_metrics).replace("'", "\"")
print(f"\n ----------- The following are the metrics for ----------")
print(f"\n {str_train_metrics}", flush=True)
return train_metrics
class TestCIPipeline:
train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)
def _test_helper(self, loss_type):
expected = self.train_metrics_100[loss_type]
print('expected : ' + str(expected))
actual = self.train_metrics_50_to_100[loss_type]
print('actual : ' + str(actual))
        # NOTE: When the gpt3 model is run straight through from step 0 to 100, the
        # expected list comes out one element longer than the resumed (50 to 100) run:
        # expected: [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368, 10.62319, 10.53908, 10.25005, 10.20907, 9.96542, 9.96802, 9.92436, 9.79086, 9.26718, 9.61784, 9.19018, 9.45986, 9.62168, 9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22436, 9.19436, 9.11323, 9.09711, 9.04421, 9.36795]
        # actual:   [9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22435, 9.19435, 9.11322, 9.09711, 9.04422]
        # That extra element would break a positional comparison, so we align the two
        # lists on the first element of 'actual' and compare only the overlapping
        # (50-100) tail. The root cause still needs to be tracked down.
start_idx_expected = expected.index(actual[0]) # First element of actual
# Here we will just be comparing values of actual and second half (50-100) of expected
for i in range(len(actual)):
assert actual[i] == expected[start_idx_expected + i], f"The value at step {i} should be {expected[start_idx_expected + i]} but it is {actual[i]}."
def test_lm_loss_deterministic(self):
self._test_helper("lm loss")
#! /bin/bash
JOBID=$1
echo "Job id : $JOBID"
if [[ -z "$JOBID" ]]; then
exit 1
fi
sleep 10s
while true; do
export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1`
case "${STATE}" in
PENDING|RUNNING|REQUEUED)
echo "Job is still in $STATE"
sleep 15s
;;
*)
sleep 30s
echo "Exiting with SLURM job status '${STATE}'"
exit 0
;;
esac
done
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 1.1249785294117647}
{"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846}
{"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091}
{"lm loss": {"start_step": 0, "end_step": 48, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87978, 10.84463, 10.67266, 10.62932, 10.52767, 10.25362]}, "num-zeros": {"start_step": 0, "end_step": 31, "step_interval": 5, "values": [2450.0, 2396.0, 2523.0, 2242.0, 2225.0, 2478.0, 2536.0]}, "iteration_timing_avg": 0.11416968750000002}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89173, 10.84724, 10.6886, 10.62864, 10.53925, 10.26646]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2306.0, 2412.0, 2032.0, 2077.0, 2475.0, 2347.0]}, "iteration_timing_avg": 0.15481029411764707}