Commit a0368ddf authored by Raul Puri

eval+numeric update

parent abe36e2e
@@ -34,10 +34,7 @@ from model import DistributedDataParallel as DDP
import mpu
from apex.optimizers import FusedAdam as Adam
from utils import Timers
from utils import save_checkpoint
from utils import save_checkpoint_model_parallel
from utils import load_checkpoint
from utils import load_checkpoint_model_parallel
from utils import report_memory
from utils import print_params_min_max_norm
from utils import print_rank_0
@@ -84,7 +84,7 @@ def setup_model(args):
model = get_model(args)
if args.load is not None:
_ = load_checkpoint_model_parallel(
_ = load_checkpoint(
model, None, None, args)
return model
......
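The hunks above fold the *_model_parallel checkpoint helpers into the plain save_checkpoint/load_checkpoint entry points, so callers such as setup_model use a single loader regardless of model parallelism. Below is a minimal sketch of what a unified loader could look like; the directory layout, the get_checkpoint_name helper, the args.iteration attribute, and the state-dict keys are illustrative assumptions, not the repository's actual implementation.

import os
import torch


def get_checkpoint_name(load_dir, iteration, mp_rank=0):
    # Hypothetical layout: one sub-directory per iteration, one file per
    # model-parallel rank.
    return os.path.join(load_dir, 'iter_{:07d}'.format(iteration),
                        'mp_rank_{:02d}'.format(mp_rank), 'model_optim.pt')


def load_checkpoint(model, optimizer, lr_scheduler, args):
    """Load model (and optionally optimizer/scheduler) state; return the iteration."""
    name = get_checkpoint_name(args.load, args.iteration)
    state = torch.load(name, map_location='cpu')
    model.load_state_dict(state['model'])
    if optimizer is not None and 'optimizer' in state:
        optimizer.load_state_dict(state['optimizer'])
    if lr_scheduler is not None and 'lr_scheduler' in state:
        lr_scheduler.load_state_dict(state['lr_scheduler'])
    return state.get('iteration', 0)

Passing None for the optimizer and scheduler, as setup_model does for evaluation, simply skips restoring those states.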
@@ -60,6 +60,17 @@ def make_gpt2_dataloaders(args):
valid = make_data_loader_(args.val_data_path)
test = make_data_loader_(args.test_data_path)
args.do_train = False
args.do_valid = False
args.do_test = False
if train is not None:
args.do_train = True
if valid is not None:
args.do_valid = True
if test is not None:
args.do_test = True
# Tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
eod_token = tokenizer.encoder['<|endoftext|>']
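The dataloader hunk records which splits were actually built, so later code can skip the training, validation, or test pass when a data path was not supplied. A small, self-contained restatement of that pattern (the Namespace and toy loaders are only stand-ins for the real args and DataLoader objects):

import argparse


def set_split_flags(args, train, valid, test):
    # Record which splits exist so the training/eval driver can skip the
    # missing ones; mirrors the do_train/do_valid/do_test flags added above.
    args.do_train = train is not None
    args.do_valid = valid is not None
    args.do_test = test is not None
    return args


args = argparse.Namespace()
set_split_flags(args, train=[0, 1, 2], valid=None, test=[3])
assert args.do_train and not args.do_valid and args.do_test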
@@ -126,7 +137,8 @@ class GPT2Dataset(Dataset):
def build_dataset_(self, shard_index):
# Garbage collect so we don't use a lot of memory.
# Leave the last one in case other threads have not caught up yet.
for i in range(shard_index - 1):
#for i in range(shard_index - 1):
for i in range(shard_index):
self.shards_data[i] = None
self.shards_sample_index[i] = None
# Read the shard.
......
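In build_dataset_, the garbage-collection loop now frees every shard before the current one rather than also keeping the immediately preceding shard resident. A toy version of that bookkeeping, with ShardCache standing in for the real GPT2Dataset:

class ShardCache:
    """Toy stand-in for the shard bookkeeping in GPT2Dataset."""

    def __init__(self, num_shards):
        self.shards_data = [None] * num_shards
        self.shards_sample_index = [None] * num_shards

    def build_shard(self, shard_index, data, sample_index):
        # Free all earlier shards so resident memory stays roughly one
        # shard wide, then install the newly read shard.
        for i in range(shard_index):
            self.shards_data[i] = None
            self.shards_sample_index[i] = None
        self.shards_data[shard_index] = data
        self.shards_sample_index[shard_index] = sample_index


cache = ShardCache(num_shards=4)
cache.build_shard(0, data=[10, 11], sample_index=[0, 1])
cache.build_shard(2, data=[30, 31], sample_index=[0, 1])
assert cache.shards_data[0] is None       # earlier shards freed
assert cache.shards_data[2] == [30, 31]   # current shard retained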
@@ -480,10 +480,9 @@ class BertParallelSelfAttention(torch.nn.Module):
value_layer = self._transpose_for_scores(mixed_value_layer)
# Raw attention scores. [b, np, s, s]
attention_scores = torch.matmul(query_layer,
key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(
self.hidden_size_per_attention_head)
norm_factor = math.sqrt(math.sqrt(self.hidden_size_per_attention_head))
attention_scores = torch.matmul(query_layer/norm_factor,
key_layer.transpose(-1, -2)/norm_factor)
# Apply the attention mask.
attention_scores += attention_mask
......
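The BertParallelSelfAttention hunk replaces the post-matmul division by sqrt(d_head) with a division of both query and key by d_head ** 0.25 before the matmul. The result is algebraically the same, but the intermediate query-key products never appear unscaled, which helps avoid overflow when the matmul runs in half precision. A quick numerical check of the equivalence (tensor shapes are arbitrary illustration values):

import math
import torch

b, nh, s, d = 2, 4, 8, 64                     # batch, heads, sequence, head dim
query = torch.randn(b, nh, s, d)
key = torch.randn(b, nh, s, d)

# Old formulation: scale after the matmul.
scores_old = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d)

# New formulation: split the 1/sqrt(d) factor across both operands so the
# pre-softmax logits are never formed at full magnitude.
norm_factor = math.sqrt(math.sqrt(d))
scores_new = torch.matmul(query / norm_factor,
                          key.transpose(-1, -2) / norm_factor)

# Identical up to floating-point rounding.
assert torch.allclose(scores_old, scores_new, atol=1e-5)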
@@ -221,7 +221,6 @@ def forward_step(data_iterator, model, args, timers):
losses = mpu.vocab_parallel_cross_entropy(
output.contiguous().float(), lm_labels.contiguous())
loss_mask = loss_mask.contiguous()
loss_mask = loss_mask.view(-1)
lm_loss = torch.sum(
losses.view(-1) * loss_mask.view(-1).float()) / loss_mask.sum()
......
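The forward_step hunk keeps the language-model loss as a masked mean: per-token losses are weighted by a 0/1 mask and normalized by the number of unmasked tokens rather than by the full batch-by-sequence element count. A self-contained restatement with concrete numbers (plain tensors stand in for the output of mpu.vocab_parallel_cross_entropy):

import torch

# Per-token LM losses [b, s] and a 0/1 mask selecting the real tokens.
losses = torch.tensor([[2.0, 1.0, 4.0],
                       [3.0, 5.0, 6.0]])
loss_mask = torch.tensor([[1.0, 1.0, 0.0],
                          [1.0, 0.0, 0.0]])

# Masked mean: only unmasked positions contribute, and the normalizer is
# the number of unmasked tokens, not b * s.
lm_loss = torch.sum(losses.view(-1) * loss_mask.view(-1).float()) / loss_mask.sum()
assert abs(lm_loss.item() - (2.0 + 1.0 + 3.0) / 3) < 1e-6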