Commit 27dab946 authored by huchen

Merge branch 'GNMT-v2' into 'main'

Updated GNMT v2

See merge request dcutoolkit/deeplearing/dlexamples_new!11
parents 20291e9d 07c30a15
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
BOS_TOKEN = '<s>'
@@ -6,27 +27,5 @@ EOS_TOKEN = '<\s>'
# special PAD, UNKNOWN, BEGIN-OF-STRING, END-OF-STRING tokens
PAD, UNK, BOS, EOS = [0, 1, 2, 3]
# path to the BPE vocabulary file, relative to the data directory, it should
# point to file generated by subword-nmt/get_vocab.py
VOCAB_FNAME = 'vocab.bpe.32000'
# paths to source and target training files, relative to the data directory, it
# should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
SRC_TRAIN_FNAME = 'train.tok.clean.bpe.32000.en'
TGT_TRAIN_FNAME = 'train.tok.clean.bpe.32000.de'
# paths to source and target validation files, relative to the data directory,
# it should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
SRC_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.en'
TGT_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.de'
# path to the test source file, relative to the data directory, it should point
# to BPE-encoded file, generated by subword-nmt/apply_bpe.py
SRC_TEST_FNAME = 'newstest2014.tok.bpe.32000.en'
# path to the test target file, relative to the data directory, it should point
# to plaintext file, tokenization is performed by the sacrebleu package
TGT_TEST_TARGET_FNAME = 'newstest2014.de'
# path to the moses detokenizer, relative to the data directory
DETOKENIZER = 'mosesdecoder/scripts/tokenizer/detokenizer.perl'
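The constants above are only file names; downstream code is expected to join them with a data directory. A minimal sketch (the 'data/wmt16_de_en' location is a hypothetical example, not defined in this diff):
import os

data_dir = 'data/wmt16_de_en'  # hypothetical dataset directory
vocab_path = os.path.join(data_dir, VOCAB_FNAME)
train_src = os.path.join(data_dir, SRC_TRAIN_FNAME)
train_tgt = os.path.join(data_dir, TGT_TRAIN_FNAME)
test_src = os.path.join(data_dir, SRC_TEST_FNAME)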
import time
import os
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from operator import itemgetter
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
@@ -31,17 +49,17 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
:param seq: list of sequences
"""
lengths = [len(s) for s in seq]
lengths = torch.tensor([len(s) for s in seq], dtype=torch.int64)
batch_length = max(lengths)
shape = (batch_length, len(seq))
shape = (len(seq), batch_length)
seq_tensor = torch.full(shape, config.PAD, dtype=torch.int64)
for i, s in enumerate(seq):
end_seq = lengths[i]
seq_tensor[:end_seq, i].copy_(s[:end_seq])
seq_tensor[i, :end_seq].copy_(s[:end_seq])
if batch_first:
if not batch_first:
seq_tensor = seq_tensor.t()
return (seq_tensor, lengths)
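A small self-contained sketch of the padding behaviour after this change, with illustrative sequences (config.PAD is 0): sequences are now written row-wise into a (batch, max_len) tensor and transposed only when batch_first is False.
import torch

PAD = 0  # config.PAD
seqs = [torch.tensor([2, 7, 8, 3]), torch.tensor([2, 5, 3])]
lengths = torch.tensor([len(s) for s in seqs], dtype=torch.int64)
seq_tensor = torch.full((len(seqs), int(lengths.max())), PAD, dtype=torch.int64)
for i, s in enumerate(seqs):
    seq_tensor[i, :len(s)].copy_(s)
# seq_tensor is now [[2, 7, 8, 3], [2, 5, 3, 0]]; seq_tensor.t() restores
# the old time-major (max_len, batch) layout used when batch_first=False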
@@ -62,19 +80,19 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
return tuple([collate_seq(s) for s in [src_seqs, tgt_seqs]])
def single_collate(seqs):
def single_collate(src_seqs):
"""
Builds batches from text dataset, optionally sorts batch by src
sequence length.
:param src_seqs: source sequences
"""
src_seqs, indices = zip(*seqs)
if sort:
idx, src_seqs = zip(*sorted(enumerate(src_seqs),
key=lambda item: len(item[1]),
reverse=True))
indices = [indices[i] for i in idx]
indices, src_seqs = zip(*sorted(enumerate(src_seqs),
key=lambda item: len(item[1]),
reverse=True))
else:
indices = range(len(src_seqs))
return collate_seq(src_seqs), tuple(indices)
@@ -84,6 +102,102 @@ def build_collate_fn(batch_first=False, parallel=True, sort=False):
return single_collate
class SyntheticDataset(Dataset):
def __init__(self, vocab_size, seq_len, nsamples):
self.vocab_size = vocab_size
self.nsamples = nsamples
self.seq_len = seq_len
def __getitem__(self, idx):
rand = torch.randint(0, self.vocab_size, size=(self.seq_len,))
return rand
def unsort(self, array):
return array
def get_loader(self, batch_size=1, num_workers=0, batch_first=False,
pad=False, repeat=1):
collate_fn = build_collate_fn(batch_first, parallel=False,
sort=True)
sampler = StaticDistributedSampler(self, batch_size, pad, repeat)
return DataLoader(self,
batch_size=batch_size,
collate_fn=collate_fn,
sampler=sampler,
num_workers=num_workers,
pin_memory=True,
drop_last=False)
def __len__(self):
return self.nsamples
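A hedged stand-alone sketch of the synthetic loader: it assumes torch.distributed is left uninitialized so StaticDistributedSampler falls back to a single-process world, and picks nsamples as a multiple of the batch size so no padding is needed.
dataset = SyntheticDataset(vocab_size=32000, seq_len=50, nsamples=256)
loader = dataset.get_loader(batch_size=32, batch_first=True)
(src, src_lengths), indices = next(iter(loader))
# src: (32, 50) int64 tensor of random token ids, src_lengths: all 50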
class RawTextDataset(Dataset):
def __init__(self, raw_data=None, raw_datafile=None, tokenizer=None,
sort=False, max_size=None):
self.tokenizer = tokenizer
self.sorted = False
if raw_datafile:
with open(raw_datafile, 'r') as f:
self.raw_data = f.readlines()
else:
self.raw_data = raw_data
if max_size:
self.raw_data = self.raw_data[:max_size]
self.lengths = [len(s.split()) for s in self.raw_data]
if sort:
self.sort_by_length()
def __getitem__(self, idx):
raw = self.raw_data[idx]
tokenized = self.tokenizer.tokenize(raw)
return tokenized
def unsort(self, array):
"""
"Unsorts" given array (restores original order of elements before
dataset was sorted by sequence length).
:param array: array to be "unsorted"
"""
if self.sorted:
inverse = sorted(enumerate(self.indices), key=itemgetter(1))
array = [array[i[0]] for i in inverse]
return array
def sort_by_length(self):
output = sorted(
enumerate(self.raw_data),
key=lambda x: len(x[1].split()),
)
self.indices, self.raw_data = zip(*output)
self.lengths = [self.lengths[idx] for idx in self.indices]
self.sorted = True
def __len__(self):
return len(self.raw_data)
def get_loader(self, batch_size=1, num_workers=0, batch_first=False,
pad=False, repeat=1):
collate_fn = build_collate_fn(batch_first, parallel=False,
sort=True)
sampler = StaticDistributedSampler(self, batch_size, pad, repeat)
return DataLoader(self,
batch_size=batch_size,
collate_fn=collate_fn,
sampler=sampler,
num_workers=num_workers,
pin_memory=True,
drop_last=False)
class TextDataset(Dataset):
def __init__(self, src_fname, tokenizer, min_len=None, max_len=None,
sort=False, max_size=None):
@@ -181,7 +295,7 @@ class TextDataset(Dataset):
return len(self.src)
def __getitem__(self, idx):
return self.src[idx], self.indices[idx]
return self.src[idx]
def get_loader(self, batch_size=1, seeds=None, shuffle=False,
num_workers=0, batch_first=False, pad=False,
@@ -212,13 +326,6 @@ class TextDataset(Dataset):
pin_memory=True,
drop_last=False)
#return DataLoader(self,
# batch_size=batch_size,
# collate_fn=collate_fn,
# sampler=sampler,
# num_workers=num_workers,
# pin_memory=False,
# drop_last=False)
class ParallelDataset(TextDataset):
def __init__(self, src_fname, tgt_fname, tokenizer,
@@ -347,10 +454,12 @@ class LazyParallelDataset(TextDataset):
if None loads the entire dataset
"""
logging.info(f'Processing data from {fname}')
data = []
with open(fname) as dfile:
data = dfile.readlines()
if max_size:
data = data[:max_size]
for idx, line in enumerate(dfile):
if max_size and idx == max_size:
break
data.append(line)
return data
def filter_raw_data(self, min_len, max_len):
@@ -391,100 +500,3 @@ class LazyParallelDataset(TextDataset):
def __len__(self):
return len(self.raw_src)
class PreprocessedDataset(TextDataset):
def __init__(self, min_len, max_len, vocab_size):
self.min_len = min_len
self.max_len = max_len
self.vocab_size = vocab_size
self.parallel = True
def get_data_dtype(self, vocab_size):
if vocab_size <= np.iinfo(np.int16).max:
dtype = np.int16
elif vocab_size <= np.iinfo(np.int32).max:
dtype = np.int32
elif vocab_size <= np.iinfo(np.int64).max:
dtype = np.int64
else:
raise ValueError('Vocabulary size is too large')
return dtype
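As a quick check of the thresholds above: the default 32k BPE vocabulary plus the four special tokens still fits into int16, so the packed file stores two bytes per token.
import numpy as np

assert 32000 + 4 <= np.iinfo(np.int16).max  # 32767 -> dtype is np.int16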
def write_data(self, fname, src, tgt):
src, src_lengths = src
tgt, tgt_lengths = tgt
assert len(src) == len(tgt) == len(src_lengths) == len(tgt_lengths)
length = len(src)
dtype = self.get_data_dtype(self.vocab_size)
data = torch.cat((src, tgt), dim=1).numpy()
offset = 0
with open(fname, 'wb') as f:
offset += f.write((np.array(length, dtype=np.int64)))
offset += f.write((np.array(self.vocab_size, dtype=np.int64)))
offset += f.write((np.array(self.min_len, dtype=np.int64)))
offset += f.write((np.array(self.max_len, dtype=np.int64)))
offset += f.write((np.array(src_lengths, dtype=np.int64)))
offset += f.write((np.array(tgt_lengths, dtype=np.int64)))
offset += np.iinfo(np.int64).dtype.itemsize
f.write((np.array(offset, dtype=np.int64)))
f.write((np.array(data, dtype=dtype)))
def read_data(self, fname, vocab_size):
self.fname = fname
with open(fname, 'rb') as f:
logging.info(f'Reading preprocessed data file from {fname}')
length = int(np.fromfile(f, np.int64, 1))
file_vocab_size = int(np.fromfile(f, np.int64, 1))
file_min_len = int(np.fromfile(f, np.int64, 1))
file_max_len = int(np.fromfile(f, np.int64, 1))
src_lengths = np.fromfile(f, np.int64, length)
tgt_lengths = np.fromfile(f, np.int64, length)
self.offset = int(np.fromfile(f, np.int64, 1))
assert file_max_len == self.max_len
assert file_min_len == self.min_len
assert file_vocab_size == self.vocab_size
logging.info(
f'Preprocessed data: length: {length} '
f'min length: {self.min_len} '
f'max length: {self.max_len} '
)
self.length = length
self.src_lengths = torch.tensor(src_lengths)
self.tgt_lengths = torch.tensor(tgt_lengths)
self.lengths = self.src_lengths + self.tgt_lengths
self.dtype = self.get_data_dtype(vocab_size)
itemsize = np.iinfo(self.dtype).dtype.itemsize
self.item_stride = itemsize * self.max_len * 2
def prepare(self):
logging.info(f'Opening preprocessed data {self.fname} for reading')
self.file = open(self.fname, 'rb')
def finalize(self):
logging.info(f'Closing preprocessed data file')
self.file.close()
def __getitem__(self, idx):
offset = self.offset + self.item_stride * idx
self.file.seek(offset, os.SEEK_SET)
data = np.fromfile(self.file, self.dtype, self.max_len * 2)
data = data.astype(np.int64)
src_len = self.src_lengths[idx]
tgt_len = self.tgt_lengths[idx]
src = torch.tensor(data[0: src_len])
tgt = torch.tensor(data[self.max_len: self.max_len + tgt_len])
return src, tgt
def __len__(self):
return self.length
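A rough sketch of the on-disk layout implied by write_data()/read_data() above, and of the seek arithmetic used by __getitem__ (values are illustrative only):
import numpy as np

length, max_len = 1000, 75                   # illustrative header values
int64_size = np.dtype(np.int64).itemsize     # 8 bytes
# header: [length, vocab_size, min_len, max_len] + src_lengths + tgt_lengths + offset
header_bytes = (4 + 2 * length + 1) * int64_size
item_stride = np.dtype(np.int16).itemsize * max_len * 2   # one packed (src, tgt) row
row_offset = header_bytes + 42 * item_stride               # where __getitem__(42) seeks to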
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
import torch
@@ -226,7 +246,7 @@ class BucketingSampler(DistributedSampler):
class StaticDistributedSampler(Sampler):
def __init__(self, dataset, batch_size, pad, world_size=None, rank=None):
def __init__(self, dataset, batch_size, pad, repeat=1, world_size=None, rank=None):
"""
Constructor for the StaticDistributedSampler.
@@ -247,11 +267,12 @@ class StaticDistributedSampler(Sampler):
global_batch_size = batch_size * world_size
data_len = len(dataset)
num_samples = (data_len + global_batch_size - 1) \
repeated_data_len = int(len(dataset) * repeat)
num_samples = (repeated_data_len + global_batch_size - 1) \
// global_batch_size * global_batch_size
self.num_samples = num_samples
indices = list(range(data_len))
indices = list(range(repeated_data_len))
if pad:
# pad dataset to a multiple of global_batch_size samples, uses
# sample with idx 0 as pad
@@ -267,6 +288,7 @@ class StaticDistributedSampler(Sampler):
indices = indices.view(-1)
# remove temporary pad
indices = indices[indices != -1]
indices = indices % data_len
indices = indices.tolist()
self.indices = indices
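A minimal arithmetic sketch of the new repeat/modulo logic, with illustrative sizes (the per-rank slicing itself lives in the collapsed hunk above):
data_len, global_batch_size, repeat = 10, 8, 2
repeated_data_len = int(data_len * repeat)                     # 20 virtual samples
num_samples = (repeated_data_len + global_batch_size - 1) \
    // global_batch_size * global_batch_size                   # rounded up to 24
virtual = list(range(repeated_data_len))
real = [i % data_len for i in virtual]                         # maps back into [0, 10)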
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
from collections import defaultdict
from functools import partial
import torch
import subword_nmt.apply_bpe
import sacremoses
import seq2seq.data.config as config
@@ -9,37 +33,53 @@ class Tokenizer:
"""
Tokenizer class.
"""
def __init__(self, vocab_fname=None, pad=1, separator='@@'):
def __init__(self, vocab_fname=None, bpe_fname=None, lang=None, pad=1,
separator='@@'):
"""
Constructor for the Tokenizer class.
:param vocab_fname: path to the file with vocabulary
:param bpe_fname: path to the file with bpe codes
:param pad: pads vocabulary to a multiple of 'pad' tokens
:param separator: tokenization separator
"""
self.separator = separator
self.lang = lang
if bpe_fname:
with open(bpe_fname, 'r') as bpe_codes:
self.bpe = subword_nmt.apply_bpe.BPE(bpe_codes)
if vocab_fname:
self.separator = separator
self.build_vocabulary(vocab_fname, pad)
if lang:
self.init_moses(lang)
logging.info(f'Building vocabulary from {vocab_fname}')
vocab = [config.PAD_TOKEN, config.UNK_TOKEN,
config.BOS_TOKEN, config.EOS_TOKEN]
def init_moses(self, lang):
self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])
with open(vocab_fname) as vfile:
for line in vfile:
vocab.append(line.strip())
def build_vocabulary(self, vocab_fname, pad):
logging.info(f'Building vocabulary from {vocab_fname}')
vocab = [config.PAD_TOKEN, config.UNK_TOKEN,
config.BOS_TOKEN, config.EOS_TOKEN]
with open(vocab_fname) as vfile:
for line in vfile:
vocab.append(line.strip())
self.pad_vocabulary(vocab, pad)
self.pad_vocabulary(vocab, pad)
self.vocab_size = len(vocab)
logging.info(f'Size of vocabulary: {self.vocab_size}')
self.vocab_size = len(vocab)
logging.info(f'Size of vocabulary: {self.vocab_size}')
self.tok2idx = defaultdict(partial(int, config.UNK))
for idx, token in enumerate(vocab):
self.tok2idx[token] = idx
self.tok2idx = defaultdict(partial(int, config.UNK))
for idx, token in enumerate(vocab):
self.tok2idx[token] = idx
self.idx2tok = {}
for key, value in self.tok2idx.items():
self.idx2tok[value] = key
self.idx2tok = {}
for key, value in self.tok2idx.items():
self.idx2tok[value] = key
def pad_vocabulary(self, vocab, pad):
"""
@@ -58,8 +98,10 @@ class Tokenizer:
def get_state(self):
logging.info(f'Saving state of the tokenizer')
state = {
'lang': self.lang,
'separator': self.separator,
'vocab_size': self.vocab_size,
'bpe': self.bpe,
'tok2idx': self.tok2idx,
'idx2tok': self.idx2tok,
}
@@ -67,11 +109,15 @@ class Tokenizer:
def set_state(self, state):
logging.info(f'Restoring state of the tokenizer')
self.lang = state['lang']
self.separator = state['separator']
self.vocab_size = state['vocab_size']
self.bpe = state['bpe']
self.tok2idx = state['tok2idx']
self.idx2tok = state['idx2tok']
self.init_moses(self.lang)
def segment(self, line):
"""
Tokenizes single sentence and adds special BOS and EOS tokens.
@@ -85,7 +131,14 @@ class Tokenizer:
entry = [config.BOS] + entry + [config.EOS]
return entry
def detokenize(self, inputs, delim=' '):
def tokenize(self, line):
tokenized = self.moses_tokenizer.tokenize(line, return_str=True)
bpe = self.bpe.process_line(tokenized)
segmented = self.segment(bpe)
tensor = torch.tensor(segmented)
return tensor
def detokenize_bpe(self, inp, delim=' '):
"""
Detokenizes single sentence and removes token separator characters.
@@ -94,7 +147,7 @@ class Tokenizer:
returns: string representing detokenized sentence
"""
detok = delim.join([self.idx2tok[idx] for idx in inputs])
detok = delim.join([self.idx2tok[idx] for idx in inp])
detok = detok.replace(self.separator + ' ', '')
detok = detok.replace(self.separator, '')
@@ -103,3 +156,12 @@ class Tokenizer:
detok = detok.replace(config.PAD_TOKEN, '')
detok = detok.strip()
return detok
def detokenize_moses(self, inp):
output = self.moses_detokenizer.detokenize(inp.split())
return output
def detokenize(self, inp):
detok_bpe = self.detokenize_bpe(inp)
output = self.detokenize_moses(detok_bpe)
return output
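A tiny illustration of the BPE separator stripping performed by detokenize_bpe(): subword pieces are marked with '@@', and joining them removes the marker together with the following space.
detok = 'The sta@@ te of the ar@@ t .'
detok = detok.replace('@@ ', '').replace('@@', '')
# 'The state of the art .'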
import collections
import math
import os
import pathlib
import re
import pynvml
pynvml.nvmlInit()
def systemGetDriverVersion():
return pynvml.nvmlSystemGetDriverVersion()
def deviceGetCount():
return pynvml.nvmlDeviceGetCount()
class device:
# assume nvml returns list of 64 bit ints
_nvml_affinity_elements = math.ceil(os.cpu_count() / 64)
def __init__(self, device_idx):
super().__init__()
self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
def getName(self):
return pynvml.nvmlDeviceGetName(self.handle)
def getCpuAffinity(self):
affinity_string = ''
for j in pynvml.nvmlDeviceGetCpuAffinity(
self.handle, device._nvml_affinity_elements
):
# assume nvml returns list of 64 bit ints
affinity_string = '{:064b}'.format(j) + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse() # so core 0 is in 0th element of list
ret = [i for i, e in enumerate(affinity_list) if e != 0]
return ret
def set_socket_affinity(gpu_id):
dev = device(gpu_id)
affinity = dev.getCpuAffinity()
os.sched_setaffinity(0, affinity)
def set_single_affinity(gpu_id):
dev = device(gpu_id)
affinity = dev.getCpuAffinity()
os.sched_setaffinity(0, affinity[:1])
def set_single_unique_affinity(gpu_id, nproc_per_node):
devices = [device(i) for i in range(nproc_per_node)]
socket_affinities = [dev.getCpuAffinity() for dev in devices]
siblings_list = get_thread_siblings_list()
siblings_dict = dict(siblings_list)
# remove siblings
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
affinities = []
assigned = []
for socket_affinity in socket_affinities:
for core in socket_affinity:
if core not in assigned:
affinities.append([core])
assigned.append(core)
break
os.sched_setaffinity(0, affinities[gpu_id])
def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):
device_ids = [device(i) for i in range(nproc_per_node)]
socket_affinities = [dev.getCpuAffinity() for dev in device_ids]
siblings_list = get_thread_siblings_list()
siblings_dict = dict(siblings_list)
# remove siblings
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
socket_affinities_to_device_ids = collections.defaultdict(list)
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx)
for socket_affinity, device_ids in socket_affinities_to_device_ids.items():
devices_per_group = len(device_ids)
cores_per_device = len(socket_affinity) // devices_per_group
for group_id, device_id in enumerate(device_ids):
if device_id == gpu_id:
if mode == 'interleaved':
affinity = list(socket_affinity[group_id::devices_per_group])
elif mode == 'continuous':
affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device])
else:
raise RuntimeError('Unknown set_socket_unique_affinity mode')
# reintroduce siblings
affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict]
os.sched_setaffinity(0, affinity)
def get_thread_siblings_list():
path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list'
thread_siblings_list = []
pattern = re.compile(r'(\d+)\D(\d+)')
for fname in pathlib.Path(path[0]).glob(path[1:]):
with open(fname) as f:
content = f.read().strip()
res = pattern.findall(content)
if res:
pair = tuple(map(int, res[0]))
thread_siblings_list.append(pair)
return thread_siblings_list
def set_affinity(gpu_id, nproc_per_node, mode='socket'):
if mode == 'socket':
set_socket_affinity(gpu_id)
elif mode == 'single':
set_single_affinity(gpu_id)
elif mode == 'single_unique':
set_single_unique_affinity(gpu_id, nproc_per_node)
elif mode == 'socket_unique_interleaved':
set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved')
elif mode == 'socket_unique_continuous':
set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous')
else:
raise RuntimeError('Unknown affinity mode')
affinity = os.sched_getaffinity(0)
return affinity
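A hedged usage sketch: each distributed worker is assumed to pin itself before touching the GPU. LOCAL_RANK and WORLD_SIZE are the environment variables commonly exported by PyTorch launchers (an assumption, not part of this diff), and the module itself requires pynvml plus the Linux /sys topology files.
import os

local_rank = int(os.environ.get('LOCAL_RANK', 0))
nproc_per_node = int(os.environ.get('WORLD_SIZE', 1))
affinity = set_affinity(local_rank, nproc_per_node, mode='socket_unique_continuous')
print(f'rank {local_rank} pinned to cores: {sorted(affinity)}')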
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
from seq2seq.data.config import BOS
@@ -8,7 +29,7 @@ class SequenceGenerator:
"""
Generator for the autoregressive inference with beam search decoding.
"""
def __init__(self, model, beam_size=5, max_seq_len=100, cuda=False,
def __init__(self, model, beam_size=5, max_seq_len=100,
len_norm_factor=0.6, len_norm_const=5,
cov_penalty_factor=0.1):
"""
@@ -21,14 +42,12 @@ class SequenceGenerator:
:param model: model which implements generate method
:param beam_size: decoder beam size
:param max_seq_len: maximum decoder sequence length
:param cuda: whether to use cuda
:param len_norm_factor: length normalization factor
:param len_norm_const: length normalization constant
:param cov_penalty_factor: coverage penalty factor
"""
self.model = model
self.cuda = cuda
self.beam_size = beam_size
self.max_seq_len = max_seq_len
self.len_norm_factor = len_norm_factor
@@ -51,18 +70,17 @@ class SequenceGenerator:
lengths: (batch_size) - lengths of generated translations
counter: number of iterations of the decoding loop
"""
device = initial_input.device
max_seq_len = self.max_seq_len
translation = torch.zeros(batch_size, max_seq_len, dtype=torch.int64)
lengths = torch.ones(batch_size, dtype=torch.int64)
active = torch.arange(0, batch_size, dtype=torch.int64)
base_mask = torch.arange(0, batch_size, dtype=torch.int64)
if self.cuda:
translation = translation.cuda()
lengths = lengths.cuda()
active = active.cuda()
base_mask = base_mask.cuda()
translation = torch.zeros(batch_size, max_seq_len, dtype=torch.int64,
device=device)
lengths = torch.ones(batch_size, dtype=torch.int64,
device=device)
active = torch.arange(0, batch_size, dtype=torch.int64,
device=device)
base_mask = torch.arange(0, batch_size, dtype=torch.int64,
device=device)
translation[:, 0] = BOS
words, context = initial_input, initial_context
@@ -118,6 +136,7 @@ class SequenceGenerator:
lengths: (batch_size) - lengths of generated translations
counter: number of iterations of the decoding loop
"""
device = initial_input.device
beam_size = self.beam_size
norm_const = self.len_norm_const
norm_factor = self.len_norm_factor
@@ -125,25 +144,19 @@ class SequenceGenerator:
cov_penalty_factor = self.cov_penalty_factor
translation = torch.zeros(batch_size * beam_size, max_seq_len,
dtype=torch.int64)
lengths = torch.ones(batch_size * beam_size, dtype=torch.int64)
scores = torch.zeros(batch_size * beam_size, dtype=torch.float32)
active = torch.arange(0, batch_size * beam_size, dtype=torch.int64)
base_mask = torch.arange(0, batch_size * beam_size, dtype=torch.int64)
dtype=torch.int64, device=device)
lengths = torch.ones(batch_size * beam_size,
dtype=torch.int64, device=device)
scores = torch.zeros(batch_size * beam_size,
dtype=torch.float32, device=device)
active = torch.arange(0, batch_size * beam_size,
dtype=torch.int64, device=device)
base_mask = torch.arange(0, batch_size * beam_size,
dtype=torch.int64, device=device)
global_offset = torch.arange(0, batch_size * beam_size, beam_size,
dtype=torch.int64)
eos_beam_fill = torch.tensor([0] + (beam_size - 1) * [float('-inf')])
if self.cuda:
translation = translation.cuda()
lengths = lengths.cuda()
active = active.cuda()
base_mask = base_mask.cuda()
scores = scores.cuda()
global_offset = global_offset.cuda()
eos_beam_fill = eos_beam_fill.cuda()
device=device, dtype=torch.int64)
eos_beam_fill = torch.tensor([0] + (beam_size - 1) * [float('-inf')],
dtype=torch.float32, device=device)
translation[:, 0] = BOS
@@ -182,9 +195,8 @@ class SequenceGenerator:
context[1] = context[1].contiguous().view(batch_size * beam_size)
# context[1]: (batch * beam)
accu_attn_scores = torch.zeros(batch_size * beam_size, seq)
if self.cuda:
accu_attn_scores = accu_attn_scores.cuda()
accu_attn_scores = torch.zeros(batch_size * beam_size, seq,
dtype=torch.float32, device=device)
counter = 0
for idx in range(1, self.max_seq_len):
from itertools import zip_longest
import sacrebleu
import torch
def read_reference(fname, indices):
with open(fname) as f:
refs = f.readlines()
refs = [refs[i] for i in indices]
return refs
def all_reduce(val):
if torch.distributed.is_available() and torch.distributed.is_initialized():
val = torch.tensor(val)
if hasattr(torch.distributed, "get_backend"):
_backend = torch.distributed.get_backend()
if hasattr(torch.distributed, "DistBackend"):
backend_enum_holder = torch.distributed.DistBackend
else:
backend_enum_holder = torch.distributed.Backend
else:
_backend = torch.distributed._backend
backend_enum_holder = torch.distributed.dist_backend
if _backend == backend_enum_holder.NCCL:
device = torch.device('cuda')
else:
device = torch.device('cpu')
val = val.to(device)
torch.distributed.all_reduce(val)
val = val.tolist()
return val
def corpus_bleu(sys_stream, ref_streams, smooth='exp', smooth_floor=0.0,
force=False, lowercase=False,
tokenize=sacrebleu.DEFAULT_TOKENIZER,
use_effective_order=False) -> sacrebleu.BLEU:
"""Produces BLEU scores along with its sufficient statistics from a source
against one or more references.
:param sys_stream: The system stream (a sequence of segments)
:param ref_streams: A list of one or more reference streams (each a
sequence of segments)
:param smooth: The smoothing method to use
:param smooth_floor: For 'floor' smoothing, the floor to use
:param force: Ignore data that looks already tokenized
:param lowercase: Lowercase the data
:param tokenize: The tokenizer to use
:return: a BLEU object containing everything you'd want
"""
# Add some robustness to the input arguments
if isinstance(sys_stream, str):
sys_stream = [sys_stream]
if isinstance(ref_streams, str):
ref_streams = [[ref_streams]]
sys_len = 0
ref_len = 0
correct = [0 for n in range(sacrebleu.NGRAM_ORDER)]
total = [0 for n in range(sacrebleu.NGRAM_ORDER)]
fhs = [sys_stream] + ref_streams
for lines in zip_longest(*fhs):
if None in lines:
raise EOFError("Source and reference streams have different "
"lengths!")
if lowercase:
lines = [x.lower() for x in lines]
output, *refs = [sacrebleu.TOKENIZERS[tokenize](x.rstrip()) for x in
lines]
ref_ngrams, closest_diff, closest_len = sacrebleu.ref_stats(output,
refs)
sys_len += len(output.split())
ref_len += closest_len
sys_ngrams = sacrebleu.extract_ngrams(output)
for ngram in sys_ngrams.keys():
n = len(ngram.split())
correct[n-1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0))
total[n-1] += sys_ngrams[ngram]
correct = all_reduce(correct)
total = all_reduce(total)
sys_len = all_reduce(sys_len)
ref_len = all_reduce(ref_len)
return sacrebleu.compute_bleu(correct, total, sys_len, ref_len, smooth,
smooth_floor, use_effective_order)
def compute_bleu(output, indices, ref_fname):
refs = read_reference(ref_fname, indices)
bleu = corpus_bleu(output, [refs], lowercase=True,
tokenize='intl')
return bleu
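A single-process sanity sketch: when torch.distributed is not initialized, all_reduce() above is a no-op and corpus_bleu() reduces to plain sacrebleu scoring. This relies on an older sacrebleu release that still exposes NGRAM_ORDER, TOKENIZERS, ref_stats and compute_bleu at module level, as the code above assumes.
hyps = ['the cat sat on the mat']
refs = ['the cat sat on the mat']
print(corpus_bleu(hyps, [refs]).score)   # 100.0 for an exact match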
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import itertools
import numpy as np
from pytablewriter import MarkdownTableWriter
def interleave(*args):
return list(itertools.chain(*zip(*args)))
class AccuracyTable:
def __init__(self, unit):
self.data = collections.defaultdict(dict)
self.unit = unit
def add(self, key, data):
self.data[key].update(data)
def write(self, title, write_math):
writer = MarkdownTableWriter()
writer.table_name = f'{title}'
main_header = ['**Batch Size**', '**Beam Size**']
data_header = []
if 'fp32' in write_math:
data_header += [f'**Accuracy - FP32 ({self.unit})**']
if 'tf32' in write_math:
data_header += [f'**Accuracy - TF32 ({self.unit})**']
if 'fp16' in write_math:
data_header += [f'**Accuracy - FP16 ({self.unit})**']
writer.headers = main_header + data_header
writer.value_matrix = []
for k, v in self.data.items():
batch_size, beam_size = k
row = [batch_size, beam_size]
if 'fp32' in write_math:
row.append(v['fp32'])
if 'tf32' in write_math:
row.append(v['tf32'])
if 'fp16' in write_math:
row.append(v['fp16'])
writer.value_matrix.append(row)
writer.write_table()
class PerformanceTable:
def __init__(self, percentiles, unit, reverse_percentiles=False):
self.percentiles = percentiles
self.data = collections.defaultdict(dict)
self.unit = unit
self.reverse_percentiles = reverse_percentiles
def add(self, key, value):
math, value = next(iter(value.items()))
value = np.array(value)
if self.reverse_percentiles:
percentiles = [100 - p for p in self.percentiles]
else:
percentiles = self.percentiles
stats = []
for p in percentiles:
val = np.percentile(value, p)
stats.append(val * self.unit_convert[self.unit])
avg = value.mean() * self.unit_convert[self.unit]
self.data[key].update({math: (avg, stats)})
def write(self, title, math, relative=None, reverse_speedup=False):
writer = MarkdownTableWriter()
writer.table_name = f'{title} - {math.upper()}'
main_header = ['**Batch Size**', '**Beam Size**']
data_header = [f'**Avg ({self.unit})**']
data_header += [f'**{p}% ({self.unit})**' for p in self.percentiles]
if relative:
speedup_header = ['**Speedup**'] * len(data_header)
data_header = interleave(data_header, speedup_header)
writer.headers = main_header + data_header
writer.value_matrix = []
for k, v in self.data.items():
batch_size, beam_size = k
avg, res_percentiles = v[math]
main = [batch_size, beam_size]
data = [avg, *res_percentiles]
if relative:
rel = self.data[k][relative]
rel_avg, rel_res_percentiles = rel
rel = [rel_avg, *rel_res_percentiles]
speedup = [d / r for (r, d) in zip(rel, data)]
if reverse_speedup:
speedup = [1 / s for s in speedup]
data = interleave(data, speedup)
writer.value_matrix.append(main + data)
writer.write_table()
class LatencyTable(PerformanceTable):
def __init__(self, percentiles, unit='ms'):
super().__init__(percentiles, unit)
self.unit_convert = {'s': 1, 'ms': 1e3, 'us': 1e6}
class ThroughputTable(PerformanceTable):
def __init__(self, percentiles, unit='tok/s', reverse_percentiles=True):
super().__init__(percentiles, unit, reverse_percentiles)
self.unit_convert = {'tok/s': 1}
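An assumed usage sketch, inferred from the class definitions above: keys are (batch_size, beam_size) pairs, values map a math mode to raw per-iteration measurements in seconds, and write() renders a Markdown table, optionally with speedups relative to another mode.
latency = LatencyTable(percentiles=(90, 95, 99), unit='ms')
latency.add((32, 5), {'fp16': [0.081, 0.084, 0.090, 0.102]})   # seconds per batch
latency.add((32, 5), {'fp32': [0.120, 0.125, 0.131, 0.140]})
latency.write('Inference latency', 'fp16', relative='fp32', reverse_speedup=True)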
import contextlib
# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
import os
import subprocess
import time
@@ -8,24 +27,35 @@ import torch
import torch.distributed as dist
import seq2seq.data.config as config
import seq2seq.utils as utils
from seq2seq.inference.beam_search import SequenceGenerator
from seq2seq.utils import AverageMeter
from seq2seq.utils import barrier
from seq2seq.utils import get_rank
from seq2seq.utils import get_world_size
import seq2seq.inference.bleu
def gather_predictions(preds):
world_size = get_world_size()
world_size = utils.get_world_size()
if world_size > 1:
all_preds = preds.new(world_size * preds.size(0), preds.size(1))
all_preds_list = all_preds.chunk(world_size, dim=0)
dist.all_gather(all_preds_list, preds)
preds = all_preds
all_preds = [preds.new(preds.size(0), preds.size(1)) for i in range(world_size)]
dist.all_gather(all_preds, preds)
preds = torch.cat(all_preds)
return preds
def run_sacrebleu(test_path, reference_path):
"""
Executes sacrebleu and returns BLEU score.
:param test_path: path to the test file
:param reference_path: path to the reference file
"""
sacrebleu_params = '--score-only -lc --tokenize intl'
logging.info(f'Running sacrebleu (parameters: {sacrebleu_params})')
sacrebleu = subprocess.run([f'sacrebleu --input {test_path} \
{reference_path} {sacrebleu_params}'],
stdout=subprocess.PIPE, shell=True)
test_bleu = round(float(sacrebleu.stdout.strip()), 2)
return test_bleu
class Translator:
"""
Translator runs validation on test dataset, executes inference, optionally
@@ -34,17 +64,15 @@ class Translator:
def __init__(self,
model,
tokenizer,
loader,
loader=None,
beam_size=5,
len_norm_factor=0.6,
len_norm_const=5.0,
cov_penalty_factor=0.1,
max_seq_len=50,
cuda=False,
print_freq=1,
dataset_dir=None,
save_path=None,
target_bleu=None):
reference=None,
):
self.model = model
self.tokenizer = tokenizer
@@ -53,24 +81,22 @@ class Translator:
self.insert_src_start = [config.BOS]
self.insert_src_end = [config.EOS]
self.batch_first = model.batch_first
self.cuda = cuda
self.beam_size = beam_size
self.print_freq = print_freq
self.dataset_dir = dataset_dir
self.target_bleu = target_bleu
self.save_path = save_path
self.reference = reference
self.distributed = (utils.get_world_size() > 1)
self.generator = SequenceGenerator(
model=self.model,
beam_size=beam_size,
max_seq_len=max_seq_len,
cuda=cuda,
len_norm_factor=len_norm_factor,
len_norm_const=len_norm_const,
cov_penalty_factor=cov_penalty_factor)
def run(self, calc_bleu=True, epoch=None, iteration=None, summary=False,
reference_path=None):
def run(self, calc_bleu=True, epoch=None, iteration=None, eval_path=None,
summary=False, warmup=0, reference_path=None):
"""
Runs translation on test dataset.
@@ -78,47 +104,49 @@ class Translator:
BLEU score
:param epoch: index of the current epoch
:param iteration: index of the current iteration
:param eval_path: path to the file for saving results
:param summary: if True prints summary
:param reference_path: path to the file with reference translation
"""
test_bleu = 0.
break_training = False
if reference_path is None:
reference_path = self.reference
device = next(self.model.parameters()).device
test_bleu = torch.tensor([0.], device=device)
rank = utils.get_rank()
logging.info(f'Running evaluation on test set')
self.model.eval()
output = self.evaluate(epoch, iteration, summary)
# detokenize (BPE)
detok_output = []
for idx, pred in output:
pred = pred.tolist()
detok = self.tokenizer.detokenize(pred)
detok_output.append((idx, detok + '\n'))
output, eval_stats = self.evaluate(self.loader, epoch, iteration,
warmup, summary)
output = output[:len(self.loader.dataset)]
output = self.loader.dataset.unsort(output)
if rank == 0 and eval_path:
with open(eval_path, 'w') as eval_file:
lines = [line + '\n' for line in output]
eval_file.writelines(lines)
if calc_bleu:
test_bleu[0] = run_sacrebleu(eval_path, reference_path)
if summary:
logging.info(f'BLEU on test dataset: {test_bleu[0]:.2f}')
utils.barrier()
logging.info(f'Finished evaluation on test set')
if calc_bleu:
if detok_output:
indices, output = zip(*detok_output)
else:
indices, output = [], []
output = self.run_detokenizer(output)
reference_path = os.path.join(self.dataset_dir,
config.TGT_TEST_TARGET_FNAME)
bleu = seq2seq.inference.bleu.compute_bleu(output, indices,
reference_path)
logging.info(bleu)
test_bleu = round(bleu.score, 2)
if summary:
logging.info(f'BLEU on test dataset: {test_bleu:.2f}')
if self.target_bleu and test_bleu >= self.target_bleu:
logging.info(f'Target accuracy reached')
break_training = True
if self.distributed:
dist.broadcast(test_bleu, 0)
logging.info(f'Finished evaluation on test set')
if calc_bleu:
eval_stats['bleu'] = test_bleu[0].item()
else:
eval_stats['bleu'] = None
return test_bleu, break_training
return output, eval_stats
def evaluate(self, epoch, iteration, summary):
def evaluate(self, loader, epoch=0, iteration=0, warmup=0, summary=False):
"""
Runs evaluation on test dataset.
@@ -126,56 +154,58 @@ class Translator:
:param iteration: index of the current iteration
:param summary: if True prints summary
"""
batch_time = AverageMeter(False)
tot_tok_per_sec = AverageMeter(False)
iterations = AverageMeter(False)
enc_seq_len = AverageMeter(False)
dec_seq_len = AverageMeter(False)
device = next(self.model.parameters()).device
batch_time = utils.AverageMeter(warmup, keep=True)
tot_tok_per_sec = utils.AverageMeter(warmup, keep=True)
iterations = utils.AverageMeter()
enc_seq_len = utils.AverageMeter()
dec_seq_len = utils.AverageMeter()
stats = {}
batch_size = loader.batch_size
global_batch_size = batch_size * utils.get_world_size()
beam_size = self.beam_size
bos = [self.insert_target_start] * (batch_size * beam_size)
bos = torch.tensor(bos, dtype=torch.int64, device=device)
if self.batch_first:
bos = bos.view(-1, 1)
else:
bos = bos.view(1, -1)
if beam_size == 1:
generator = self.generator.greedy_search
else:
generator = self.generator.beam_search
output = []
for i, (src, indices) in enumerate(self.loader):
for i, (src, indices) in enumerate(loader):
translate_timer = time.time()
src, src_length = src
if self.batch_first:
batch_size = src.shape[0]
else:
batch_size = src.shape[1]
global_batch_size = batch_size * get_world_size()
beam_size = self.beam_size
bos = [self.insert_target_start] * (batch_size * beam_size)
bos = torch.LongTensor(bos)
if self.batch_first:
bos = bos.view(-1, 1)
else:
bos = bos.view(1, -1)
src_length = torch.LongTensor(src_length)
stats['total_enc_len'] = int(src_length.sum())
if self.cuda:
src = src.cuda()
bos = bos.cuda()
src = src.to(device)
src_length = src_length.to(device)
with torch.no_grad():
context = self.model.encode(src, src_length)
if self.cuda: src_length = src_length.cuda()
context = [context, src_length, None]
if beam_size == 1:
generator = self.generator.greedy_search
else:
generator = self.generator.beam_search
preds, lengths, counter = generator(batch_size, bos, context)
stats['total_dec_len'] = lengths.sum().item()
stats['iters'] = counter
for idx, pred in zip(indices, preds):
output.append((idx, pred))
indices = torch.tensor(indices).to(preds)
preds = preds.scatter(0, indices.unsqueeze(1).expand_as(preds), preds)
preds = gather_predictions(preds).cpu()
if self.tokenizer:
for pred in preds:
pred = pred.tolist()
detok = self.tokenizer.detokenize(pred)
output.append(detok)
elapsed = time.time() - translate_timer
batch_time.update(elapsed, batch_size)
@@ -188,15 +218,15 @@ class Translator:
enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)
if i % self.print_freq == 0:
if i % self.print_freq == self.print_freq - 1:
log = []
log += f'TEST '
if epoch is not None:
log += f'[{epoch}]'
if iteration is not None:
log += f'[{iteration}]'
log += f'[{i}/{len(self.loader)}]\t'
log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
log += f'[{i}/{len(loader)}]\t'
log += f'Time {batch_time.val:.4f} ({batch_time.avg:.4f})\t'
log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
log = ''.join(log)
@@ -208,11 +238,11 @@ class Translator:
batch_time.reduce('mean')
iterations.reduce('sum')
if summary and get_rank() == 0:
if summary and utils.get_rank() == 0:
time_per_sentence = (batch_time.avg / global_batch_size)
log = []
log += f'TEST SUMMARY:\n'
log += f'Lines translated: {len(self.loader.dataset)}\t'
log += f'Lines translated: {len(loader.dataset)}\t'
log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
@@ -222,21 +252,9 @@ class Translator:
log = ''.join(log)
logging.info(log)
return output
def run_detokenizer(self, data):
"""
Executes moses detokenizer.
:param data: list of sentences to detokenize
"""
eval_stats = {}
eval_stats['tokens_per_sec'] = tot_tok_per_sec.avg
eval_stats['runtimes'] = batch_time.vals
eval_stats['throughputs'] = tot_tok_per_sec.vals
data = ''.join(data)
detok_path = os.path.join(self.dataset_dir, config.DETOKENIZER)
cmd = f'perl {detok_path}'
logging.info('Running moses detokenizer')
z = subprocess.run(cmd, shell=True, input=data.encode(),
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL)
output = z.stdout.decode().splitlines()
return output
return output, eval_stats
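A heavily hedged end-to-end sketch of the new inference path, mirroring how the pieces above fit together; `model` stands for an already-built GNMT network restored from a checkpoint (not shown), and the file names 'bpe.32000', 'newstest2014.en', 'newstest2014.de' and 'hyp.detok.txt' are placeholders rather than values defined in this diff.
tokenizer = Tokenizer(vocab_fname='vocab.bpe.32000', bpe_fname='bpe.32000',
                      lang={'src': 'en', 'tgt': 'de'})
dataset = RawTextDataset(raw_datafile='newstest2014.en', tokenizer=tokenizer,
                         sort=True)
loader = dataset.get_loader(batch_size=32, batch_first=model.batch_first)
translator = Translator(model, tokenizer, loader=loader, beam_size=5,
                        max_seq_len=150, reference='newstest2014.de')
output, eval_stats = translator.run(calc_bleu=True, eval_path='hyp.detok.txt',
                                    summary=True)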