Unverified Commit d9d0f114 authored by Sam Shleifer's avatar Sam Shleifer Committed by GitHub
Browse files

[s2s] distributed eval allows num_return_sequences > 1 (#7254)

parent 0804d077
...@@ -235,7 +235,7 @@ export DATA_DIR=cnn_dm ...@@ -235,7 +235,7 @@ export DATA_DIR=cnn_dm
--fp16 \ --fp16 \
--bs 32 --bs 32
``` ```
### Multi-GPU Evalulation ### Multi-GPU Evaluation
here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases
because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs. `{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
...@@ -250,7 +250,7 @@ python -m torch.distributed.launch --nproc_per_node=8 run_distributed_eval.py \ ...@@ -250,7 +250,7 @@ python -m torch.distributed.launch --nproc_per_node=8 run_distributed_eval.py \
Contributions that implement this command for other distributed hardware setups are welcome! Contributions that implement this command for other distributed hardware setups are welcome!
#### run_eval tips and tricks #### Single-GPU Eval: Tips and Tricks
When using `run_eval.py`, the following features can be useful: When using `run_eval.py`, the following features can be useful:
......
...@@ -17,6 +17,7 @@ from utils import ( ...@@ -17,6 +17,7 @@ from utils import (
Seq2SeqDataset, Seq2SeqDataset,
calculate_bleu, calculate_bleu,
calculate_rouge, calculate_rouge,
chunks,
lmap, lmap,
load_json, load_json,
parse_numeric_n_bool_cl_kwargs, parse_numeric_n_bool_cl_kwargs,
...@@ -40,6 +41,7 @@ def eval_data_dir( ...@@ -40,6 +41,7 @@ def eval_data_dir(
fp16=False, fp16=False,
task="summarization", task="summarization",
local_rank=None, local_rank=None,
num_return_sequences=1,
src_lang=None, src_lang=None,
tgt_lang=None, tgt_lang=None,
prefix="", prefix="",
...@@ -56,10 +58,15 @@ def eval_data_dir( ...@@ -56,10 +58,15 @@ def eval_data_dir(
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda() model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
if fp16: if fp16:
model = model.half() model = model.half()
# determine if we need to increase num_beams
use_task_specific_params(model, task) # update config with task specific params
num_beams = generate_kwargs.pop("num_beams", model.config.num_beams) # AttributeError risk?
if num_return_sequences > num_beams:
num_beams = num_return_sequences
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info(f"Inferred tokenizer type: {tokenizer.__class__}") # if this is wrong, check config.model_type. logger.info(f"Inferred tokenizer type: {tokenizer.__class__}") # if this is wrong, check config.model_type.
use_task_specific_params(model, task) # update config with task specific params
if max_source_length is None: if max_source_length is None:
max_source_length = tokenizer.model_max_length max_source_length = tokenizer.model_max_length
if prefix is None: if prefix is None:
...@@ -84,10 +91,14 @@ def eval_data_dir( ...@@ -84,10 +91,14 @@ def eval_data_dir(
summaries = model.generate( summaries = model.generate(
input_ids=batch["input_ids"].to(model.device), input_ids=batch["input_ids"].to(model.device),
attention_mask=batch["attention_mask"].to(model.device), attention_mask=batch["attention_mask"].to(model.device),
num_return_sequences=num_return_sequences,
num_beams=num_beams,
**generate_kwargs, **generate_kwargs,
) )
preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False) preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
ids = batch["ids"] ids = batch["ids"]
if num_return_sequences > 1:
preds = chunks(preds, num_return_sequences) # batch size chunks, each of size num_return_seq
for i, pred in enumerate(preds): for i, pred in enumerate(preds):
results.append(dict(pred=pred, id=ids[i].item())) results.append(dict(pred=pred, id=ids[i].item()))
save_json(results, save_path) save_json(results, save_path)
...@@ -110,7 +121,6 @@ def run_generate(): ...@@ -110,7 +121,6 @@ def run_generate():
parser.add_argument( parser.add_argument(
"--type_path", type=str, default="test", help="which subset to evaluate typically train/val/test" "--type_path", type=str, default="test", help="which subset to evaluate typically train/val/test"
) )
parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test.target")
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics") parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size") parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
parser.add_argument( parser.add_argument(
...@@ -120,6 +130,9 @@ def run_generate(): ...@@ -120,6 +130,9 @@ def run_generate():
parser.add_argument( parser.add_argument(
"--n_obs", type=int, default=None, required=False, help="How many observations. Defaults to all." "--n_obs", type=int, default=None, required=False, help="How many observations. Defaults to all."
) )
parser.add_argument(
"--num_return_sequences", type=int, default=1, required=False, help="How many sequences to return"
)
parser.add_argument( parser.add_argument(
"--sync_timeout", "--sync_timeout",
type=int, type=int,
...@@ -158,6 +171,7 @@ def run_generate(): ...@@ -158,6 +171,7 @@ def run_generate():
local_rank=args.local_rank, local_rank=args.local_rank,
n_obs=args.n_obs, n_obs=args.n_obs,
max_source_length=args.max_source_length, max_source_length=args.max_source_length,
num_return_sequences=args.num_return_sequences,
prefix=args.prefix, prefix=args.prefix,
src_lang=args.src_lang, src_lang=args.src_lang,
tgt_lang=args.tgt_lang, tgt_lang=args.tgt_lang,
...@@ -169,6 +183,11 @@ def run_generate(): ...@@ -169,6 +183,11 @@ def run_generate():
save_dir.mkdir(exist_ok=True) save_dir.mkdir(exist_ok=True)
partial_results = gather_results_from_each_node(num_replicas, json_save_dir, args.sync_timeout) partial_results = gather_results_from_each_node(num_replicas, json_save_dir, args.sync_timeout)
preds = combine_partial_results(partial_results) preds = combine_partial_results(partial_results)
if args.num_return_sequences > 1:
save_path = save_dir.joinpath("pseudolabel_results.json")
print(f"Saving aggregated results at {save_path}, intermediate in {json_save_dir}/")
save_json(preds, save_path)
return
tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target") tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
labels = [x.rstrip() for x in open(tgt_file).readlines()][: len(preds)] labels = [x.rstrip() for x in open(tgt_file).readlines()][: len(preds)]
......
...@@ -13,7 +13,7 @@ import torch ...@@ -13,7 +13,7 @@ import torch
from tqdm import tqdm from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils import calculate_bleu, calculate_rouge, parse_numeric_n_bool_cl_kwargs, use_task_specific_params from utils import calculate_bleu, calculate_rouge, chunks, parse_numeric_n_bool_cl_kwargs, use_task_specific_params
logger = getLogger(__name__) logger = getLogger(__name__)
...@@ -22,12 +22,6 @@ logger = getLogger(__name__) ...@@ -22,12 +22,6 @@ logger = getLogger(__name__)
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
def generate_summaries_or_translations( def generate_summaries_or_translations(
examples: List[str], examples: List[str],
out_file: str, out_file: str,
......
...@@ -145,6 +145,7 @@ class TestSummarizationDistiller(unittest.TestCase): ...@@ -145,6 +145,7 @@ class TestSummarizationDistiller(unittest.TestCase):
assert not failures, f"The following models could not be loaded through AutoConfig: {failures}" assert not failures, f"The following models could not be loaded through AutoConfig: {failures}"
@require_multigpu @require_multigpu
@unittest.skip("Broken at the moment")
def test_multigpu(self): def test_multigpu(self):
updates = dict( updates = dict(
no_teacher=True, no_teacher=True,
......
...@@ -456,3 +456,9 @@ def write_txt_file(ordered_tgt, path): ...@@ -456,3 +456,9 @@ def write_txt_file(ordered_tgt, path):
for ln in ordered_tgt: for ln in ordered_tgt:
f.write(ln + "\n") f.write(ln + "\n")
f.flush() f.flush()
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment