"git@developer.sourcefind.cn:kecinstone/2024-pra-vllm.git" did not exist on "e41f06702cb6d6787ac4474832264108a6e28780"
Unverified commit 441fa968, authored by galagam, committed by GitHub
Browse files

ONNX export test - minor fixes (#200)



* ONNX export - input names fix

* Fix discrepancies caused by input names being defined incorrectly or not passed to export
* Refactor ORT input feed creation for simplicity
* Control whether to save test IO files via environment variable
Signed-off-by: Gal Hubara Agam <ghubaraagam@nvidia.com>

* ONNX export test: minor refactor
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Signed-off-by: galagam <96368689+galagam@users.noreply.github.com>

---------
Signed-off-by: Gal Hubara Agam <ghubaraagam@nvidia.com>
Signed-off-by: galagam <96368689+galagam@users.noreply.github.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 73c9f421
...@@ -44,7 +44,8 @@ from transformer_engine.pytorch.fp8 import is_fp8_available ...@@ -44,7 +44,8 @@ from transformer_engine.pytorch.fp8 import is_fp8_available
# Global test configuration knobs. # Global test configuration knobs.
# Enable this to serialize test inputs and outputs to file (as a Polygraphy RunResults instance). # Enable this to serialize test inputs and outputs to file (as a Polygraphy RunResults instance).
SAVE_TEST_IO = False SAVE_TEST_IO = bool(int(os.getenv("NVTE_ONNX_EXPORT_SAVE_TEST_IO", "0")))
if SAVE_TEST_IO: if SAVE_TEST_IO:
from polygraphy.json import save_json from polygraphy.json import save_json
from polygraphy.comparator import RunResults from polygraphy.comparator import RunResults
...@@ -96,7 +97,12 @@ def do_export( ...@@ -96,7 +97,12 @@ def do_export(
model.cuda().eval() model.cuda().eval()
os.makedirs(NVTE_TEST_ARTIFACTS_DIR, exist_ok=True) os.makedirs(NVTE_TEST_ARTIFACTS_DIR, exist_ok=True)
fname = os.path.join(NVTE_TEST_ARTIFACTS_DIR, fname) fname = os.path.join(NVTE_TEST_ARTIFACTS_DIR, fname)
inps = inp if isinstance(inp, list) or isinstance(inp, tuple) else (inp,) inps = inp if isinstance(inp, list) or isinstance(inp, tuple) else (inp,)
assert len(inps) == len(input_names)
inds_to_del = [i for i in range(len(inps)) if inps[i] is None]
input_names = [input_names[i] for i in range(len(inps)) if i not in inds_to_del]
with te.onnx_export(True): with te.onnx_export(True):
torch.onnx.export( torch.onnx.export(
model, model,
...@@ -185,17 +191,11 @@ def validate_result( ...@@ -185,17 +191,11 @@ def validate_result(
s = ort.InferenceSession(fname, **kwargs) s = ort.InferenceSession(fname, **kwargs)
return s return s
def create_ort_input_dict(session, inps): def create_ort_input_dict(session, inputs):
inp_dict = {} inputs = inputs if isinstance(inputs, list) or isinstance(inputs, tuple) else (inputs,)
if isinstance(inps, tuple) or isinstance(inps, list): input_names = [x.name for x in session.get_inputs()]
nonetype_inputs = 0 inps = [to_numpy(x) for x in inputs if x is not None]
for idx, inp in enumerate(inps): inp_dict = dict(zip(input_names, inps))
if inp is None:
nonetype_inputs += 1
continue
inp_dict[session.get_inputs()[idx - nonetype_inputs].name] = to_numpy(inp)
else:
inp_dict[session.get_inputs()[0].name] = to_numpy(inps)
return inp_dict return inp_dict
def serialize_inputs_outputs(fname, inputs, inputs_names, te_outputs, output_names): def serialize_inputs_outputs(fname, inputs, inputs_names, te_outputs, output_names):
...@@ -515,16 +515,17 @@ def test_export_gemm( ...@@ -515,16 +515,17 @@ def test_export_gemm(
gelu_str = "_gelu" if use_gelu else "" gelu_str = "_gelu" if use_gelu else ""
high_prec_str = dtype2str(precision) high_prec_str = dtype2str(precision)
fname = f"te.gemm{fp8_str}{bias_str}{gelu_str}{high_prec_str}.onnx" fname = f"te.gemm{fp8_str}{bias_str}{gelu_str}{high_prec_str}.onnx"
input_names = ['input', 'weight']
if use_fp8: if use_fp8:
model = TestFP8_GEMM(precision, use_bias, use_gelu, scale_factors) model = TestFP8_GEMM(precision, use_bias, use_gelu, scale_factors)
do_export(model, (inp, weight), fname, use_fp8) do_export(model, (inp, weight), fname, use_fp8, input_names=input_names)
if precision == torch.bfloat16: if precision == torch.bfloat16:
return return
validate_result(fname, (inp, weight), model, rtol=1e-2, atol=2e-2, is_fp8=True) validate_result(fname, (inp, weight), model, rtol=1e-2, atol=2e-2, is_fp8=True, input_names=input_names)
else: else:
model = Test_GEMM(precision, use_bias, use_gelu) model = Test_GEMM(precision, use_bias, use_gelu)
do_export(model, (inp, weight), fname, use_fp8) do_export(model, (inp, weight), fname, use_fp8, input_names=input_names)
validate_result(fname, (inp, weight), model, rtol=1e-2, atol=2e-2) validate_result(fname, (inp, weight), model, rtol=1e-2, atol=2e-2, input_names=input_names)
@pytest.mark.parametrize("use_fp8", [False, True]) @pytest.mark.parametrize("use_fp8", [False, True])
...@@ -630,7 +631,7 @@ def test_export_softmax(softmax_def, precision): ...@@ -630,7 +631,7 @@ def test_export_softmax(softmax_def, precision):
in_features = 64 in_features = 64
hidden_size = 256 hidden_size = 256
mask = None mask = None
input_names = ["input"] input_names = ["input", "mask"]
inp_shape = [hidden_size, in_features, in_features, in_features] inp_shape = [hidden_size, in_features, in_features, in_features]
if softmax_def == softmax_defs.ScaledUpperTriangMaskedSoftmax: if softmax_def == softmax_defs.ScaledUpperTriangMaskedSoftmax:
inp_shape = [hidden_size, in_features, in_features] inp_shape = [hidden_size, in_features, in_features]
...@@ -640,7 +641,6 @@ def test_export_softmax(softmax_def, precision): ...@@ -640,7 +641,6 @@ def test_export_softmax(softmax_def, precision):
# Generate a random mask with 50% probability for 0 or 1. # Generate a random mask with 50% probability for 0 or 1.
probs = 0.5 * torch.ones(hidden_size, 1, in_features, in_features, device="cuda", dtype=precision) probs = 0.5 * torch.ones(hidden_size, 1, in_features, in_features, device="cuda", dtype=precision)
mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool)
input_names.append("mask")
kernel_str = "ScaledMaskedSoftmax" kernel_str = "ScaledMaskedSoftmax"
model = Test_Softmax(softmax_def, mask_inp=True) model = Test_Softmax(softmax_def, mask_inp=True)
elif softmax_def == softmax_defs.ScaledSoftmax: elif softmax_def == softmax_defs.ScaledSoftmax:
...@@ -866,13 +866,12 @@ def test_export_core_attention( ...@@ -866,13 +866,12 @@ def test_export_core_attention(
query_layer = torch.randn(qkv_size, dtype=precision, device="cuda") query_layer = torch.randn(qkv_size, dtype=precision, device="cuda")
key_layer = torch.randn(qkv_size, dtype=precision, device="cuda") key_layer = torch.randn(qkv_size, dtype=precision, device="cuda")
value_layer = torch.randn(qkv_size, dtype=precision, device="cuda") value_layer = torch.randn(qkv_size, dtype=precision, device="cuda")
input_names = ["query", "key", "value"] input_names = ["query", "key", "value", "attention_mask"]
attention_mask = None attention_mask = None
if use_mask: if use_mask:
# Generate a random mask with 50% probability for 0 or 1. # Generate a random mask with 50% probability for 0 or 1.
probs = 0.5 * torch.ones(qkv_size[1], qkv_size[2], qkv_size[0], qkv_size[0], device="cuda", dtype=precision) probs = 0.5 * torch.ones(qkv_size[1], qkv_size[2], qkv_size[0], qkv_size[0], device="cuda", dtype=precision)
attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool)
input_names.append("attention_mask")
inp = (query_layer, key_layer, value_layer, attention_mask) inp = (query_layer, key_layer, value_layer, attention_mask)
mask_str = get_attn_mask_str(use_mask, attn_mask_type) mask_str = get_attn_mask_str(use_mask, attn_mask_type)
...@@ -881,6 +880,7 @@ def test_export_core_attention( ...@@ -881,6 +880,7 @@ def test_export_core_attention(
if attn_mask_type is None: if attn_mask_type is None:
attn_mask_type = 'causal' attn_mask_type = 'causal'
input_names = ["query", "key", "value"]
inp = (query_layer, key_layer, value_layer) inp = (query_layer, key_layer, value_layer)
model = te.attention.DotProductAttention( model = te.attention.DotProductAttention(
num_attention_heads=num_attention_heads, num_attention_heads=num_attention_heads,
...@@ -958,6 +958,7 @@ def test_export_multihead_attention( ...@@ -958,6 +958,7 @@ def test_export_multihead_attention(
attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool)
encoder_output = None encoder_output = None
if attention_type == "cross": if attention_type == "cross":
encoder_output = torch.randn(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda") encoder_output = torch.randn(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda")
inp = (hidden_states, attention_mask, encoder_output) inp = (hidden_states, attention_mask, encoder_output)
...@@ -1021,13 +1022,12 @@ def test_export_transformer_layer( ...@@ -1021,13 +1022,12 @@ def test_export_transformer_layer(
num_attention_heads = 4 num_attention_heads = 4
input_tensor = torch.rand(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda") input_tensor = torch.rand(sequence_length, batch_size, hidden_size, dtype=precision, device="cuda")
input_names = ["input"] input_names = ["input", "attention_mask"]
attention_mask = None attention_mask = None
if use_mask and attn_mask_type != "causal": if use_mask and attn_mask_type != "causal":
# Generate a random mask with 50% probability for 0 or 1. # Generate a random mask with 50% probability for 0 or 1.
probs = 0.5 * torch.ones(batch_size, 1, sequence_length, sequence_length, device="cuda", dtype=precision) probs = 0.5 * torch.ones(batch_size, 1, sequence_length, sequence_length, device="cuda", dtype=precision)
attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool) attention_mask = torch.bernoulli(probs).to("cuda", dtype=torch.bool)
input_names.append("attention_mask")
inp = (input_tensor, attention_mask) inp = (input_tensor, attention_mask)
fp8_str = "_fp8" if use_fp8 else "" fp8_str = "_fp8" if use_fp8 else ""
...@@ -1045,7 +1045,7 @@ def test_export_transformer_layer( ...@@ -1045,7 +1045,7 @@ def test_export_transformer_layer(
params_dtype=precision, params_dtype=precision,
fuse_qkv_params=fuse_qkv_params, fuse_qkv_params=fuse_qkv_params,
zero_centered_gamma=zero_centered_gamma).to(device='cuda') zero_centered_gamma=zero_centered_gamma).to(device='cuda')
do_export(model, inp, fname, use_fp8) do_export(model, inp, fname, use_fp8, input_names=input_names)
if not use_fp8: if not use_fp8:
validate_result(fname, inp, model, atol=1e-3, input_names=input_names) validate_result(fname, inp, model, atol=1e-3, input_names=input_names)
else: else:
...@@ -1167,10 +1167,11 @@ def test_export_gemm_layernorm( ...@@ -1167,10 +1167,11 @@ def test_export_gemm_layernorm(
high_prec_str = dtype2str(precision) high_prec_str = dtype2str(precision)
fp8_str = f"_fp8" if use_fp8 else "" fp8_str = f"_fp8" if use_fp8 else ""
fname = f"te.gemm_layernorm{fp8_str}{high_prec_str}.onnx" fname = f"te.gemm_layernorm{fp8_str}{high_prec_str}.onnx"
do_export(model, (inp, weight), fname, use_fp8=use_fp8) input_names = ['input', 'weight']
do_export(model, (inp, weight), fname, use_fp8=use_fp8, input_names=input_names)
if precision not in (torch.bfloat16, ): if precision not in (torch.bfloat16, ):
validate_result( validate_result(
fname, (inp, weight), model, atol=5e-2, is_fp8=use_fp8, allow_cnt_errors=2) fname, (inp, weight), model, atol=5e-2, is_fp8=use_fp8, allow_cnt_errors=2, input_names=input_names)
@pytest.mark.parametrize("enabled", [True, False]) @pytest.mark.parametrize("enabled", [True, False])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment