Unverified Commit 3ae5f9ed authored by Chris Austen's avatar Chris Austen Committed by GitHub
Browse files

Merge branch 'develop' into concat2

parents 6d5a34d2 785ff7d7
......@@ -21,5 +21,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
numpy==1.21.6
accelerate
diffusers
optimum[onnxruntime]
transformers
\ No newline at end of file
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# The MIT License (MIT)\n",
"#\n",
"# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.\n",
"#\n",
"# Permission is hereby granted, free of charge, to any person obtaining a copy\n",
"# of this software and associated documentation files (the 'Software'), to deal\n",
"# in the Software without restriction, including without limitation the rights\n",
"# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n",
"# copies of the Software, and to permit persons to whom the Software is\n",
"# furnished to do so, subject to the following conditions:\n",
"#\n",
"# The above copyright notice and this permission notice shall be included in\n",
"# all copies or substantial portions of the Software.\n",
"#\n",
"# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
"# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
"# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n",
"# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
"# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n",
"# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n",
"# THE SOFTWARE."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stable Diffusion 2.1\n",
"\n",
"The following example will show how to run `Stable Diffusion 2.1` with `MIGraphX`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install the required dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install dependencies\n",
"!pip install optimum[onnxruntime] transformers diffusers accelerate"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will use optimum to generate the onnx files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# export models\n",
"!optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now it is time to load these models with python.\n",
"\n",
"First, we make sure that MIGraphX module is found in the python path."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"mgx_lib_path = \"/opt/rocm/lib/\" # or \"/code/AMDMIGraphX/build/lib/\"\n",
"if mgx_lib_path not in sys.path:\n",
" sys.path.append(mgx_lib_path)\n",
"import migraphx as mgx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, a helper method to load and cache the models.\n",
"\n",
"This will use the `models/sd21-onnx` path. If you changed it, make sure to update here as well."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# helper for model loading\n",
"def load_mgx_model(name, shapes):\n",
" file = f\"models/sd21-onnx/{name}/model\"\n",
" print(f\"Loading {name} model from {file}\")\n",
" if os.path.isfile(f\"{file}.mxr\"):\n",
" print(f\"Found mxr, loading it...\")\n",
" model = mgx.load(f\"{file}.mxr\", format=\"msgpack\")\n",
" elif os.path.isfile(f\"{file}.onnx\"):\n",
" print(f\"Parsing from onnx file...\")\n",
" model = mgx.parse_onnx(f\"{file}.onnx\", map_input_dims=shapes)\n",
" model.compile(mgx.get_target(\"gpu\"))\n",
" print(f\"Saving {name} model to mxr file...\")\n",
" mgx.save(model, f\"{file}.mxr\", format=\"msgpack\")\n",
" else:\n",
" print(f\"No {name} model found. Please verify the path is correct and re-try, or re-download model.\")\n",
" os.exit(1)\n",
" return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With that, we can load the models. This could take several minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text_encoder = load_mgx_model(\"text_encoder\", {\"input_ids\": [1, 77]})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"unet = load_mgx_model(\n",
" \"unet\", {\n",
" \"sample\": [1, 4, 64, 64],\n",
" \"encoder_hidden_states\": [1, 77, 1024],\n",
" \"timestep\": [1],\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vae = load_mgx_model(\"vae_decoder\", {\"latent_sample\": [1, 4, 64, 64]})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import the remaining packages."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from diffusers import EulerDiscreteScheduler\n",
"from transformers import CLIPTokenizer\n",
"import torch\n",
"import numpy as np\n",
"from tqdm.auto import tqdm\n",
"from PIL import Image"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Time to load the scheduler and tokenizer from the original source."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_id = \"stabilityai/stable-diffusion-2-1\"\n",
"scheduler = EulerDiscreteScheduler.from_pretrained(model_id,\n",
" subfolder=\"scheduler\")\n",
"tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder=\"tokenizer\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will define all the steps one by one, to make the last step short and simple.\n",
"\n",
"The first step will be to tokenize the user prompt. It will make a `(1, 77)` shaped `input_ids`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(input):\n",
" return tokenizer([input],\n",
" padding=\"max_length\",\n",
" max_length=tokenizer.model_max_length,\n",
" truncation=True,\n",
" return_tensors=\"np\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional\n",
"test_tk = tokenize(\"test tokenizer to see the tokens\")\n",
"test_tk.input_ids.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We run the tokenized prompt through the `Text Encoder` model. It expects the `(1, 77)` data as `int32`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional\n",
"text_encoder.get_parameter_shapes()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_embeddings(input):\n",
" return np.array(\n",
" text_encoder.run({\"input_ids\": input.input_ids.astype(np.int32)\n",
" })[0]).astype(np.float32)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional\n",
"test_emb = get_embeddings(tokenize(\"test tokenizer to see the tokens\"))\n",
"test_emb.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The other input of the model is latent representation (pure noise). It will be transformed into a 512x512 image later.\n",
"The last input will be the timestep."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_latents(seed):\n",
" return torch.randn(\n",
" (1, 4, 64, 64),\n",
" generator=torch.manual_seed(seed),\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional\n",
"test_latents = generate_latents(42)\n",
"latents.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we add two helpers to access and convert from torch to numpy with the proper datatype."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_scaled_sample(latents, t):\n",
" return scheduler.scale_model_input(latents, t).numpy().astype(np.float32)\n",
"\n",
"\n",
"def get_timestep(t):\n",
" return np.atleast_1d(t.numpy().astype(np.int64)) # convert 0D -> 1D"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The UNet model will be run in a loop. It will predict the noise residual."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional\n",
"unet.get_parameter_shapes()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def denoise(sample, embeddings, timestep):\n",
" return np.array(\n",
" unet.run({\n",
" \"sample\": sample,\n",
" \"encoder_hidden_states\": embeddings,\n",
" \"timestep\": timestep\n",
" })[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Helpers to do the classifier-free guidance and computing the previous noisy sample."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def perform_guidance(noise_pred_uncond, noise_pred_text, scale):\n",
" return noise_pred_uncond + scale * (noise_pred_text - noise_pred_uncond)\n",
"\n",
"def compute_previous(noise_pred, t, latents):\n",
" # compute the previous noisy sample x_t -> x_t-1\n",
" return scheduler.step(noise_pred, t, latents).prev_sample\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Scale and decode the image latents with VAE."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def scale_denoised(latents):\n",
" return 1 / 0.18215 * latents\n",
"\n",
"\n",
"def decode(latents):\n",
" return np.array(\n",
" vae.run({\"latent_sample\": latents.numpy().astype(np.float32)})[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And lastly, we need to convert it to an image to display or save."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convert_to_rgb_image(image):\n",
" image = np.clip(image / 2 + 0.5, 0, 1)\n",
" image = np.transpose(image, (0, 2, 3, 1))\n",
" images = (image * 255).round().astype(\"uint8\")\n",
" return Image.fromarray(images[0])\n",
"\n",
"def save_image(pil_image, filename=\"output.png\"):\n",
" pil_image.save(filename, format=\"png\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Feel free to play around with these params."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompt = \"a photograph of an astronaut riding a horse\"\n",
"negative_prompt = \"\"\n",
"steps = 20\n",
"seed = 13\n",
"scale = 7.0"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And now, to put everything together and run the whole pipeline:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scheduler.set_timesteps(steps)\n",
"\n",
"text_input, uncond_input = tokenize(prompt), tokenize(negative_prompt)\n",
"text_embeddings, uncond_embeddings = get_embeddings(\n",
" text_input), get_embeddings(uncond_input)\n",
"latents = generate_latents(seed) * scheduler.init_noise_sigma\n",
"\n",
"for t in tqdm(scheduler.timesteps):\n",
" sample = get_scaled_sample(latents, t)\n",
" timestep = get_timestep(t)\n",
"\n",
" noise_pred_uncond = denoise(sample, uncond_embeddings, timestep)\n",
" noise_pred_text = denoise(sample, text_embeddings, timestep)\n",
"\n",
" noise_pred = perform_guidance(noise_pred_uncond, noise_pred_text, scale)\n",
" latents = compute_previous(torch.from_numpy(noise_pred), t, latents)\n",
"\n",
"latents = scale_denoised(latents)\n",
"result = decode(latents)\n",
"image = convert_to_rgb_image(result)\n",
"\n",
"# show the image\n",
"image"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you like the generated image, save it with the following:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"save_image(image, \"output.png\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sd_venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# The MIT License (MIT)
#
# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from argparse import ArgumentParser
from diffusers import EulerDiscreteScheduler
from transformers import CLIPTokenizer
from PIL import Image
import migraphx as mgx
import numpy as np
import os
import torch
import time
from functools import wraps
# measurement helper
def measure(fn):
@wraps(fn)
def measure_ms(*args, **kwargs):
start_time = time.perf_counter_ns()
result = fn(*args, **kwargs)
end_time = time.perf_counter_ns()
print(f"Elapsed time: {(end_time - start_time) * 1e-6:.4f} ms\n")
return result
return measure_ms
def get_args():
parser = ArgumentParser()
parser.add_argument(
"-s",
"--seed",
type=int,
default=42,
help="Random seed",
)
parser.add_argument(
"-t",
"--steps",
type=int,
default=20,
help="Number of steps",
)
parser.add_argument(
"-p",
"--prompt",
type=str,
required=True,
help="Prompt",
)
parser.add_argument(
"-n",
"--negative-prompt",
type=str,
default="",
help="Negative prompt",
)
parser.add_argument(
"--scale",
type=float,
default=7.0,
help="Guidance scale",
)
parser.add_argument(
"-o",
"--output",
type=str,
default=None,
help="Output name",
)
return parser.parse_args()
class StableDiffusionMGX():
def __init__(self):
model_id = "stabilityai/stable-diffusion-2-1"
print(f"Using {model_id}")
print("Creating EulerDiscreteScheduler scheduler")
self.scheduler = EulerDiscreteScheduler.from_pretrained(
model_id, subfolder="scheduler")
print("Creating CLIPTokenizer tokenizer...")
self.tokenizer = CLIPTokenizer.from_pretrained(model_id,
subfolder="tokenizer")
print("Load models...")
self.vae = StableDiffusionMGX.load_mgx_model(
"vae_decoder", {"latent_sample": [1, 4, 64, 64]})
self.text_encoder = StableDiffusionMGX.load_mgx_model(
"text_encoder", {"input_ids": [1, 77]})
self.unet = StableDiffusionMGX.load_mgx_model(
"unet", {
"sample": [1, 4, 64, 64],
"encoder_hidden_states": [1, 77, 1024],
"timestep": [1],
})
def run(self, prompt, negative_prompt, steps, seed, scale):
# need to set this for each run
self.scheduler.set_timesteps(steps)
print("Tokenizing prompt...")
text_input = self.tokenize(prompt)
print("Creating text embeddings for prompt...")
text_embeddings = self.get_embeddings(text_input)
print("Tokenizing negative prompt...")
uncond_input = self.tokenize(negative_prompt)
print("Creating text embeddings for negative prompt...")
uncond_embeddings = self.get_embeddings(uncond_input)
print(
f"Creating random input data ({1}x{4}x{64}x{64}) (latents) with seed={seed}..."
)
latents = torch.randn((1, 4, 64, 64),
generator=torch.manual_seed(seed))
print("Apply initial noise sigma\n")
latents = latents * self.scheduler.init_noise_sigma
print("Running denoising loop...")
for step, t in enumerate(self.scheduler.timesteps):
print(f"#{step}/{len(self.scheduler.timesteps)} step")
latents = self.denoise_step(text_embeddings, uncond_embeddings,
latents, t, scale)
print("Scale denoised result...")
latents = 1 / 0.18215 * latents
print("Decode denoised result...")
image = self.decode(latents)
return image
@staticmethod
@measure
def load_mgx_model(name, shapes):
file = f"models/sd21-onnx/{name}/model"
print(f"Loading {name} model from {file}")
if os.path.isfile(f"{file}.mxr"):
print("Found mxr, loading it...")
model = mgx.load(f"{file}.mxr", format="msgpack")
elif os.path.isfile(f"{file}.onnx"):
print("Parsing from onnx file...")
model = mgx.parse_onnx(f"{file}.onnx", map_input_dims=shapes)
model.compile(mgx.get_target("gpu"))
print(f"Saving {name} model to mxr file...")
mgx.save(model, f"{file}.mxr", format="msgpack")
else:
print(f"No {name} model found. Please download it and re-try.")
os.exit(1)
return model
@measure
def tokenize(self, input):
return self.tokenizer([input],
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
return_tensors="np")
@measure
def get_embeddings(self, input):
return np.array(
self.text_encoder.run(
{"input_ids":
input.input_ids.astype(np.int32)})[0]).astype(np.float32)
@staticmethod
def convert_to_rgb_image(image):
image = np.clip(image / 2 + 0.5, 0, 1)
image = np.transpose(image, (0, 2, 3, 1))
images = (image * 255).round().astype("uint8")
return Image.fromarray(images[0])
@staticmethod
def save_image(pil_image, filename="output.png"):
pil_image.save(filename)
@measure
def denoise_step(self, text_embeddings, uncond_embeddings, latents, t,
scale):
sample = self.scheduler.scale_model_input(latents,
t).numpy().astype(np.float32)
timestep = np.atleast_1d(t.numpy().astype(
np.int64)) # convert 0D -> 1D
noise_pred_uncond = np.array(
self.unet.run({
"sample": sample,
"encoder_hidden_states": uncond_embeddings,
"timestep": timestep
})[0])
noise_pred_text = np.array(
self.unet.run({
"sample": sample,
"encoder_hidden_states": text_embeddings,
"timestep": timestep
})[0])
# perform guidance
noise_pred = noise_pred_uncond + scale * (noise_pred_text -
noise_pred_uncond)
# compute the previous noisy sample x_t -> x_t-1
return self.scheduler.step(torch.from_numpy(noise_pred), t,
latents).prev_sample
@measure
def decode(self, latents):
return np.array(
self.vae.run({"latent_sample":
latents.numpy().astype(np.float32)})[0])
if __name__ == "__main__":
args = get_args()
sd = StableDiffusionMGX()
result = sd.run(args.prompt, args.negative_prompt, args.steps, args.seed,
args.scale)
print("Convert result to rgb image...")
image = StableDiffusionMGX.convert_to_rgb_image(result)
filename = args.output if args.output else f"output_s{args.seed}_t{args.steps}.png"
StableDiffusionMGX.save_image(image, args.output)
print(f"Image saved to {filename}")
......@@ -29,4 +29,4 @@ pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
ROCmSoftwarePlatform/rocMLIR@13f6c2a69cfe80a575c6b241ec7353d1e953cb12 -DBUILD_FAT_LIBROCKCOMPILER=On
ROCmSoftwarePlatform/rocMLIR@9e66e8050209f03349a41b6b497f0da2b285a53b -DBUILD_FAT_LIBROCKCOMPILER=On
......@@ -28,9 +28,9 @@ include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(RegisterOp)
include(CheckCXXLinkerFlag)
include(CheckCXXSourceCompiles)
add_library(migraphx
add_library(migraphx
adjust_allocation.cpp
analyze_streams.cpp
apply_alpha_beta.cpp
......@@ -105,6 +105,12 @@ add_library(migraphx
value.cpp
verify_args.cpp
)
if(WIN32)
# Due to compilation crashing, we need to use type-erased matchers on Windows.
target_compile_definitions(migraphx PUBLIC MIGRAPHX_USE_TYPE_ERASED_MATCHERS=1)
endif()
configure_file(version.h.in include/migraphx/version.h)
rocm_set_soversion(migraphx ${MIGRAPHX_SO_VERSION})
function(register_migraphx_ops)
......@@ -176,6 +182,7 @@ register_migraphx_ops(
mul
multibroadcast
multinomial
nearbyint
neg
nonmaxsuppression
nonzero
......@@ -206,7 +213,6 @@ register_migraphx_ops(
rnn_last_hs_output
rnn_var_sl_last_output
roialign
round
rsqrt
run_on_target
scalar
......@@ -216,6 +222,8 @@ register_migraphx_ops(
scatternd_add
scatternd_mul
scatternd_none
scatternd_max
scatternd_min
select_module
sigmoid
sign
......@@ -234,6 +242,7 @@ register_migraphx_ops(
transpose
unary_not
undefined
unique
unknown
unsqueeze
where
......@@ -248,17 +257,62 @@ rocm_install_targets(
${CMAKE_CURRENT_BINARY_DIR}/include
)
check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM)
if(HAS_LIB_STD_FILESYSTEM)
target_link_libraries(migraphx PRIVATE -lstdc++fs)
if(NOT WIN32)
check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM)
if(HAS_LIB_STD_FILESYSTEM)
target_link_libraries(migraphx PRIVATE -lstdc++fs)
endif()
target_link_libraries(migraphx PRIVATE -ldl)
endif()
target_link_libraries(migraphx PRIVATE -ldl)
target_include_directories(migraphx SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
target_link_libraries(migraphx PUBLIC Threads::Threads)
function(check_execution_par RESULT)
set(CMAKE_REQUIRED_LIBRARIES ${ARGN})
set(CMAKE_REQUIRED_FLAGS)
if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-std=c++17")
endif()
string(MD5 _flags_hash "${CMAKE_REQUIRED_FLAGS} ${CMAKE_REQUIRED_LIBRARIES}")
set(_source "
#include <execution>
int main() {
int* i = nullptr;
std::sort(std::execution::par, i, i);
}
")
check_cxx_source_compiles("${_source}" _has_execution_${_flags_hash})
set(${RESULT} ${_has_execution_${_flags_hash}} PARENT_SCOPE)
endfunction()
set(MIGRAPHX_HAS_EXECUTORS_DEFAULT Off)
find_package(TBB QUIET)
if(TBB_FOUND)
check_execution_par(TBB_HAS_EXECUTION_PAR TBB::tbb)
if(TBB_HAS_EXECUTION_PAR)
list(APPEND PACKAGE_DEPENDS PACKAGE TBB)
target_link_libraries(migraphx PUBLIC TBB::tbb)
set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
message(STATUS "Using TBB for parallel execution")
endif()
else()
check_execution_par(HAS_EXECUTION_PAR)
if(HAS_EXECUTION_PAR)
set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
endif()
endif()
option(MIGRAPHX_HAS_EXECUTORS "C++ supports parallel executors" ${MIGRAPHX_HAS_EXECUTORS_DEFAULT})
if(MIGRAPHX_HAS_EXECUTORS)
message("Parallel STL enabled")
target_compile_definitions(migraphx PUBLIC MIGRAPHX_HAS_EXECUTORS=1)
else()
message("Parallel STL disabled")
target_compile_definitions(migraphx PUBLIC MIGRAPHX_HAS_EXECUTORS=0)
endif()
find_package(nlohmann_json 3.8.0 REQUIRED)
target_link_libraries(migraphx PRIVATE nlohmann_json::nlohmann_json)
migraphx_generate_export_header(migraphx)
......@@ -276,8 +330,6 @@ target_link_libraries(migraphx INTERFACE $<BUILD_INTERFACE:msgpackc-cxx>)
add_library(migraphx_all_targets INTERFACE)
set(PACKAGE_DEPENDS)
add_subdirectory(api)
add_subdirectory(driver)
add_subdirectory(onnx)
......
......@@ -44,7 +44,8 @@
m(int32_type, int32_t) \
m(int64_type, int64_t) \
m(uint32_type, uint32_t) \
m(uint64_type, uint64_t)
m(uint64_type, uint64_t) \
m(fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz)
// clang-format on
#ifdef __cplusplus
......
......@@ -105,6 +105,8 @@ inline std::ostream& operator<<(std::ostream& os, const color& c)
static const bool use_color = isatty(STDOUT_FILENO) != 0;
if(use_color)
return os << "\033[" << static_cast<std::size_t>(c) << "m";
#else
(void)c;
#endif
return os;
}
......
......@@ -59,6 +59,13 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
inline std::string get_version()
{
return "MIGraphX Version: " + std::to_string(MIGRAPHX_VERSION_MAJOR) + "." +
std::to_string(MIGRAPHX_VERSION_MINOR) + "." + std::to_string(MIGRAPHX_VERSION_PATCH) +
"." MIGRAPHX_VERSION_TWEAK;
}
struct loader
{
std::string model;
......@@ -597,16 +604,6 @@ struct verify : command<verify>
}
};
struct version : command<version>
{
void parse(const argument_parser&) {}
void run() const
{
std::cout << "MIGraphX Version: " << MIGRAPHX_VERSION_MAJOR << "." << MIGRAPHX_VERSION_MINOR
<< "." << MIGRAPHX_VERSION_PATCH << "." MIGRAPHX_VERSION_TWEAK << std::endl;
}
};
struct compile : command<compile>
{
compiler c;
......@@ -759,16 +756,14 @@ struct main_command
}
void parse(argument_parser& ap)
{
std::string version_str = "MIGraphX Version: " + std::to_string(MIGRAPHX_VERSION_MAJOR) +
"." + std::to_string(MIGRAPHX_VERSION_MINOR) + "." +
std::to_string(MIGRAPHX_VERSION_PATCH) +
"." MIGRAPHX_VERSION_TWEAK;
std::string version_str = get_version();
ap(wrong_commands, {}, ap.metavar("<command>"), ap.append());
ap(nullptr, {"-h", "--help"}, ap.help("Show help"), ap.show_help(get_command_help()));
ap(nullptr,
{"-v", "--version"},
ap.help("Show MIGraphX version"),
ap.show_help(version_str));
ap(nullptr, {"--ort-sha"}, ap.help("Show MIGraphX onnx runtime SHA"));
// Trim command off of exe name
ap.set_exe_name(ap.get_exe_name().substr(0, ap.get_exe_name().size() - 5));
......@@ -811,7 +806,6 @@ using namespace migraphx::driver; // NOLINT
int main(int argc, const char* argv[])
{
std::vector<std::string> args(argv + 1, argv + argc);
// no argument, print the help infomration by default
if(args.empty())
{
......@@ -821,15 +815,27 @@ int main(int argc, const char* argv[])
auto&& m = get_commands();
auto cmd = args.front();
if(cmd == "ort-sha")
if(cmd == "--ort-sha")
{
std::cout << MIGRAPHX_ORT_SHA1 << std::endl;
return 0;
}
if(cmd == "-v" or cmd == "--version")
{
std::cout << get_version() << std::endl;
return 0;
}
if(m.count(cmd) > 0)
{
m.at(cmd)(argv[0], {args.begin() + 1, args.end()});
std::string driver_invocation =
std::string(argv[0]) + " " + migraphx::to_string_range(args, " ");
std::cout << "Running [ " << get_version() << " ]: " << driver_invocation << std::endl;
m.at(cmd)(argv[0],
{args.begin() + 1, args.end()}); // run driver command found in commands map
std::cout << "[ " << get_version() << " ] Complete: " << driver_invocation << std::endl;
}
else
{
......
......@@ -119,6 +119,7 @@ void verify_program(const std::string& name,
auto target_outs = run_target(p, t, options, quantize, inputs);
std::size_t output_num = ref_outs.size();
bool passed = true;
for(std::size_t i = 0; i < output_num; ++i)
{
if(ref_outs[i].get_shape().type() != target_outs[i].get_shape().type() or
......@@ -130,9 +131,11 @@ void verify_program(const std::string& name,
}
else
{
verify_args(name, target_outs[i], verify::expected{ref_outs[i]}, tols);
passed &= verify_args(name, target_outs[i], verify::expected{ref_outs[i]}, tols);
}
}
if(passed)
std::cout << "MIGraphX verification passed successfully." << std::endl;
}
void verify_instructions(const program& prog,
......
......@@ -130,6 +130,30 @@ struct dynamic_loader_impl
tmp_dir temp;
};
fs::path dynamic_loader::path(void* address)
{
HMODULE module = nullptr;
if(GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
static_cast<LPCSTR>(address),
&module) == 0)
{
auto err = GetLastError();
MIGRAPHX_THROW("Unable to obtain module handle, error = " + std::to_string(err));
}
TCHAR buffer[MAX_PATH];
if(GetModuleFileName(module, buffer, sizeof(buffer)) == 0)
{
auto err = GetLastError();
MIGRAPHX_THROW("Unable to read module file path, error = " + std::to_string(err));
}
if(GetLastError() == ERROR_INSUFFICIENT_BUFFER)
{
MIGRAPHX_THROW("Buffer too small (" + std::to_string(MAX_PATH) + ") to hold the path");
}
return {buffer};
}
#endif
optional<dynamic_loader> dynamic_loader::try_load(const fs::path& p)
......
......@@ -219,9 +219,8 @@ struct find_pointwise_reshape_pointwise
auto reshape_input = [&](const auto& ins_to_insert) {
return [&](auto input) {
auto c = m.insert_instruction(ins_to_insert, make_op("contiguous"), input);
return m.insert_instruction(
ins_to_insert, make_op("reshape", {{"dims", cd.dims}}), c);
ins_to_insert, make_op("reshape", {{"dims", cd.dims}}), input);
};
};
auto x_inputs = x_ins->inputs();
......
/* ************************************************************************
* Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
* ies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ************************************************************************ */
#ifndef MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
#define MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
#include <migraphx/config.hpp>
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define MIGRAPHX_CONST_FOLD(x) (__builtin_constant_p(x) ? (x) : (x))
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
template <typename To, typename From>
inline constexpr To bit_cast(From fr) noexcept
{
static_assert(sizeof(To) == sizeof(From));
#if defined(__GNUC__) and !defined(__clang__)
return MIGRAPHX_CONST_FOLD(*reinterpret_cast<To*>(&fr));
#else
return __builtin_bit_cast(To, fr);
#endif
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
#endif // MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
......@@ -38,15 +38,12 @@ struct dynamic_loader_impl;
struct MIGRAPHX_EXPORT dynamic_loader
{
#ifndef _WIN32
template <class T>
static fs::path path(T* address)
{
return path(reinterpret_cast<void*>(address));
}
static fs::path path(void* address);
#endif
static optional<dynamic_loader> try_load(const fs::path& p);
dynamic_loader() = default;
......
/* ************************************************************************
* Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
* ies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ************************************************************************ */
#ifndef MIGRAPHX_GUARD_RTGLIB_FLOAT8_HPP
#define MIGRAPHX_GUARD_RTGLIB_FLOAT8_HPP
// We are clipping/saturation in down conversion by default. Unclipped version is not tested and
// shouldn't be used without having enough tests.
// logic is based on clipping table from here : https://onnx.ai/onnx/technical/float8.html#cast
// NOLINTNEXTLINE
#define MIGRAPHX_F8_DOWNCAST_CLIPPING 1
#include <cmath>
#include <cstdint>
#include <climits>
#include <cstring>
#include <iosfwd>
#include <limits>
#include <sstream>
#include <iostream>
#include <string>
#include <utility>
#include <migraphx/config.hpp>
#include <migraphx/float8_impl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fp8 {
enum class rounding_mode
{
standard, // standard rounding is doing RNE -- round to nearest even
stochastic
};
enum class f8_type
{
bf8 = 0, // s1e5m2
fp8 = 1 // s1e4m3
};
template <typename T, bool FNUZ = true>
class numeric_limits;
template <migraphx::fp8::f8_type T = migraphx::fp8::f8_type::fp8, bool FNUZ = true>
struct float8
{
uint8_t data = 0x00;
// default constructor
constexpr float8() = default;
// default copy constructor
constexpr float8(const float8& y) = default;
struct from_bits_t
{
};
static constexpr from_bits_t from_bits() { return from_bits_t(); }
explicit constexpr float8(uint8_t bits, from_bits_t) : data(bits) {}
explicit constexpr float8(
float v,
migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
uint32_t rng = 0)
{
if constexpr(T == migraphx::fp8::f8_type::fp8)
{
#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
data = migraphx::fp8::impl::
cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, true /*clip*/>(
v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#else // MIGRAPHX_F8_DOWNCAST_CLIPPING
data = migraphx::fp8::impl::
cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, false /*clip*/>(
v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING
}
else
{
#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
data = migraphx::fp8::impl::
cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, true /*clip*/>(
v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#else // MIGRAPHX_F8_DOWNCAST_CLIPPING
data = migraphx::fp8::impl::
cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, false /*clip*/>(
v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#endif // rocblas_F8_downcast_clipping}
}
}
inline constexpr operator float() const
{
if constexpr(T == migraphx::fp8::f8_type::fp8)
{
return migraphx::fp8::impl::cast_from_f8<3, 4, float, FNUZ /*negative_zero_nan*/>(data);
} // else
return migraphx::fp8::impl::cast_from_f8<2, 5, float, FNUZ /*negative_zero_nan*/>(data);
}
inline constexpr bool is_zero() const
{
if constexpr(FNUZ)
{
return data == 0x00;
}
else
{
return (data == 0x00) or (data == 0x80);
}
}
inline constexpr bool is_nan() const
{
if constexpr(FNUZ)
{
return data == 0x80;
}
else
{
if(T == migraphx::fp8::f8_type::bf8)
{
return (data == 0x7D) or (data == 0x7E) or (data == 0x7F) or (data == 0xFD) or
(data == 0xFE) or (data == 0xFF);
}
else
{
return (data == 0x7F) or (data == 0xFF);
}
}
}
inline constexpr bool is_inf() const
{
if constexpr(FNUZ)
{
return data == 0x80;
}
else
{
if(T == migraphx::fp8::f8_type::bf8)
{
return (data == 0x7C) or (data == 0xFC);
}
else
{
// no infinities in e4m3fn, represent them as NaNs
return (data == 0x7F) or (data == 0xFF);
}
}
}
// NOLINTNEXTLINE
#define MIGRAPHX_FP8_UNARY_OP(unary_op, binary_op) \
constexpr float8& operator unary_op(const float8& rhs) \
{ \
const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs); \
*this = static_cast<float8>(tmp); \
return *this; \
} \
constexpr float8& operator unary_op(const float& rhs) \
{ \
const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs); \
*this = static_cast<float8>(tmp); \
return *this; \
}
MIGRAPHX_FP8_UNARY_OP(*=, *)
MIGRAPHX_FP8_UNARY_OP(-=, -)
MIGRAPHX_FP8_UNARY_OP(+=, +)
MIGRAPHX_FP8_UNARY_OP(/=, /)
inline constexpr float8& operator=(const float8& rhs) = default;
inline constexpr float8& operator=(float8&& rhs) noexcept = default;
inline constexpr float8& operator=(float rhs)
{
*this = static_cast<float8>(rhs);
return *this;
}
inline constexpr bool operator==(const float8& rhs) const
{
if(rhs.is_nan() or rhs.is_inf() or this->is_nan() or this->is_inf())
return false;
else if((rhs.is_zero() and this->is_zero()) or (this->data == rhs.data))
return true;
return false;
}
inline constexpr bool operator<(const float8& rhs) const
{
const auto we = static_cast<float>(*this);
const auto them = static_cast<float>(rhs);
return we < them;
}
inline constexpr bool operator>(const float8& rhs) const
{
const auto we = static_cast<float>(*this);
const auto them = static_cast<float>(rhs);
return we > them;
}
};
// https://onnx.ai/onnx/technical/float8.html
using fp8e4m3fn = float8<migraphx::fp8::f8_type::fp8, false>;
using fp8e5m2 = float8<migraphx::fp8::f8_type::bf8, false>;
using fp8e4m3fnuz = float8<migraphx::fp8::f8_type::fp8, true>;
using fp8e5m2fnuz = float8<migraphx::fp8::f8_type::bf8, true>;
/*
// NOLINTNEXTLINE
#define MIGRAPHX_FP8_BINARY_OP(binary_op, T, U) \
inline constexpr U operator binary_op(const T& lhs, const T& rhs) \
{ \
return U(static_cast<float>(lhs) binary_op static_cast<float>(rhs)); \
}
// TODO: these should return floats for binary ops
// NOLINTNEXTLINE
#define MIGRAPHX_FP8_BINARY_OP_GEN_FOR(T) \
MIGRAPHX_FP8_BINARY_OP(*, T, T) \
MIGRAPHX_FP8_BINARY_OP(-, T, T) \
MIGRAPHX_FP8_BINARY_OP(/, T, T) \
MIGRAPHX_FP8_BINARY_OP(+, T, T) \
MIGRAPHX_FP8_BINARY_OP(==, T, bool) \
MIGRAPHX_FP8_BINARY_OP(>=, T, bool) \
MIGRAPHX_FP8_BINARY_OP(<=, T, bool) \
MIGRAPHX_FP8_BINARY_OP(>, T, bool) \
MIGRAPHX_FP8_BINARY_OP(<, T, bool) \
MIGRAPHX_FP8_BINARY_OP(!=, T, bool)
MIGRAPHX_FP8_BINARY_OP_GEN_FOR(fp8e5m2)
MIGRAPHX_FP8_BINARY_OP_GEN_FOR(fp8e4m3fn)
MIGRAPHX_FP8_BINARY_OP_GEN_FOR(fp8e5m2fnuz)
MIGRAPHX_FP8_BINARY_OP_GEN_FOR(fp8e4m3fnuz)
*/
// Special operator overloading
inline std::ostream& operator<<(std::ostream& os, const fp8e4m3fnuz& rhs)
{
return os << static_cast<float>(rhs);
}
inline fp8e4m3fnuz fabs(fp8e4m3fnuz v)
{
v.data = v.data & 0x7F; // NOLINT
return v;
}
// Special operator overloading
inline std::ostream& operator<<(std::ostream& os, const fp8e4m3fn& rhs)
{
return os << static_cast<float>(rhs);
}
inline fp8e4m3fn fabs(fp8e4m3fn v)
{
v.data = v.data & 0x7F; // NOLINT
return v;
}
// Special operator overloading
inline std::ostream& operator<<(std::ostream& os, const fp8e5m2fnuz& rhs)
{
return os << static_cast<float>(rhs);
}
inline fp8e5m2fnuz fabs(fp8e5m2fnuz v)
{
v.data = v.data & 0x7F; // NOLINT
return v;
}
// Special operator overloading
inline std::ostream& operator<<(std::ostream& os, const fp8e5m2& rhs)
{
return os << static_cast<float>(rhs);
}
inline fp8e5m2 fabs(fp8e5m2 v)
{
v.data = v.data & 0x7F; // NOLINT
return v;
}
template <>
class numeric_limits<fp8e4m3fnuz>
{
public:
static constexpr bool has_infinity = false;
static constexpr fp8e4m3fnuz epsilon() { return fp8e4m3fnuz(0x28, fp8e4m3fnuz::from_bits()); }
// NOLINTNEXTLINE
static constexpr fp8e4m3fnuz quiet_NaN() { return fp8e4m3fnuz(0x80, fp8e4m3fnuz::from_bits()); }
static constexpr fp8e4m3fnuz max() { return fp8e4m3fnuz(0x7F, fp8e4m3fnuz::from_bits()); }
// this is min value that is not DeNorm. DeNorm min is 0x01
static constexpr fp8e4m3fnuz min() { return fp8e4m3fnuz(0x08, fp8e4m3fnuz::from_bits()); }
static constexpr fp8e4m3fnuz lowest() { return fp8e4m3fnuz(0xFF, fp8e4m3fnuz::from_bits()); }
};
template <>
class numeric_limits<fp8e4m3fn>
{
public:
static constexpr bool has_infinity = false;
static constexpr fp8e4m3fn epsilon() { return fp8e4m3fn(0x20, fp8e4m3fn::from_bits()); }
// NOLINTNEXTLINE
static constexpr fp8e4m3fn quiet_NaN() { return fp8e4m3fn(0x7F, fp8e4m3fn::from_bits()); }
static constexpr fp8e4m3fn max() { return fp8e4m3fn(0x7E, fp8e4m3fn::from_bits()); }
// this is min value that is not DeNorm. DeNorm min is 0x01
static constexpr fp8e4m3fn min() { return fp8e4m3fn(0x08, fp8e4m3fn::from_bits()); }
static constexpr fp8e4m3fn lowest() { return fp8e4m3fn(0xFE, fp8e4m3fn::from_bits()); }
};
template <>
class numeric_limits<fp8e5m2fnuz>
{
public:
static constexpr bool has_infinity = false;
static constexpr fp8e5m2fnuz epsilon() { return fp8e5m2fnuz(0x34, fp8e5m2fnuz::from_bits()); }
static constexpr fp8e5m2fnuz quiet_NaN() // NOLINT
{
return fp8e5m2fnuz(0x80, fp8e5m2fnuz::from_bits());
}
static constexpr fp8e5m2fnuz max() { return fp8e5m2fnuz(0x7F, fp8e5m2fnuz::from_bits()); }
// this is min value that is not DeNorm. DeNorm min is 0x01. I am not sure if we want to make
// this distinction. For the floating points we would end up using lowest most of the times.
static constexpr fp8e5m2fnuz min() { return fp8e5m2fnuz(0x4, fp8e5m2fnuz::from_bits()); }
static constexpr fp8e5m2fnuz lowest() { return fp8e5m2fnuz(0xFF, fp8e5m2fnuz::from_bits()); }
};
template <>
class numeric_limits<fp8e5m2>
{
public:
static constexpr bool has_infinity = true;
static constexpr fp8e5m2 epsilon() { return fp8e5m2(0x34, fp8e5m2::from_bits()); }
// 7D, 7E, 7F are positive NaNs and FD, FE, FF are negative NaNs
static constexpr fp8e5m2 quiet_NaN() { return fp8e5m2(0xFF, fp8e5m2::from_bits()); } // NOLINT
static constexpr fp8e5m2 max() { return fp8e5m2(0x7B, fp8e5m2::from_bits()); }
// this is min value that is not DeNorm. DeNorm min is 0x01. I am not sure if we want to make
// this distinction. For the floating points we would end up using lowest most of the times.
static constexpr fp8e5m2 min() { return fp8e5m2(0x4, fp8e5m2::from_bits()); }
static constexpr fp8e5m2 lowest() { return fp8e5m2(0xFB, fp8e5m2::from_bits()); }
// 7C and FC both are infinity
static constexpr fp8e5m2 infinity() { return fp8e5m2(0x7C, fp8e5m2::from_bits()); }
};
} // namespace fp8
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
// =================================================================================================
// define numeric limits for the new data type
// NOLINTBEGIN
namespace std {
#define MIGRAPHX_FP8_STD_OVERLOADS(T) \
inline bool isfinite(T x) { return not x.is_inf() and not x.is_nan(); } \
inline bool isnan(T x) { return x.is_nan(); } \
template <> \
class numeric_limits<T> : public migraphx::fp8::numeric_limits<T> \
{ \
}; \
template <class U> \
struct common_type<T, U> : std::common_type<float, U> \
{ \
}; \
template <class U> \
struct common_type<U, T> : std::common_type<float, U> \
{ \
}; \
template <> \
struct common_type<T, T> \
{ \
using type = T; \
};
MIGRAPHX_FP8_STD_OVERLOADS(migraphx::fp8::fp8e4m3fn)
MIGRAPHX_FP8_STD_OVERLOADS(migraphx::fp8::fp8e5m2)
MIGRAPHX_FP8_STD_OVERLOADS(migraphx::fp8::fp8e4m3fnuz)
MIGRAPHX_FP8_STD_OVERLOADS(migraphx::fp8::fp8e5m2fnuz)
} // namespace std
// NOLINTEND
// =================================================================================================
#endif // MIGRAPHX_GUARD_RTGLIB_FLOAT8_HPP
/* ************************************************************************
* Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
* ies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ************************************************************************ */
#ifndef MIGRAPHX_GUARD_RTGLIB_FLOAT8_IMPL_HPP
#define MIGRAPHX_GUARD_RTGLIB_FLOAT8_IMPL_HPP
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include <migraphx/config.hpp>
#include <migraphx/bit_cast.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace fp8 {
namespace impl {
// NOLINTBEGIN
template <uint32_t Wm, uint32_t We, typename T, bool NegativeZeroNan, bool Clip>
constexpr uint8_t cast_to_f8(T f_x, bool stoch = false, uint32_t rng = 0)
{
constexpr bool is_float = std::is_same<T, float>::value;
// half is not supported for now
constexpr bool is_half = false;
static_assert(Wm + We == 7, "Wm+We==7");
static_assert(is_float or is_half, "Only float can be cast to f8");
const uint32_t mfmt = (sizeof(T) == 4) ? 23 : 10;
typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type x;
if constexpr(sizeof(T) == 4)
x = migraphx::bit_cast<uint32_t>(f_x);
else
x = migraphx::bit_cast<uint16_t>(f_x);
uint32_t head = 0;
uint32_t mantissa = 0;
int exponent = 0;
uint32_t bias = 0;
uint32_t sign = 0;
if constexpr(sizeof(T) == 4)
{
head = x & 0xFF800000;
mantissa = x & 0x7FFFFF;
exponent = (head >> 23) & 0xFF;
sign = head >> 31;
bias = 127;
}
else
{
head = x & 0xFC00;
mantissa = x & 0x3FF;
exponent = (head >> 10) & 0x1F;
sign = head >> 15;
bias = 15;
}
uint32_t signed_inf = (sign << 7) + (((1 << We) - 1) << Wm);
uint32_t signed_all_ones = (sign << 7) + ((((1 << We) - 1) << Wm) + ((1 << Wm) - 1));
// Calcualte maximum singed value FLT_MAX, FLT_MIN
uint32_t signed_max = signed_all_ones;
if(not NegativeZeroNan)
signed_max = (Wm == 2) ? (signed_max - 4) : (signed_max - 1);
// Deal with inf and NaNs
if(NegativeZeroNan) // For the FNUZ cases, it is simple just return NaNs
{
if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or
(sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00)))
return 0x80;
}
else
{
// calculate most common NaN mantissa for FP8, which is all Ones in binary
uint32_t nan_mantissa = 1;
for(auto i = 1; i < Wm; ++i)
{
nan_mantissa |= (nan_mantissa << 1);
}
if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or
(sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00)))
{
// infinity
if(mantissa == 0)
{
if(sign == 0)
return (Wm == 2) ? 0x7B : 0x7E;
else
return (Wm == 2) ? 0xFB : 0xFE;
}
else // NaNs
return signed_inf + nan_mantissa;
}
}
// handle positive zero
if(x == 0)
return 0;
// handle negative zero
else if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
{
return NegativeZeroNan ? 0 : 0x80; // For FNUZ types neg zero is just positive zero
}
/* First need to check if it is normal or denorm as there is a difference of implict 1
Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
RNE, no need to add rng. Then probably need to check whether there is carry and adjust
exponent and mantissa again*/
// For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
const int f8_bias = (1 << (We - 1u)) - 1 + (NegativeZeroNan ? 1 : 0);
const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
/* act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
f8_exponent is the converted f8 exponent with bias encoding
exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
the difference needs to be adjusted and mantissa shifted*/
int act_exponent = 0;
int f8_exponent = 0;
int exponent_diff = 0;
if(exponent == 0 and mantissa != 0)
{ // fp32/fp16 is in denormal.
/* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16
here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal
has exponent bias 15 while bf8 with FNUZ has exponent bias 16. It means that there are some
numbers in fp16 denormal but they are bf8 (FNUZ) normals - smallest bf8 (FNUZ) normal is
2^-15. fp16 numbers where exponent==0 (actual exponent -14) and highest bit of mantissa is 1
are bf8 (FNUZ) normal. In this case, the fp16 mantissa should be shift left by 1 */
act_exponent = 1 - bias;
exponent_diff = f8_denormal_act_exponent -
act_exponent; // actual exponent is exponent-bias+1 as it is denormal
}
else
{ // fp32/fp16 is normal with implicit 1
act_exponent = exponent - bias;
if(act_exponent <= f8_denormal_act_exponent)
{
/* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
For example fp8 FNUZ mode, denormal exponent is -7, but if the fp32/fp16
actual exponent is -7, it is actually larger due to the implict 1,
Therefore it needs to be adjust to -6 and mantissa shift right by 1.
So for fp32/fp16, exponent -8 is the cut point to convert to fp8 FNUZ */
exponent_diff = f8_denormal_act_exponent - act_exponent;
}
else
{ // both fp32/fp16 and f8 are in normal range
exponent_diff =
0; // exponent_diff=0 does not mean there is no difference for this case,
// act_exponent could be larger. Just that it does not need shift mantissa
}
mantissa += (1u << mfmt); // Add the implicit 1 into mantissa
}
// need to know whether the number is right in the middle of two adjacent fp8 numbers. use max
// value of 31 to avoid undefined behaviour
bool midpoint = (mantissa & ((1u << std::min(31u, mfmt - Wm + exponent_diff)) - 1)) ==
(1u << std::min(31u, mfmt - Wm + exponent_diff - 1));
/* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
shift right as shift right could rip off some residual part and make something not midpoint look
like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than
midpoint, but after shift right by 4 bits, it would look like midpoint.
*/
if(exponent_diff > 0)
mantissa >>= std::min(31u, uint32_t(exponent_diff));
else if(exponent_diff == -1)
mantissa <<= -exponent_diff;
bool implicit_one = mantissa & (1 << mfmt);
// if there is no implict 1, it means the f8 is denormal and need to adjust to denorm exponent
f8_exponent =
(act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
// Now we have the exponent and mantissa adjusted
uint32_t drop_mask = (1u << (mfmt - Wm)) - 1;
bool odd =
mantissa & (1u << (mfmt - Wm)); // if the least significant bit that is not truncated is 1
/*
This part is doing rounding by adding mantissa part that is going to get dropped.
e.g. if the dropped part for less than 0.5 than it would round down.
if the dropped part is more than 0.5 then it would round up by rolling carry to LSB of retained
mantissa.
For the mid point when bit pattern is like this for Odd: `xy1:10000000` for Odd and
`xy0:10000000` for the Even. where `:` is delimiter for dropped v/s retained part.
For the odd case :
this will add xy1:10000000 + 000:10000000 which would roll over carry to LSB of retained
part making it RNE.
For the even case : this will add xy0:10000000 + 000:01111111 which would
round down and keep number Even
*/
mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
// Now we deal with overflow
if(f8_exponent == 0 and ((1 << mfmt) & mantissa))
{
f8_exponent = 1; // denormal overflow to become normal, promote exponent
}
else if((1 << (mfmt + 1)) & mantissa)
{
mantissa >>= 1;
f8_exponent++;
}
mantissa >>= (mfmt - Wm);
// above range: quantize to maximum possible float of the same sign
// for e5m2 case, max_exp is 14, since exp = 15 is reserved for Infs and Nans
const int max_exp = (1 << We) - ((NegativeZeroNan or Wm == 3) ? 1 : 2);
if(f8_exponent > max_exp)
{
if(Clip)
return signed_max;
else
{
// https://onnx.ai/onnx/technical/float8.html#cast
if(NegativeZeroNan)
return 0x80;
else
return (Wm == 2) ? signed_inf : signed_all_ones;
}
}
if(f8_exponent == 0 and mantissa == 0)
return NegativeZeroNan ? 0 : (sign << 7);
mantissa &= (1 << Wm) - 1;
return (sign << 7) | (f8_exponent << Wm) | mantissa;
}
// NOLINTEND
template <uint32_t Wm, uint32_t We, typename T, bool NegativeZeroNan>
constexpr T cast_from_f8(uint8_t x)
{
// half is not supported for now
constexpr bool is_half = false;
constexpr bool is_float = std::is_same<T, float>::value;
static_assert(is_float or is_half, "Only float are supported");
constexpr int weo = is_half ? 5 : 8;
constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
// NOLINTNEXTLINE
T f_inf, f_neg_inf, f_nan, f_neg0;
if constexpr(is_float)
{
const uint32_t if_inf = 0x7F800000;
const uint32_t if_neg_inf = 0xFF800000;
const uint32_t if_nan = 0x7F800001;
const uint32_t if_neg0 = 0x80000000;
f_inf = migraphx::bit_cast<float>(if_inf);
f_neg_inf = migraphx::bit_cast<float>(if_neg_inf);
f_nan = migraphx::bit_cast<float>(if_nan);
f_neg0 = migraphx::bit_cast<float>(if_neg0);
}
if(x == 0)
return 0;
uint32_t sign = x >> 7; // NOLINT
uint32_t mantissa = x & ((1 << Wm) - 1); // NOLINT
int exponent = (x & 0x7F) >> Wm; // NOLINT
if(NegativeZeroNan)
{
if(x == 0x80)
return f_nan;
}
else
{
if(x == 0x80)
return f_neg0;
if(exponent == ((1 << We) - 1) and Wm == 2) // NOLINT
return (mantissa == 0) ? (sign ? f_neg_inf : f_inf) : f_nan;
else if(Wm == 3 and (x == 0x7F or x == 0xFF))
return f_nan;
}
typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
const int exp_low_cutoff =
(1 << (weo - 1)) - (1 << (We - 1)) + 1 - (NegativeZeroNan ? 1 : 0); // NOLINT
// subnormal input
if(exponent == 0)
{
// guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
int sh = 1 + __builtin_clz(mantissa) - (32 - Wm);
mantissa <<= sh; // NOLINT
exponent += 1 - sh;
mantissa &= ((1 << Wm) - 1); // NOLINT
}
exponent += exp_low_cutoff - 1;
mantissa <<= wmo - Wm; // NOLINT
// subnormal output (occurs when T=half, We=5, negative_zero_nan=true)
if(exponent <= 0)
{
mantissa |= 1 << wmo; // NOLINT
mantissa >>= 1 - exponent; // NOLINT
exponent = 0;
}
if(sizeof(T) == 2)
retval = (sign << 15) | (exponent << 10) | mantissa; // NOLINT
else
retval = (sign << 31) | (exponent << 23) | mantissa; // NOLINT
return migraphx::bit_cast<T>(retval);
}
} // namespace impl
} // namespace fp8
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_RTGLIB_FLOAT8_IMPL
......@@ -27,6 +27,7 @@
#include <half/half.hpp>
#include <migraphx/config.hpp>
#include <migraphx/float8.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -67,6 +68,18 @@ struct common_type<T, migraphx::half> : std::common_type<float, T> // NOLINT
{
};
template <>
struct common_type<migraphx::fp8::fp8e4m3fnuz, migraphx::half>
{
using type = float;
};
template <>
struct common_type<migraphx::half, migraphx::fp8::fp8e4m3fnuz>
{
using type = float;
};
template <>
struct common_type<migraphx::half, migraphx::half>
{
......
......@@ -591,6 +591,19 @@ MIGRAPHX_PRED_MATCHER(same_input_shapes, instruction_ref ins)
ins->inputs().begin(), ins->inputs().end(), [&](auto x) { return x->get_shape() == s; });
}
MIGRAPHX_PRED_MATCHER(has_same_value, instruction_ref ins)
{
if(ins->name() != "@literal")
return false;
bool all_same = false;
ins->get_literal().visit([&](auto s) {
all_same = std::all_of(s.begin() + 1, s.end(), [&](const auto& scale) {
return float_equal(scale, s.front());
});
});
return all_same;
}
MIGRAPHX_BASIC_MATCHER(output, const matcher_context&, instruction_ref ins)
{
if(ins->outputs().size() == 1)
......@@ -844,6 +857,12 @@ auto skip_broadcasts_converts(Ms... ms)
return skip(name("broadcast", "multibroadcast", "contiguous", "convert"))(ms...);
}
template <class... Ms>
auto skip_broadcasts_transposes_contiguous(Ms... ms)
{
return skip(name("broadcast", "multibroadcast", "contiguous", "transpose"))(ms...);
}
template <class T>
inline auto has_value(T x, float tolerance = 1e-6)
{
......
......@@ -42,6 +42,7 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_EXPORT
const operation& get_operation(instruction_ref ins);
struct module_impl;
......
......@@ -29,6 +29,7 @@
#include <migraphx/argument.hpp>
#include <migraphx/value.hpp>
#include <migraphx/dyn_output.hpp>
#include <migraphx/par.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -95,11 +96,11 @@ struct binary : op_name<Derived>
{
argument result{dyn_out.computed_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
std::transform(input1.begin(),
input1.end(),
input2.begin(),
output.begin(),
static_cast<const Derived&>(*this).apply());
par_transform(input1.begin(),
input1.end(),
input2.begin(),
output.begin(),
static_cast<const Derived&>(*this).apply());
});
return result;
}
......
......@@ -35,7 +35,7 @@ struct isinf : unary<isinf>
{
auto apply() const
{
return [&](auto x) { return std::isinf(x); };
return [&](auto x) { return std::isinf(static_cast<double>(x)); };
}
std::string name() const { return "isinf"; }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment