import argparse import contextlib import time import cupy import numpy # This sample computes call and put prices for European options with # Black-Scholes equation. It was based on a sample of the financial package # in CUDA toolkit. For details, please see the corresponding whitepaper. # # The following code shows that CuPy enables us to write algorithms for GPUs # without significantly modifying the existing NumPy's code. # It also briefly describes how to create your own kernel with CuPy. # If you want to speed up the existing code, please define the kernel # with cupy.ElementwiseKernel. # Naive implementation of the pricing of options with NumPy and CuPy. def black_scholes(xp, s, x, t, r, v): sqrt_t = xp.sqrt(t) d1 = (xp.log(s / x) + (r + v * v / 2) * t) / (v * sqrt_t) d2 = d1 - v * sqrt_t def get_cumulative_normal_distribution(x): A1 = 0.31938153 A2 = -0.356563782 A3 = 1.781477937 A4 = -1.821255978 A5 = 1.330274429 RSQRT2PI = 0.39894228040143267793994605993438 W = 0.2316419 k = 1 / (1 + W * xp.abs(x)) cnd = RSQRT2PI * xp.exp(-x * x / 2) * \ (k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5))))) cnd = xp.where(x > 0, 1 - cnd, cnd) return cnd cnd_d1 = get_cumulative_normal_distribution(d1) cnd_d2 = get_cumulative_normal_distribution(d2) exp_rt = xp.exp(- r * t) call = s * cnd_d1 - x * exp_rt * cnd_d2 put = x * exp_rt * (1 - cnd_d2) - s * (1 - cnd_d1) return call, put # An example of calling the kernel via cupy.ElementwiseKernel. # When executing __call__ method of the instance, it automatically compiles # the code depending on the types of the given arrays, and calls the kernel. # Other functions used inside the kernel can be defined by 'preamble' option. black_scholes_kernel = cupy.ElementwiseKernel( 'T s, T x, T t, T r, T v', # Inputs 'T call, T put', # Outputs ''' const T sqrt_t = sqrt(t); const T d1 = (log(s / x) + (r + v * v / 2) * t) / (v * sqrt_t); const T d2 = d1 - v * sqrt_t; const T cnd_d1 = get_cumulative_normal_distribution(d1); const T cnd_d2 = get_cumulative_normal_distribution(d2); const T exp_rt = exp(- r * t); call = s * cnd_d1 - x * exp_rt * cnd_d2; put = x * exp_rt * (1 - cnd_d2) - s * (1 - cnd_d1); ''', 'black_scholes_kernel', preamble=''' __device__ inline T get_cumulative_normal_distribution(T x) { const T A1 = 0.31938153; const T A2 = -0.356563782; const T A3 = 1.781477937; const T A4 = -1.821255978; const T A5 = 1.330274429; const T RSQRT2PI = 0.39894228040143267793994605993438; const T W = 0.2316419; const T k = 1 / (1 + W * abs(x)); T cnd = RSQRT2PI * exp(- x * x / 2) * (k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5))))); if (x > 0) { cnd = 1 - cnd; } return cnd; } ''', ) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--gpu-id', '-g', default=0, type=int, help='GPU ID') parser.add_argument('--n-options', '-n', default=10000000, type=int) args = parser.parse_args() cupy.cuda.Device(args.gpu_id).use() def rand_range(m, M): samples = cupy.random.rand(args.n_options) return (m + (M - m) * samples).astype(numpy.float64) print('initializing...') stock_price_gpu = rand_range(5, 30) option_strike_gpu = rand_range(1, 100) option_years_gpu = rand_range(0.25, 10) stock_price_cpu = stock_price_gpu.get() option_strike_cpu = option_strike_gpu.get() option_years_cpu = option_years_gpu.get() @contextlib.contextmanager def timer(message): cupy.cuda.Stream.null.synchronize() start = time.time() yield cupy.cuda.Stream.null.synchronize() end = time.time() print('%s:\t%f sec' % (message, end - start)) print('start computation') risk_free = 0.02 volatility = 0.3 with timer(' CPU (NumPy, Naive implementation)'): call_cpu, put_cpu = black_scholes( numpy, stock_price_cpu, option_strike_cpu, option_years_cpu, risk_free, volatility) with timer(' GPU (CuPy, Naive implementation)'): call_gpu1, put_gpu1 = black_scholes( cupy, stock_price_gpu, option_strike_gpu, option_years_gpu, risk_free, volatility) with timer(' GPU (CuPy, Elementwise kernel)'): call_gpu2, put_gpu2 = black_scholes_kernel( stock_price_gpu, option_strike_gpu, option_years_gpu, risk_free, volatility) # Check whether all elements in gpu arrays are equal to those of cpus cupy.testing.assert_allclose(call_cpu, call_gpu1) cupy.testing.assert_allclose(call_cpu, call_gpu2) cupy.testing.assert_allclose(put_cpu, put_gpu1) cupy.testing.assert_allclose(put_cpu, put_gpu2)