#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Numba documentation build configuration file, created by
# sphinx-quickstart on Tue Dec 30 11:55:40 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
try:
    # Numba is installed
    import numba
except ImportError:
    # Numba is run from its source checkout
    sys.path.insert(0, os.path.abspath('../..'))
    import numba

on_rtd = os.environ.get('READTHEDOCS') == 'True'
if on_rtd:
    # The following is needed to fix RTD issue with numpydoc
    # https://github.com/readthedocs/sphinx_rtd_theme/issues/766
    from conda.cli.python_api import run_command as conda_cmd
    conda_cmd("install", "-c", "conda-forge", "sphinx_rtd_theme>=0.5.1", "-y")
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    #'sphinx.ext.mathjax',
    'sphinx.ext.autodoc',
    #'sphinx.ext.graphviz',
    'numpydoc',
]
# Adding the github files extension
sys.path.append(os.path.abspath(os.path.join(".", "_ext")))
extensions.append('ghfiles')
todo_include_todos = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['../_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'Numba'
copyright = u'2012-2020, Anaconda, Inc. and others'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
version = '.'.join(numba.__version__.split('.')[:2])
# The full version, including alpha/beta/rc tags.
release = numba.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'sphinx_rtd_theme'
# All sphinx_rtd_theme options. Default values commented out; uncomment to
# change.
html_theme_options = {
    'canonical_url': 'https://numba.readthedocs.io/en/stable/',
    # 'logo_only': False,
    # 'display_version': True,
    # 'prev_next_buttons_location': 'bottom',
    'style_external_links': True,
    # 'vcs_pageview_mode': '',
    'style_nav_header_background': '#00A3E0',
    # Toc options
    'collapse_navigation': False,
    # 'sticky_navigation': True,
    # 'navigation_depth': 4,
    # 'includehidden': True,
    # 'titles_only': False
}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = None
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
html_logo = "../_static/numba-white-icon-rgb.svg"
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
html_favicon = '../_static/numba-blue-icon-rgb.svg'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['../_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'Numbadoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    ('index', 'numba.tex', u'Numba Documentation',
     u'Anaconda', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'numba', 'Numba Documentation',
     ['Anaconda'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    ('index', 'Numba', 'Numba Documentation',
     'Anaconda', 'Numba', 'One line description of project.',
     'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Configuration for intersphinx: refer to the Python standard library
# and the Numpy documentation.
intersphinx_mapping = {
    'python': ('https://docs.python.org/3', None),
    'numpy': ('https://numpy.org/doc/stable/', None),
    'llvmlite': ('https://llvmlite.readthedocs.io/en/latest/', None),
}
# numpydoc options
# To silence "WARNING: toctree contains reference to nonexisting document"
numpydoc_show_class_members = False
# -- Custom autogeneration ------------------------------------------------
def _autogenerate():
    from numba.scripts.generate_lower_listing import gen_lower_listing
    from numba.misc.help.inspector import write_listings

    basedir = os.path.dirname(__file__)
    gen_lower_listing(os.path.join(basedir,
                                   'developer/autogen_lower_listing.rst'))
    # Run inspector on supported packages
    for package in ['builtins', 'math', 'cmath', 'numpy']:
        write_listings(
            package_name=package,
            filename=os.path.join(
                basedir, 'developer', 'autogen_{}_listing'.format(package),
            ),
            output_format='rst',
        )

_autogenerate()


def setup(app):
    app.add_css_file('rtd-overrides.css')
CUDA Host API
=============
Device Management
-----------------
Device detection and enquiry
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following functions are available for querying the available hardware:
.. autofunction:: numba.cuda.is_available
.. autofunction:: numba.cuda.detect
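
As a minimal sketch, a script can guard GPU-specific setup with a quick
availability check:

.. code-block:: python

   from numba import cuda

   if cuda.is_available():
       cuda.detect()  # print a summary of the supported devices found
   else:
       print("No CUDA-capable device is available")
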
Context management
~~~~~~~~~~~~~~~~~~
CUDA Python functions execute within a CUDA context. Each CUDA device in a
system has an associated CUDA context, and Numba presently allows only one context
per thread. For further details on CUDA Contexts, refer to the `CUDA Driver API
Documentation on Context Management
<http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html>`_ and the
`CUDA C Programming Guide Context Documentation
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#context>`_. CUDA Contexts
are instances of the :class:`~numba.cuda.cudadrv.driver.Context` class:
.. autoclass:: numba.cuda.cudadrv.driver.Context
   :members: reset, get_memory_info, push, pop
The following functions can be used to get or select the context:
.. autofunction:: numba.cuda.current_context
.. autofunction:: numba.cuda.require_context
The following functions affect the current context:
.. autofunction:: numba.cuda.synchronize
.. autofunction:: numba.cuda.close
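
As a brief sketch, a typical use is to query the context for free memory and
then wait for outstanding work to complete:

.. code-block:: python

   from numba import cuda

   ctx = cuda.current_context()         # get (or create) this thread's context
   free, total = ctx.get_memory_info()  # free and total device memory, in bytes
   print(free, total)
   cuda.synchronize()                   # block until all queued work completes
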
Device management
~~~~~~~~~~~~~~~~~
Numba maintains a list of supported CUDA-capable devices:
.. attribute:: numba.cuda.gpus

   An indexable list of supported CUDA devices. This list is indexed by integer
   device ID.

Alternatively, the current device can be obtained:

.. function:: numba.cuda.gpus.current

   Return the currently-selected device.
Getting a device through :attr:`numba.cuda.gpus` always provides an instance of
:class:`numba.cuda.cudadrv.devices._DeviceContextManager`, which acts as a
context manager for the selected device:
.. autoclass:: numba.cuda.cudadrv.devices._DeviceContextManager
One may also select a context and device or get the current device using the
following three functions:
.. autofunction:: numba.cuda.select_device
.. autofunction:: numba.cuda.get_current_device
.. autofunction:: numba.cuda.list_devices
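
As an illustrative sketch, a device can be selected explicitly, or the
selection can be scoped to a block using the device list:

.. code-block:: python

   from numba import cuda

   cuda.select_device(0)  # create/activate a context on device 0
   print(cuda.get_current_device().name)

   # Alternatively, scope the selection using the device list:
   with cuda.gpus[0]:
       print(cuda.get_current_device().name)
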
The :class:`numba.cuda.cudadrv.driver.Device` class can be used to enquire about
the functionality of the selected device:
.. class:: numba.cuda.cudadrv.driver.Device

   The device associated with a particular context.

   .. attribute:: compute_capability

      A tuple, *(major, minor)* indicating the supported compute capability.

   .. attribute:: id

      The integer ID of the device.

   .. attribute:: name

      The name of the device (e.g. "GeForce GTX 970").

   .. attribute:: uuid

      The UUID of the device (e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643").

   .. method:: reset

      Delete the context for the device. This will destroy all memory
      allocations, events, and streams created within the context.

   .. attribute:: supports_float16

      Return ``True`` if the device supports float16 operations, ``False``
      otherwise.
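
For example, a short sketch querying the attributes above:

.. code-block:: python

   from numba import cuda

   dev = cuda.get_current_device()
   print(dev.id, dev.name)
   print(dev.compute_capability)  # e.g. (7, 5)
   print(dev.supports_float16)
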
Compilation
-----------
Numba provides an entry point for compiling a Python function to PTX without
invoking any of the driver API. This can be useful for:
- Generating PTX that is to be inlined into other PTX code (e.g. from outside
  the Numba / Python ecosystem).
- Generating code when there is no device present.
- Generating code prior to a fork without initializing CUDA.
.. note:: It is the user's responsibility to manage any ABI issues arising from
   the use of compilation to PTX.
.. autofunction:: numba.cuda.compile_ptx
The environment variable ``NUMBA_CUDA_DEFAULT_PTX_CC`` can be set to control
the default compute capability targeted by ``compile_ptx`` - see
:ref:`numba-envvars-gpu-support`. If PTX for the compute capability of the
current device is required, the ``compile_ptx_for_current_device`` function can
be used:
.. autofunction:: numba.cuda.compile_ptx_for_current_device
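
As a hedged sketch (the ``add`` function and the chosen types and compute
capability here are purely illustrative), compiling a device function to PTX
might look like:

.. code-block:: python

   from numba import cuda, float32

   def add(x, y):
       return x + y

   # Compile as a device function targeting compute capability 7.5
   ptx, resty = cuda.compile_ptx(add, (float32, float32), device=True,
                                 cc=(7, 5))
   print(resty)  # the inferred return type
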
Measurement
-----------
.. _cuda-profiling:
Profiling
~~~~~~~~~
The NVIDIA Visual Profiler can be used directly on executing CUDA Python code -
it is not a requirement to insert calls to these functions into user code.
However, these functions can be used to allow profiling to be performed
selectively on specific portions of the code. For further information on
profiling, see the `NVIDIA Profiler User's Guide
<https://docs.nvidia.com/cuda/profiler-users-guide/>`_.
.. autofunction:: numba.cuda.profile_start
.. autofunction:: numba.cuda.profile_stop
.. autofunction:: numba.cuda.profiling
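
A minimal sketch of selective profiling, using either the explicit calls or
the ``profiling`` context manager:

.. code-block:: python

   from numba import cuda

   cuda.profile_start()
   # ... launch the kernels of interest here ...
   cuda.profile_stop()

   # Equivalently, as a context manager:
   with cuda.profiling():
       # ... launch the kernels of interest here ...
       pass
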
.. _events:
Events
~~~~~~
Events can be used to monitor the progress of execution and to record the
timestamps of specific points being reached. Event creation returns immediately,
and the created event can be queried to determine if it has been reached. For
further information, see the `CUDA C Programming Guide Events section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#events>`_.
The following functions are used for creating and measuring the time between
events:
.. autofunction:: numba.cuda.event
.. autofunction:: numba.cuda.event_elapsed_time
Events are instances of the :class:`numba.cuda.cudadrv.driver.Event` class:
.. autoclass:: numba.cuda.cudadrv.driver.Event
   :members: query, record, synchronize, wait
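
As a short sketch, timing a region of device work with a pair of events:

.. code-block:: python

   from numba import cuda

   start = cuda.event()
   end = cuda.event()

   start.record()
   # ... launch the kernel to be timed here ...
   end.record()
   end.synchronize()
   print(cuda.event_elapsed_time(start, end))  # elapsed time in milliseconds
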
.. _streams:
Stream Management
-----------------
Streams allow concurrency of execution on a single device within a given
context. Queued work items in the same stream execute sequentially, but work
items in different streams may execute concurrently. Most operations involving a
CUDA device can be performed asynchronously using streams, including data
transfers and kernel execution. For further details on streams, see the `CUDA C
Programming Guide Streams section
<http://docs.nvidia.com/cuda/cuda-c-programming-guide/#streams>`_.
Numba defaults to using the legacy default stream as the default stream. The
per-thread default stream can be made the default stream by setting the
environment variable ``NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` (see the
:ref:`CUDA Environment Variables section <numba-envvars-gpu-support>`).
Regardless of this setting, the objects representing the legacy and per-thread
default streams can be constructed using the functions below.
Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`:
.. autoclass:: numba.cuda.cudadrv.driver.Stream
   :members: synchronize, auto_synchronize, add_callback, async_done
To create a new stream:
.. autofunction:: numba.cuda.stream
To get the default stream:
.. autofunction:: numba.cuda.default_stream
To get the default stream with an explicit choice of whether it is the legacy
or per-thread default stream:
.. autofunction:: numba.cuda.legacy_default_stream
.. autofunction:: numba.cuda.per_thread_default_stream
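
As a brief sketch, an asynchronous transfer can be queued on a new stream and
waited on with ``auto_synchronize``:

.. code-block:: python

   import numpy as np
   from numba import cuda

   arr = np.arange(1000)
   stream = cuda.stream()

   # On exit, the context manager waits for all work queued on the stream
   with stream.auto_synchronize():
       d_arr = cuda.to_device(arr, stream=stream)
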
To construct a Numba ``Stream`` object using a stream allocated elsewhere, the
``external_stream`` function is provided. Note that the lifetime of external
streams must be managed by the user - Numba will not deallocate an external
stream, and the stream must remain valid whilst the Numba ``Stream`` object is
in use.
.. autofunction:: numba.cuda.external_stream
Runtime
-------
Numba generally uses the Driver API, but it provides a simple wrapper to the
Runtime API so that the version of the runtime in use can be queried. This is
accessed through ``cuda.runtime``, which is an instance of the
:class:`numba.cuda.cudadrv.runtime.Runtime` class:
.. autoclass:: numba.cuda.cudadrv.runtime.Runtime
   :members: get_version, is_supported_version, supported_versions
Whether the current runtime is officially supported and tested with the current
version of Numba can also be queried:
.. autofunction:: numba.cuda.is_supported_version
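
For example, a short sketch of querying the runtime version:

.. code-block:: python

   from numba import cuda

   print(cuda.runtime.get_version())   # e.g. (11, 0)
   print(cuda.is_supported_version())  # True if this runtime is supported
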
CUDA Python Reference
=====================
.. toctree::

   host.rst
   kernel.rst
   types.rst
   memory.rst
   libdevice.rst
Libdevice functions
===================
All wrapped libdevice functions are listed in this section. All functions in
libdevice are wrapped, with the exception of ``__nv_nan`` and ``__nv_nanf``.
These functions return a representation of a quiet NaN, but the argument they
take (a pointer to an object specifying the representation) is undocumented, and
follows an unusual form compared to the rest of libdevice - it is not an output
like every other pointer argument. If a NaN is required, one can be obtained in
CUDA Python by other means, e.g. ``math.nan``.
Wrapped functions
-----------------
.. automodule:: numba.cuda.libdevice
   :members:
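
As an illustrative sketch (assuming ``float32`` input and output arrays), a
wrapped libdevice function is called from a kernel like any other function:

.. code-block:: python

   from numba import cuda
   from numba.cuda import libdevice

   @cuda.jit
   def fast_sines(x, out):
       i = cuda.grid(1)
       if i < x.size:
           # fast_sinf wraps __nv_fast_sinf, a float32 approximation of sine
           out[i] = libdevice.fast_sinf(x[i])
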
Memory Management
=================
.. autofunction:: numba.cuda.to_device
.. autofunction:: numba.cuda.device_array
.. autofunction:: numba.cuda.device_array_like
.. autofunction:: numba.cuda.pinned_array
.. autofunction:: numba.cuda.pinned_array_like
.. autofunction:: numba.cuda.mapped_array
.. autofunction:: numba.cuda.mapped_array_like
.. autofunction:: numba.cuda.managed_array
.. autofunction:: numba.cuda.pinned
.. autofunction:: numba.cuda.mapped
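
A minimal sketch of a round trip using the functions above:

.. code-block:: python

   import numpy as np
   from numba import cuda

   arr = np.arange(10, dtype=np.float32)
   d_arr = cuda.to_device(arr)            # copy the host array to the device
   d_out = cuda.device_array_like(d_arr)  # uninitialized device array
   # ... launch kernels reading d_arr and writing d_out ...
   result = d_out.copy_to_host()          # copy the result back to the host
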
Device Objects
--------------
.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceNDArray
   :members: copy_to_device, copy_to_host, is_c_contiguous, is_f_contiguous,
             ravel, reshape, split

.. autoclass:: numba.cuda.cudadrv.devicearray.DeviceRecord
   :members: copy_to_device, copy_to_host

.. autoclass:: numba.cuda.cudadrv.devicearray.MappedNDArray
   :members: copy_to_device, copy_to_host, split
CUDA-Specific Types
====================
.. note::

   This page is about types specific to CUDA targets. Many other types are also
   available in the CUDA target - see :ref:`cuda-built-in-types`.
Vector Types
~~~~~~~~~~~~
`CUDA Vector Types <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#built-in-vector-types>`_
are usable in kernels. There are two important distinctions from vector types in CUDA C/C++:
First, the recommended names for vector types in Numba CUDA are formatted as ``<base_type>x<N>``,
where ``base_type`` is the base type of the vector, and ``N`` is the number of elements in the vector.
Examples include ``int64x3``, ``uint16x4``, ``float32x4``, etc. For new Numba CUDA kernels,
this is the recommended way to instantiate vector types.
For convenience, users adapting existing kernels from CUDA C/C++ to Python may use
aliases consistent with the C/C++ namings. For example, ``float3`` aliases ``float32x3``,
``long3`` aliases ``int32x3`` or ``int64x3`` (depending on the platform), etc.
Second, unlike CUDA C/C++ where factory functions are used, vector types are constructed directly
with their constructor. For example, to construct a ``float32x3``:
.. code-block:: python3

   from numba.cuda import float32x3

   # In kernel
   f3 = float32x3(0.0, -1.0, 1.0)
Additionally, vector types can be constructed from a combination of vector and
primitive types, as long as the total number of components matches the result
vector type. For example, all of the following constructions are valid:
.. code-block:: python3

   from numba import uint32
   from numba.cuda import uint32x2, uint32x3, uint32x4

   # In kernel
   zero = uint32(0)
   u2 = uint32x2(1, 2)

   # Construct a 3-component vector with a primitive type and a 2-component
   # vector
   u3 = uint32x3(zero, u2)

   # Construct a 4-component vector with two 2-component vectors
   u4 = uint32x4(u2, u2)
The first, second, third, and fourth components of a vector type can be
accessed through the fields ``x``, ``y``, ``z``, and ``w`` respectively. The
components are immutable after construction in the present version of Numba; it
is expected that support for mutating vector components will be added in a
future release.
.. code-block:: python3

   from numba.cuda import float32x2

   v1 = float32x2(1.0, 1.0)
   v2 = float32x2(1.0, -1.0)
   dotprod = v1.x * v2.x + v1.y * v2.y
CUDA Bindings
=============
Numba supports two bindings to the CUDA Driver APIs: its own internal bindings
based on ctypes, and the official `NVIDIA CUDA Python bindings
<https://nvidia.github.io/cuda-python/>`_. Functionality is equivalent between
the two bindings.
The internal bindings are used by default. If the NVIDIA bindings are installed,
then they can be used by setting the environment variable
``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once
Numba has been imported, the selected binding cannot be changed.
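
One way to ensure the variable is set before Numba is imported, sketched here,
is to set it at the top of the entry-point script (setting it in the shell
environment works equally well):

.. code-block:: python

   import os
   os.environ['NUMBA_CUDA_USE_NVIDIA_BINDING'] = '1'

   # The import must come after the variable is set
   from numba import cuda
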
Per-Thread Default Streams
--------------------------
Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to
the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings,
set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to
``1`` instead of Numba's environment variable
:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`.
.. seealso::

   The `Default Stream section
   <https://nvidia.github.io/cuda-python/release/11.6.0-notes.html#default-stream>`_
   in the NVIDIA Bindings documentation.
Roadmap
-------
In Numba 0.56, the NVIDIA Bindings will be used by default, if they are
installed.
In future versions of Numba:
- The internal bindings will be deprecated.
- The internal bindings will be removed.
At present, no specific release is planned for the deprecation or removal of
the internal bindings.
On-disk Kernel Caching
======================
When the ``cache`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
decorator is ``True``, a file-based cache is enabled. This shortens compilation
times when the function was already compiled in a previous invocation.
The cache is maintained in the ``__pycache__`` subdirectory of the directory
containing the source file; if the current user is not allowed to write to it,
the cache implementation falls back to a platform-specific user-wide cache
directory (such as ``$HOME/.cache/numba`` on Unix platforms).
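
For example, a minimal sketch of a cached kernel:

.. code-block:: python

   from numba import cuda

   @cuda.jit(cache=True)  # compiled versions are saved to, and loaded from, disk
   def increment(x):
       i = cuda.grid(1)
       if i < x.size:
           x[i] += 1
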
Compute capability considerations
---------------------------------
Separate cache files are maintained for each compute capability. When a cached
kernel is loaded, the compute capability of the device the kernel is first
launched on in the current run is used to determine which version to load.
Therefore, on systems that have multiple GPUs with differing compute
capabilities, the cached versions of kernels are only used for one compute
capability, and recompilation will occur for other compute capabilities.
For example: if a system has two GPUs, one of compute capability 7.5 and one of
8.0, then:
* If a cached kernel is first launched on the CC 7.5 device, then the cached
  version for CC 7.5 is used. If it is subsequently launched on the CC 8.0
  device, a recompilation will occur.

* If in a subsequent run the cached kernel is first launched on the CC 8.0
  device, then the cached version for CC 8.0 is used. A subsequent launch on
  the CC 7.5 device will require a recompilation.
This limitation is not expected to present issues in most practical scenarios,
as multi-GPU production systems tend to have identical GPUs within each node.
==================
Cooperative Groups
==================
Supported features
------------------
Numba's Cooperative Groups support presently provides grid groups and grid
synchronization, along with cooperative kernel launches.
Cooperative groups are supported on Linux, and on Windows for devices in `TCC
mode
<https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#tesla-compute-cluster-mode-for-windows>`_.
Using Grid Groups
-----------------
To get the current grid group, use the :meth:`cg.this_grid()
<numba.cuda.cg.this_grid>` function:
.. code-block:: python

   g = cuda.cg.this_grid()
Synchronizing the grid is done with the :meth:`sync()
<numba.cuda.cg.GridGroup.sync>` method of the grid group:
.. code-block:: python

   g.sync()
Cooperative Launches
--------------------
Unlike the CUDA C/C++ API, a cooperative launch is invoked using the same syntax
as a normal kernel launch - Numba automatically determines whether a cooperative
launch is required based on whether a grid group is synchronized in the kernel.
The grid size limit for a cooperative launch is more restrictive than for a
normal launch - the grid must be no larger than the maximum number of active
blocks on the device on which it is launched. To get maximum grid size for a
cooperative launch of a kernel with a given block size and dynamic shared
memory requirement, use the ``max_cooperative_grid_blocks()`` method of kernel
overloads:
.. automethod:: numba.cuda.dispatcher._Kernel.max_cooperative_grid_blocks
This can be used to ensure that the kernel is launched with no more than the
maximum number of blocks. Exceeding the maximum number of blocks for the
cooperative launch will result in a ``CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE``
error.
Applications and Example
------------------------
Grid group synchronization can be used to implement a global barrier across all
threads in the grid - applications of this include a global reduction to a
single value, or looping over rows of a large matrix sequentially using the
entire grid to operate on column elements in parallel.
In the following example, rows are written sequentially by the grid. Each thread
in the grid reads a value from the previous row written by its *opposite*
thread. A grid sync is needed to ensure that threads in the grid don't run ahead
of threads in other blocks, or fail to see updates from their opposite thread.
First we'll define our kernel:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
   :language: python
   :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
   :start-after: magictoken.ex_grid_sync_kernel.begin
   :end-before: magictoken.ex_grid_sync_kernel.end
   :dedent: 8
   :linenos:
Then create some empty input data and determine the grid and block sizes:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
   :language: python
   :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
   :start-after: magictoken.ex_grid_sync_data.begin
   :end-before: magictoken.ex_grid_sync_data.end
   :dedent: 8
   :linenos:
Finally we launch the kernel and print the result:
.. literalinclude:: ../../../numba/cuda/tests/doc_examples/test_cg.py
   :language: python
   :caption: from ``test_grid_sync`` of ``numba/cuda/tests/doc_examples/test_cg.py``
   :start-after: magictoken.ex_grid_sync_launch.begin
   :end-before: magictoken.ex_grid_sync_launch.end
   :dedent: 8
   :linenos:
The maximum grid size for ``sequential_rows`` can be enquired using:
.. code-block:: python

   overload = sequential_rows.overloads[(int32[:, ::1],)]
   max_blocks = overload.max_cooperative_grid_blocks(blockdim)
   print(max_blocks)
   # 1152 (e.g. on Quadro RTX 8000 with Numba 0.52.1 and CUDA 11.0)
Writing Device Functions
========================
CUDA device functions can only be invoked from within the device (by a kernel
or another device function). To define a device function::

    from numba import cuda

    @cuda.jit(device=True)
    def a_device_function(a, b):
        return a + b
Unlike a kernel function, a device function can return a value, just like a
normal Python function.
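
For example, a kernel might call the device function above as follows (a
minimal sketch)::

    from numba import cuda

    @cuda.jit
    def a_kernel(arr):
        i = cuda.grid(1)
        if i < arr.size:
            arr[i] = a_device_function(arr[i], 1.0)
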
Device management
=================
For multi-GPU machines, users may want to select which GPU to use. By default
the CUDA driver selects the fastest GPU as device 0, which is the device Numba
uses by default.

The features introduced on this page are generally not of interest unless
working with systems that have more than one CUDA-capable GPU.
Device Selection
----------------
If device selection is required at all, it must be done before any CUDA
feature is used::

    from numba import cuda
    cuda.select_device(0)

The device can be closed by::

    cuda.close()

Users can then create a new context with another device::

    cuda.select_device(1)  # assuming we have 2 GPUs
.. function:: numba.cuda.select_device(device_id)
   :noindex:

   Create a new CUDA context for the selected *device_id*. *device_id*
   should be the number of the device (starting from 0; the device order
   is determined by the CUDA libraries). The context is associated with
   the current thread. Numba currently allows only one context per thread.

   If successful, this function returns a device instance.

.. XXX document device instances?

.. function:: numba.cuda.close
   :noindex:

   Explicitly close all contexts in the current thread.

   .. note::
      Compiled functions are associated with the CUDA context.
      This makes it not very useful to close and create new devices, though it
      is certainly useful for choosing which device to use when the machine
      has multiple GPUs.
The Device List
===============
The Device List is a list of all the GPUs in the system, and can be indexed to
obtain a context manager that ensures execution on the selected GPU.
.. attribute:: numba.cuda.gpus
   :noindex:

.. attribute:: numba.cuda.cudadrv.devices.gpus

:py:data:`numba.cuda.gpus` is an instance of the ``_DeviceList`` class, from
which the current GPU context can also be retrieved:

.. autoclass:: numba.cuda.cudadrv.devices._DeviceList
   :members: current
   :noindex:
Device UUIDs
============
The UUID of a device (equal to that returned by ``nvidia-smi -L``) is available
in the :attr:`uuid <numba.cuda.cudadrv.driver.Device.uuid>` attribute of a CUDA
device object.
For example, to obtain the UUID of the current device:
.. code-block:: python

   dev = cuda.current_context().device
   # prints e.g. "GPU-e6489c45-5b68-3b03-bab7-0e7c8e809643"
   print(dev.uuid)
.. _cudafaq:
=================================================
CUDA Frequently Asked Questions
=================================================
nvprof reports "No kernels were profiled"
-----------------------------------------
When using the ``nvprof`` tool to profile Numba jitted code for the CUDA
target, the output contains ``No kernels were profiled`` even though kernels
are clearly running. What is going on?
This is quite likely due to the profiling data not being flushed on program
exit, see the `NVIDIA CUDA documentation
<http://docs.nvidia.com/cuda/profiler-users-guide/#flush-profile-data>`_ for
details. To fix this, simply add a call to ``numba.cuda.profile_stop()`` prior
to the exit point in your program (or wherever you want to stop profiling).
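
For example (a minimal sketch):

.. code-block:: python

   from numba import cuda

   # ... application code that launches kernels ...

   cuda.profile_stop()  # flush profiling data before the program exits
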
For more on CUDA profiling support in Numba, see :ref:`cuda-profiling`.
.. _cuda-fast-math:
CUDA Fast Math
==============
As noted in :ref:`fast-math`, for certain classes of applications that utilize
floating point, strict IEEE-754 conformance is not required. For this subset of
applications, performance speedups may be possible.
The CUDA target implements :ref:`fast-math` behavior with two differences.
* First, the ``fastmath`` argument to the :func:`@jit decorator
  <numba.cuda.jit>` is limited to the values ``True`` and ``False``.
  When ``True``, the following optimizations are enabled:

  - Flushing of denormals to zero.
  - Use of a fast approximation to the square root function.
  - Use of a fast approximation to the division operation.
  - Contraction of multiply and add operations into single fused
    multiply-add operations.

  See the `documentation for nvvmCompileProgram
  <https://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g76ac1e23f5d0e2240e78be0e63450346>`_
  for more details of these optimizations.
* Secondly, calls to a subset of math module functions on ``float32`` operands
  will be implemented using fast approximate implementations from the libdevice
  library (a sketch of a kernel using this follows the list below).

  - :func:`math.cos`: Implemented using `__nv_fast_cosf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_cosf.html>`_.
  - :func:`math.sin`: Implemented using `__nv_fast_sinf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_sinf.html>`_.
  - :func:`math.tan`: Implemented using `__nv_fast_tanf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_tanf.html>`_.
  - :func:`math.exp`: Implemented using `__nv_fast_expf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_expf.html>`_.
  - :func:`math.log2`: Implemented using `__nv_fast_log2f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log2f.html>`_.
  - :func:`math.log10`: Implemented using `__nv_fast_log10f <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_log10f.html>`_.
  - :func:`math.log`: Implemented using `__nv_fast_logf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_logf.html>`_.
  - :func:`math.pow`: Implemented using `__nv_fast_powf <https://docs.nvidia.com/cuda/libdevice-users-guide/__nv_fast_powf.html>`_.
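
As a minimal sketch (assuming ``x`` and ``out`` are ``float32`` arrays), a
kernel opting in to these behaviors looks like:

.. code-block:: python

   import math

   from numba import cuda

   @cuda.jit(fastmath=True)
   def fast_kernel(x, out):
       i = cuda.grid(1)
       if i < x.size:
           # With fastmath, math.sin on float32 lowers to __nv_fast_sinf
           out[i] = math.sin(x[i])
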