Unverified Commit de8c6105 authored by Alex Ford's avatar Alex Ford Committed by GitHub
Browse files

Optimize array-from-ctypes in basic.py (#3927)

Approximately 80% of runtime when loading "low column count, high row
count" DataFrames into Datasets is consumed in `np.fromiter`, called
as part of the `Dataset.get_field` method.

This is a particularly pernicious hotspot: unlike other ctypes-based
methods, it is a hot loop over a Python iterator and causes
significant GIL contention in multi-threaded applications.

Replace `np.fromiter` with a direct call to `np.ctypeslib.as_array`,
which allows a single-shot `copy` of the underlying array.

This reduces the load time of a ~35 million row categorical dataframe
with 1 column from ~5 seconds to ~1 second, and allows multi-threaded
execution.
parent 75b9b0d3
...@@ -152,7 +152,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): ...@@ -152,7 +152,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
def cfloat32_array_to_numpy(cptr, length): def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.""" """Convert a ctypes float pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
return np.fromiter(cptr, dtype=np.float32, count=length) return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else: else:
raise RuntimeError('Expected float pointer') raise RuntimeError('Expected float pointer')
...@@ -160,7 +160,7 @@ def cfloat32_array_to_numpy(cptr, length): ...@@ -160,7 +160,7 @@ def cfloat32_array_to_numpy(cptr, length):
def cfloat64_array_to_numpy(cptr, length): def cfloat64_array_to_numpy(cptr, length):
"""Convert a ctypes double pointer array to a numpy array.""" """Convert a ctypes double pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_double)): if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
return np.fromiter(cptr, dtype=np.float64, count=length) return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else: else:
raise RuntimeError('Expected double pointer') raise RuntimeError('Expected double pointer')
...@@ -168,7 +168,7 @@ def cfloat64_array_to_numpy(cptr, length): ...@@ -168,7 +168,7 @@ def cfloat64_array_to_numpy(cptr, length):
def cint32_array_to_numpy(cptr, length): def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes int pointer array to a numpy array.""" """Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)): if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
return np.fromiter(cptr, dtype=np.int32, count=length) return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else: else:
raise RuntimeError('Expected int32 pointer') raise RuntimeError('Expected int32 pointer')
...@@ -176,7 +176,7 @@ def cint32_array_to_numpy(cptr, length): ...@@ -176,7 +176,7 @@ def cint32_array_to_numpy(cptr, length):
def cint64_array_to_numpy(cptr, length): def cint64_array_to_numpy(cptr, length):
"""Convert a ctypes int pointer array to a numpy array.""" """Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)): if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)):
return np.fromiter(cptr, dtype=np.int64, count=length) return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else: else:
raise RuntimeError('Expected int64 pointer') raise RuntimeError('Expected int64 pointer')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment