CUDA support#
Contexts#
-
class CudaDeviceManager#
Public Functions
-
Result<std::shared_ptr<CudaDevice>> GetDevice(int device_number)#
Get a CudaDevice instance for a particular device.
- Parameters:
device_number – [in] the CUDA device number
-
Result<std::shared_ptr<CudaContext>> GetContext(int device_number)#
Get the CUDA driver context for a particular device.
- Parameters:
device_number – [in] the CUDA device number
- Returns:
cached context
-
Result<std::shared_ptr<CudaContext>> GetSharedContext(int device_number, void *handle)#
Get the shared CUDA driver context for a particular device.
- Parameters:
device_number – [in] the CUDA device number
handle – [in] CUDA context handle created by another library
- Returns:
shared context
-
Result<std::shared_ptr<CudaHostBuffer>> AllocateHost(int device_number, int64_t nbytes)#
Allocate host memory with fast access to the given GPU device.
- Parameters:
device_number – [in] the CUDA device number
nbytes – [in] number of bytes
- Returns:
Host buffer or Status
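For orientation, a minimal usage sketch of the manager as the entry point. The helper name UseDeviceZero and the 1024-byte size are illustrative; arrow/gpu/cuda_api.h is the umbrella header and ARROW_ASSIGN_OR_RAISE is Arrow's usual error-propagation macro:
#include <arrow/gpu/cuda_api.h>

arrow::Status UseDeviceZero() {
  // Instance() returns the process-wide singleton manager.
  ARROW_ASSIGN_OR_RAISE(auto* manager, arrow::cuda::CudaDeviceManager::Instance());
  // Resolve device 0 and its cached primary context.
  ARROW_ASSIGN_OR_RAISE(auto device, manager->GetDevice(0));
  ARROW_ASSIGN_OR_RAISE(auto context, manager->GetContext(0));
  // Pinned host memory with fast access to device 0.
  ARROW_ASSIGN_OR_RAISE(auto host_buffer, manager->AllocateHost(0, 1024));
  return arrow::Status::OK();
}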
-
class CudaContext : public std::enable_shared_from_this<CudaContext>#
Object-oriented interface to the low-level CUDA driver API.
Public Functions
-
Result<std::unique_ptr<CudaBuffer>> Allocate(int64_t nbytes)#
Allocate CUDA memory on GPU device for this context.
- Parameters:
nbytes – [in] number of bytes
- Returns:
the allocated buffer
-
Status Free(void *device_ptr, int64_t nbytes)#
Release CUDA memory on GPU device for this context.
- Parameters:
device_ptr – [in] the buffer address
nbytes – [in] number of bytes
-
Result<std::shared_ptr<CudaBuffer>> View(uint8_t *data, int64_t nbytes)#
Create a view of CUDA memory on GPU device of this context.
Note
The caller is responsible for allocating and freeing the memory as well as ensuring that the memory belongs to the CUDA context that this CudaContext instance holds.
- Parameters:
data – [in] the starting device address
nbytes – [in] number of bytes
- Returns:
the view buffer
-
Result<std::shared_ptr<CudaBuffer>> OpenIpcBuffer(const CudaIpcMemHandle &ipc_handle)#
Open existing CUDA IPC memory handle.
- Parameters:
ipc_handle – [in] opaque pointer to CUipcMemHandle (driver API)
- Returns:
a CudaBuffer referencing the IPC segment
-
Status CloseIpcBuffer(CudaBuffer *buffer)#
Close memory mapped with an IPC buffer.
- Parameters:
buffer – [in] a CudaBuffer referencing the IPC segment
-
void *handle() const#
Expose CUDA context handle to other libraries.
-
std::shared_ptr<CudaMemoryManager> memory_manager() const#
Return the default memory manager tied to this context’s device.
-
std::shared_ptr<CudaDevice> device() const#
Return the device instance associated with this context.
-
int device_number() const#
Return the logical device number.
-
Result<uintptr_t> GetDeviceAddress(uint8_t *addr)#
Return the device address that is reachable from kernels running in the context.
The device address is defined as a memory address accessible by the device. While it is often a device memory address, it can also be a host memory address: for instance, when the memory is allocated as host memory (using cudaMallocHost or cudaHostAlloc), allocated as managed memory (using cudaMallocManaged), or when the host memory is page-locked (using cudaHostRegister).
- Parameters:
addr – [in] device or host memory address
- Returns:
the device address
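As an illustration, a hedged sketch of resolving the device-visible address of pinned host memory; the helper name ShowDeviceAddress is illustrative, and the context/host buffer are assumed to come from the manager example above:
arrow::Status ShowDeviceAddress(
    const std::shared_ptr<arrow::cuda::CudaContext>& context,
    const std::shared_ptr<arrow::cuda::CudaHostBuffer>& host_buffer) {
  // Pinned host memory is reachable from kernels; resolve its device-side address.
  ARROW_ASSIGN_OR_RAISE(uintptr_t device_addr,
                        context->GetDeviceAddress(host_buffer->mutable_data()));
  // `device_addr` can now be handed to a kernel launched in this context.
  return arrow::Status::OK();
}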
Devices#
-
class CudaDevice : public arrow::Device#
Device implementation for CUDA.
Each CudaDevice instance is tied to a particular CUDA device (identified by its logical device number).
Public Functions
-
virtual const char *type_name() const override#
A shorthand for this device’s type.
The returned value is different for each device class, but is the same for all instances of a given class. It can be used as a replacement for RTTI.
-
virtual std::string ToString() const override#
A human-readable description of the device.
The returned value should be detailed enough to distinguish between different instances, where necessary.
-
virtual bool Equals(const Device&) const override#
Whether this instance points to the same device as another one.
-
virtual std::shared_ptr<MemoryManager> default_memory_manager() override#
Return a MemoryManager instance tied to this device.
The returned instance uses default parameters for this device type’s MemoryManager implementation. Some devices also allow constructing MemoryManager instances with non-default parameters.
-
inline virtual DeviceAllocationType device_type() const override#
Return the DeviceAllocationType of this device.
-
inline virtual int64_t device_id() const override#
A device ID to identify this device if there are multiple of this type.
If there is no “device_id” equivalent (such as for the main CPU device on non-NUMA systems), returns -1.
-
int device_number() const#
Return the device logical number.
-
std::string device_name() const#
Return the GPU model name.
-
int64_t total_memory() const#
Return total memory on this device.
-
int handle() const#
Return a raw CUDA device handle.
The returned value can be used to expose this device to other libraries. It should be interpreted as a CUdevice.
-
Result<std::shared_ptr<CudaContext>> GetContext()#
Get a CUDA driver context for this device.
The returned context is associated with the primary CUDA context for the device. This is the recommended way of getting a context for a device, as it allows interoperating transparently with any library using the primary CUDA context API.
-
Result<std::shared_ptr<CudaContext>> GetSharedContext(void *handle)#
Get a CUDA driver context for this device, using an existing handle.
The handle is not owned: it will not be released when the CudaContext is destroyed. This function should only be used if you need interoperation with a library that uses a non-primary context.
- Parameters:
handle – [in] CUDA context handle created by another library
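A hedged interop sketch for this path: wrapping a context created through the raw driver API. Here cuCtxCreate merely stands in for another library creating the context, and the helper name WrapForeignContext is illustrative; the handle remains owned by its creator:
#include <cuda.h>
#include <arrow/gpu/cuda_api.h>

arrow::Status WrapForeignContext(const std::shared_ptr<arrow::cuda::CudaDevice>& device) {
  CUcontext foreign_ctx;
  // In real interop code this handle would come from the other library.
  if (cuCtxCreate(&foreign_ctx, 0, static_cast<CUdevice>(device->handle())) != CUDA_SUCCESS) {
    return arrow::Status::IOError("cuCtxCreate failed");
  }
  // The CudaContext will not release the handle when destroyed.
  ARROW_ASSIGN_OR_RAISE(auto context, device->GetSharedContext(foreign_ctx));
  return arrow::Status::OK();
}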
-
Result<std::shared_ptr<CudaHostBuffer>> AllocateHostBuffer(int64_t size)#
Allocate a host-residing, GPU-accessible buffer.
The buffer is allocated using this device’s primary context.
- Parameters:
size – [in] The buffer size in bytes
-
virtual Result<std::shared_ptr<Device::Stream>> MakeStream(unsigned int flags) override#
Create a CUstream wrapper in the current context.
-
virtual Result<std::shared_ptr<Device::Stream>> WrapStream(void *device_stream, Stream::release_fn_t release_fn) override#
Wrap a pointer to an existing stream.
- Parameters:
device_stream – passed in stream (should be a CUstream*)
release_fn – destructor to free the stream; nullptr may be passed to indicate there is no destruction/freeing necessary
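A sketch covering both stream paths under stated assumptions: CU_STREAM_DEFAULT and cuStreamCreate are CUDA driver API names, the externally created stream stands in for one owned by another library, and the helper name StreamPaths is illustrative:
#include <cuda.h>

arrow::Status StreamPaths(const std::shared_ptr<arrow::cuda::CudaDevice>& device) {
  // A stream created and owned through Arrow's wrapper:
  ARROW_ASSIGN_OR_RAISE(auto stream, device->MakeStream(CU_STREAM_DEFAULT));
  // A stream created elsewhere and kept alive by its owner:
  CUstream external_stream;
  cuStreamCreate(&external_stream, CU_STREAM_DEFAULT);
  ARROW_ASSIGN_OR_RAISE(auto wrapped,
                        device->WrapStream(&external_stream, /*release_fn=*/nullptr));
  return arrow::Status::OK();
}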
Public Static Functions
-
static Result<std::shared_ptr<CudaDevice>> Make(int device_number)#
Return a CudaDevice instance for a particular device.
- Parameters:
device_number – [in] the CUDA device number
-
class Stream : public arrow::Device::Stream#
EXPERIMENTAL: Wrapper for CUstreams.
Does not own the CUstream object, which must be separately constructed and freed using cuStreamCreate and cuStreamDestroy (or equivalent). Default construction will use the CUDA default stream, and does not allow construction from literal 0 or nullptr.
-
virtual const char *type_name() const override#
-
class CudaMemoryManager : public arrow::MemoryManager#
MemoryManager implementation for CUDA.
Public Functions
-
virtual Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(std::shared_ptr<Buffer> buf) override#
Create a RandomAccessFile to read a particular buffer.
The given buffer must be tied to this MemoryManager.
See also the Buffer::GetReader shorthand.
-
virtual Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(std::shared_ptr<Buffer> buf) override#
Create an OutputStream to write to a particular buffer.
The given buffer must be mutable and tied to this MemoryManager. The returned stream object writes into the buffer’s underlying memory (but it won’t resize it).
See also the Buffer::GetWriter shorthand.
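For instance, a short sketch of the shorthands, assuming device_buffer is a std::shared_ptr<arrow::cuda::CudaBuffer> and the helper name ShorthandIo is illustrative:
arrow::Status ShorthandIo(const std::shared_ptr<arrow::cuda::CudaBuffer>& device_buffer) {
  // Both shorthands dispatch through the buffer's memory manager.
  ARROW_ASSIGN_OR_RAISE(auto reader, arrow::Buffer::GetReader(device_buffer));
  ARROW_ASSIGN_OR_RAISE(auto writer, arrow::Buffer::GetWriter(device_buffer));
  return arrow::Status::OK();
}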
-
virtual Result<std::unique_ptr<Buffer>> AllocateBuffer(int64_t size) override#
Allocate a (mutable) Buffer.
The buffer will be allocated in the device’s memory.
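A minimal sketch of device-resident allocation through the memory manager; the helper name DeviceAllocate and the 1024-byte size are illustrative:
arrow::Status DeviceAllocate(const std::shared_ptr<arrow::cuda::CudaDevice>& device) {
  auto mm = device->default_memory_manager();
  // The resulting buffer lives in this device's memory.
  ARROW_ASSIGN_OR_RAISE(auto device_buf, mm->AllocateBuffer(1024));
  return arrow::Status::OK();
}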
-
std::shared_ptr<CudaDevice> cuda_device() const#
The CudaDevice instance tied to this MemoryManager.
This is a useful shorthand returning a concrete-typed pointer, avoiding having to cast the device() result.
-
virtual Result<std::shared_ptr<Device::SyncEvent>> MakeDeviceSyncEvent() override#
Creates a wrapped CUevent.
cuEventCreate is called on creation, and cuEventDestroy is called internally when the event is destructed.
-
virtual Result<std::shared_ptr<Device::SyncEvent>> WrapDeviceSyncEvent(void *sync_event, Device::SyncEvent::release_fn_t release_sync_event) override#
Wraps an existing event into a sync event.
- Parameters:
sync_event – the event to wrap, must be a CUevent*
release_sync_event – a function to call during destruction; nullptr or a no-op function can be passed to indicate ownership is maintained externally
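A hedged sketch of both event paths. cuEventCreate and CU_EVENT_DEFAULT are CUDA driver API names, the externally created event stands in for one owned by another library, and the helper name EventPaths is illustrative:
#include <cuda.h>

arrow::Status EventPaths(const std::shared_ptr<arrow::cuda::CudaMemoryManager>& cuda_mm) {
  // Event created and destroyed by Arrow:
  ARROW_ASSIGN_OR_RAISE(auto event, cuda_mm->MakeDeviceSyncEvent());
  // Event owned elsewhere; pass nullptr so Arrow never frees it:
  CUevent external_event;
  cuEventCreate(&external_event, CU_EVENT_DEFAULT);
  ARROW_ASSIGN_OR_RAISE(auto wrapped_event,
                        cuda_mm->WrapDeviceSyncEvent(&external_event,
                                                     /*release_sync_event=*/nullptr));
  return arrow::Status::OK();
}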
Buffers#
-
class CudaBuffer : public arrow::Buffer#
An Arrow buffer located on a GPU device.
Be careful using this in any Arrow code which may not be GPU-aware.
Public Functions
-
Status CopyToHost(const int64_t position, const int64_t nbytes, void *out) const#
Copy memory from GPU device to CPU host.
- Parameters:
position – [in] start position inside buffer to copy bytes from
nbytes – [in] number of bytes to copy
out – [out] start address of the host memory area to copy to
-
Status CopyFromHost(const int64_t position, const void *data, int64_t nbytes)#
Copy memory to device at position.
- Parameters:
position – [in] start position to copy bytes to
data – [in] the host data to copy
nbytes – [in] number of bytes to copy
-
Status CopyFromDevice(const int64_t position, const void *data, int64_t nbytes)#
Copy memory from device to device at position.
Note
It is assumed that both source and destination device memories have been allocated within the same context.
- Parameters:
position – [in] start position inside buffer to copy bytes to
data – [in] start address of the device memory area to copy from
nbytes – [in] number of bytes to copy
-
Status CopyFromAnotherDevice(const std::shared_ptr<CudaContext> &src_ctx, const int64_t position, const void *data, int64_t nbytes)#
Copy memory from another device to this device at position.
- Parameters:
src_ctx – [in] context of the source device memory
position – [in] start position inside buffer to copy bytes to
data – [in] start address of the other device's memory area to copy from
nbytes – [in] number of bytes to copy
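A round-trip sketch of the copy methods; the helper name CopyRoundTrip, the 64-byte size, and the 0xAB fill value are illustrative:
#include <vector>

arrow::Status CopyRoundTrip(const std::shared_ptr<arrow::cuda::CudaContext>& context) {
  ARROW_ASSIGN_OR_RAISE(auto device_buffer, context->Allocate(64));
  std::vector<uint8_t> host_data(64, 0xAB);
  // Host -> device at offset 0.
  ARROW_RETURN_NOT_OK(device_buffer->CopyFromHost(0, host_data.data(), 64));
  // Device -> host.
  std::vector<uint8_t> readback(64);
  ARROW_RETURN_NOT_OK(device_buffer->CopyToHost(0, 64, readback.data()));
  return arrow::Status::OK();
}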
-
virtual Result<std::shared_ptr<CudaIpcMemHandle>> ExportForIpc()#
Expose this device buffer as IPC memory which can be used in other processes.
Note
After calling this function, this device memory will not be freed when the CudaBuffer is destructed
- Returns:
Handle or Status
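The producer side of an IPC exchange, sketched under assumptions: the helper name ExportHandle is illustrative, and some out-of-band channel is assumed to carry the serialized handle to the other process:
arrow::Result<std::shared_ptr<arrow::Buffer>> ExportHandle(
    const std::shared_ptr<arrow::cuda::CudaBuffer>& device_buffer) {
  ARROW_ASSIGN_OR_RAISE(auto ipc_handle, device_buffer->ExportForIpc());
  // Serialize the handle into plain bytes for transport to another process.
  return ipc_handle->Serialize();
}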
Public Static Functions
-
static Result<std::shared_ptr<CudaBuffer>> FromBuffer(std::shared_ptr<Buffer> buffer)#
Convert a generic buffer back into a CudaBuffer.
Note
This function returns an error if the buffer isn’t backed by GPU memory
- Parameters:
buffer – [in] buffer to convert
- Returns:
CudaBuffer or Status
-
class CudaHostBuffer : public arrow::MutableBuffer#
Device-accessible CPU memory created using cudaHostAlloc.
Public Functions
-
Result<uintptr_t> GetDeviceAddress(const std::shared_ptr<CudaContext> &ctx)#
Return a device address the GPU can read this memory from.
Memory Input / Output#
-
class CudaBufferReader : public arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader>#
File interface for zero-copy read from CUDA buffers.
CAUTION: reading into a Buffer returns a Buffer pointing to device memory. It will generally not be compatible with Arrow code expecting a buffer pointing to CPU memory. Reading into a raw pointer, however, copies device memory into the host memory pointed to.
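A sketch contrasting the two read modes; the helper name TwoReadModes and the 32-byte sizes are illustrative, and device_buffer is assumed to hold at least 64 bytes:
#include <array>

arrow::Status TwoReadModes(const std::shared_ptr<arrow::cuda::CudaBuffer>& device_buffer) {
  arrow::cuda::CudaBufferReader reader(device_buffer);
  // Zero-copy: the returned buffer still points at device memory.
  ARROW_ASSIGN_OR_RAISE(auto device_slice, reader.Read(32));
  // Copying: reading into a raw pointer lands the bytes in host memory.
  std::array<uint8_t, 32> host_bytes;
  ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, reader.Read(32, host_bytes.data()));
  return arrow::Status::OK();
}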
-
class CudaBufferWriter : public arrow::io::WritableFile#
File interface for writing to CUDA buffers, with optional buffering.
Public Functions
-
virtual bool closed() const override#
Return whether the stream is closed.
-
virtual Status Write(const void *data, int64_t nbytes) override#
Write the given data to the stream.
This method always processes the bytes in full. Depending on the semantics of the stream, the data may be written out immediately, held in a buffer, or written asynchronously. In the case where the stream buffers the data, it will be copied. To avoid potentially large copies, use the Write variant that takes an owned Buffer.
-
Status SetBufferSize(const int64_t buffer_size)#
Set CPU buffer size to limit calls to cudaMemcpy.
By default, writes are unbuffered.
- Parameters:
buffer_size – [in] the size of CPU buffer to allocate
-
int64_t buffer_size() const#
Returns size of host (CPU) buffer, 0 for unbuffered.
-
int64_t num_bytes_buffered() const#
Returns number of bytes buffered on host.
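A buffered-write sketch; the helper name BufferedWrite, the 4 KiB buffer size, and the payload are illustrative:
arrow::Status BufferedWrite(const std::shared_ptr<arrow::cuda::CudaBuffer>& device_buffer) {
  arrow::cuda::CudaBufferWriter writer(device_buffer);
  // Stage small writes in a 4 KiB host buffer instead of one cudaMemcpy each.
  ARROW_RETURN_NOT_OK(writer.SetBufferSize(4096));
  const uint8_t payload[] = {1, 2, 3, 4};
  ARROW_RETURN_NOT_OK(writer.Write(payload, sizeof(payload)));
  // Push any buffered bytes to the device before closing.
  ARROW_RETURN_NOT_OK(writer.Flush());
  return writer.Close();
}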
IPC#
-
class CudaIpcMemHandle#
Public Functions
-
Result<std::shared_ptr<Buffer>> Serialize(MemoryPool *pool = default_memory_pool()) const#
Write CudaIpcMemHandle to a Buffer.
- Parameters:
pool – [in] a MemoryPool to allocate memory from
- Returns:
Buffer or Status
Public Static Functions
-
static Result<std::shared_ptr<CudaIpcMemHandle>> FromBuffer(const void *opaque_handle)#
Create CudaIpcMemHandle from an opaque buffer (e.g. from another process).
- Parameters:
opaque_handle – [in] a CUipcMemHandle as a const void*
- Returns:
Handle or Status
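And the consumer side, continuing the producer sketch from ExportForIpc above; the helper name ImportHandle is illustrative, and handle_bytes is assumed to have arrived intact with context targeting the same device:
arrow::Status ImportHandle(const std::shared_ptr<arrow::cuda::CudaContext>& context,
                           const std::shared_ptr<arrow::Buffer>& handle_bytes) {
  ARROW_ASSIGN_OR_RAISE(auto handle,
                        arrow::cuda::CudaIpcMemHandle::FromBuffer(handle_bytes->data()));
  ARROW_ASSIGN_OR_RAISE(auto shared_buffer, context->OpenIpcBuffer(*handle));
  // ... use shared_buffer, then unmap the segment:
  return context->CloseIpcBuffer(shared_buffer.get());
}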
-
Result<std::shared_ptr<CudaBuffer>> SerializeRecordBatch(const RecordBatch &batch, CudaContext *ctx)#
Write record batch message to GPU device memory.
- Parameters:
batch – [in] record batch to write
ctx – [in] CudaContext to allocate device memory from
- Returns:
CudaBuffer or Status
-
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(const std::shared_ptr<Schema> &schema, const ipc::DictionaryMemo *dictionary_memo, const std::shared_ptr<CudaBuffer> &buffer, MemoryPool *pool = default_memory_pool())#
ReadRecordBatch specialized to handle metadata on a CUDA device.
- Parameters:
schema – [in] the Schema for the record batch
dictionary_memo – [in] DictionaryMemo which has any dictionaries. Can be nullptr if you are sure there are no dictionary-encoded fields
buffer – [in] a CudaBuffer containing the complete IPC message
pool – [in] a MemoryPool to use for allocating space for the metadata
- Returns:
RecordBatch or Status
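A hedged end-to-end sketch tying the two functions together; the helper name RoundTripBatch is illustrative, batch and context are assumed to already exist, and the IPC types are assumed to come in via arrow/gpu/cuda_api.h:
arrow::Status RoundTripBatch(const std::shared_ptr<arrow::RecordBatch>& batch,
                             const std::shared_ptr<arrow::cuda::CudaContext>& context) {
  // Serialize straight into device memory.
  ARROW_ASSIGN_OR_RAISE(auto device_buffer,
                        arrow::cuda::SerializeRecordBatch(*batch, context.get()));
  // Rebuild the batch; its buffers still reference device memory.
  arrow::ipc::DictionaryMemo memo;
  ARROW_ASSIGN_OR_RAISE(
      auto gpu_batch,
      arrow::cuda::ReadRecordBatch(batch->schema(), &memo, device_buffer,
                                   arrow::default_memory_pool()));
  return arrow::Status::OK();
}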