add xlib.avecl

This commit is contained in:
iperov 2021-09-30 18:21:30 +04:00
commit 0058474da7
56 changed files with 5569 additions and 0 deletions

View file

@ -0,0 +1,109 @@
from typing import Iterable, Union
import numpy as np
from . import OpenCL as CL
class Buffer:
    """
    Represents a physical buffer associated with a physical device.

    The underlying cl_mem is requested from the device's buffer pool lazily,
    on the first get_cl_mem() call, and handed back to the pool by
    free_cl_mem() or when the Buffer is garbage collected.
    """
    __slots__ = ['_device', '_cl_mem', '_size', '_on_initialize']

    # Number of live Buffer instances; maintained by __init__ / __del__.
    _object_count = 0

    def __init__(self, device : 'Device', size : int, on_initialize = None):
        """
        arguments

            device          Device that owns the memory

            size            int, size of the buffer in bytes

            on_initialize   optional callable without arguments, invoked
                            every time a cl_mem is allocated for this Buffer
        """
        Buffer._object_count += 1
        self._device = device
        self._size = size
        self._cl_mem = None
        self._on_initialize = on_initialize

    def __del__(self):
        #print('Buffer.__del__')
        Buffer._object_count -= 1
        self.free_cl_mem()

    def get_device(self) -> 'Device':
        return self._device

    def get_size(self) -> int:
        return self._size

    def has_cl_mem(self) -> bool:
        return self._cl_mem is not None

    def get_cl_mem(self) -> 'CL.cl_mem':
        """
        Returns the cl_mem of this Buffer, allocating it from the device
        pool (and calling on_initialize) if it does not exist yet.
        """
        if self._cl_mem is None:
            self._cl_mem = self._device._cl_mem_pool_alloc(self._size)
            if self._on_initialize is not None:
                self._on_initialize()
        return self._cl_mem

    def free_cl_mem(self):
        """
        Returns the cl_mem (if any) to the device pool.
        """
        if self._cl_mem is not None:
            self._device._cl_mem_pool_free(self._cl_mem)
            self._cl_mem = None

    def set(self, value : Union['Buffer', np.ndarray]):
        """
        Parameters

            value   Buffer      copy data from other Buffer.

                    np.ndarray  copies values from ndarray
                                to Buffer's memory

        Raises

            Exception            Buffer sizes differ, or an OpenCL call failed
            NotImplementedError  source Buffer lives on another device
            ValueError           value has wrong type, wrong byte size
                                 or cannot be made contiguous
        """
        if isinstance(value, Buffer):
            # Copying a Buffer onto itself is a no-op.
            if self is not value:
                if self._size != value._size:
                    raise Exception(f'Unable to copy from Buffer with {value._size} size to buffer with {self._size} size.')

                if self._device == value._device:
                    clr = CL.clEnqueueCopyBuffer(self._device._get_ctx_q(), value.get_cl_mem(), self.get_cl_mem(), 0, 0, self._size, 0, None, None)
                    # The result used to be ignored, so a failed copy went unnoticed.
                    if clr != CL.CLERROR.SUCCESS:
                        raise Exception(f'clEnqueueCopyBuffer error: {clr}')
                else:
                    # Transfer between devices will cause low performance
                    raise NotImplementedError()
        else:
            if not isinstance(value, np.ndarray):
                raise ValueError (f'Invalid type {value.__class__}. Must be np.ndarray.')

            if value.nbytes != self._size:
                raise ValueError(f'Value size {value.nbytes} does not match Buffer size {self._size}.')

            if not value.flags.contiguous:
                # reshape() of a non-contiguous array produces a contiguous copy
                value = value.reshape(-1)
                if not value.flags.contiguous:
                    raise ValueError ("Unable to write from non-contiguous np array.")

            # Non-blocking write followed by an explicit wait on its event.
            ev = CL.cl_event()
            clr = CL.clEnqueueWriteBuffer(self._device._get_ctx_q(), self.get_cl_mem(), False, 0, value.nbytes, value.ctypes.data_as(CL.c_void_p), 0, None, ev)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clEnqueueWriteBuffer error: {clr}')

            CL.clWaitForEvents(1, ( CL.cl_event * 1 )(ev) )
            CL.clReleaseEvent(ev)

    def np(self, shape : Iterable, dtype : np.dtype):
        """
        Returns data of buffer as np.ndarray with specified shape and dtype

        Raises ValueError if shape/dtype do not match the Buffer size in bytes.
        """
        out_np_value = np.empty (shape, dtype)

        if out_np_value.nbytes != self._size:
            raise ValueError(f'Unable to represent Buffer with size {self._size} as shape {shape} with dtype {dtype}')

        # Blocking read straight into the numpy array memory.
        clr = CL.clEnqueueReadBuffer(self._device._get_ctx_q(), self.get_cl_mem(), True, 0, self._size, out_np_value.ctypes.data, 0, None, None)
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clEnqueueReadBuffer error: {clr}')

        return out_np_value

    def __str__(self):
        return f'Buffer [{self._size} bytes][{f"{self._cl_mem.value}" if self._cl_mem is not None else "unallocated"}] on {str(self._device)}'

    def __repr__(self):
        return self.__str__()

View file

@ -0,0 +1,522 @@
from typing import List, Union
import numpy as np
from . import OpenCL as CL
from .Buffer import Buffer
from .DeviceInfo import DeviceInfo
from .Kernel import Kernel
# Maps the numpy scalar class of a kernel argument to the ctypes OpenCL
# scalar type used to pass it to clSetKernelArg.
_np_dtype_to_cl = {
    # unsigned / signed integers, by width
    np.uint8:   CL.cl_uchar,
    np.int8:    CL.cl_char,
    np.uint16:  CL.cl_ushort,
    np.int16:   CL.cl_short,
    np.uint32:  CL.cl_uint,
    np.int32:   CL.cl_int,
    np.uint64:  CL.cl_ulong,
    np.int64:   CL.cl_long,
    # floating point
    np.float16: CL.cl_half,
    np.float32: CL.cl_float,
    np.float64: CL.cl_double,
}

# Lazily filled list of usable cl_device_id's (see _get_opencl_device_ids).
_opencl_device_ids = None

# Device returned by get_default_device(); chosen on first request.
_default_device = None

# Device instances created so far, keyed by DeviceInfo.
_devices = {}
class Device:
    """
    Represents physical TensorCL device

    Holds the OpenCL context and command queue (both created on demand),
    the kernels compiled for this device, a pool of released cl_mem objects
    kept for reuse, and arbitrary per-device cached data.

    Must be obtained through get_device(), not constructed directly.
    """

    def __init__(self, device_info : DeviceInfo, **kwargs):
        if kwargs.get('_check', None) is None:
            raise Exception('You should not to create Device from constructor. Use get_device()')

        self._cached_data = {}        # cached data (per device) by key
        self._pooled_buffers = {}     # Pool of cached device buffers: size -> [cl_mem, ...]
        self._compiled_kernels = {}   # compiled kernels by key: key -> (cl_kernel, cl_program)
        self._ctx_q = None            # CL command queue
        self._ctx = None              # CL context

        # Counters. "allocated" covers every live cl_mem of this device,
        # including the ones currently parked in the pool.
        self._total_memory_allocated = 0
        self._total_buffers_allocated = 0
        self._total_memory_pooled = 0
        self._total_buffers_pooled = 0

        self._device_info = device_info
        self._device_id = _get_opencl_device_ids()[device_info.get_index()]

    def __del__(self):
        # __init__ may have raised before any state was created
        # (e.g. direct construction without _check); nothing to free then.
        if hasattr(self, '_device_info'):
            self.cleanup()

    def __eq__(self, other):
        if isinstance(other, Device):
            return self._device_id.value == other._device_id.value
        return False

    def __hash__(self):
        return self._device_id.value

    def _get_ctx(self) -> CL.cl_context:
        # Create OpenCL context on demand
        if self._ctx is None:
            clr = CL.CLRESULT()
            ctx = CL.clCreateContext( None, 1, (CL.cl_device_id * 1)( self._device_id ), None, None, clr)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception('Unable to create OpenCL context.')
            self._ctx = ctx
        return self._ctx

    def _get_ctx_q(self) -> CL.cl_command_queue:
        # Create CommandQueue on demand
        if self._ctx_q is None:
            clr = CL.CLRESULT()
            ctx_q = CL.clCreateCommandQueue(self._get_ctx(), self._device_id, CL.cl_command_queue_properties(0), clr)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception('Unable to create OpenCL CommandQueue.')
            self._ctx_q = ctx_q
        return self._ctx_q

    def get_description(self) -> str:
        return f"{self._device_info.get_name()} [{(self._device_info.get_total_memory() / 1024**3) :.3}Gb]"

    def __str__(self):
        return self.get_description()

    def __repr__(self):
        return f'{self.__class__.__name__} object: ' + self.__str__()

    def set_cached_data(self, key, value):
        """
        All cached data will be freed with cleanup()
        """
        self._cached_data[key] = value

    def get_cached_data(self, key):
        """
        Returns data stored with set_cached_data(), or None if key is unknown.
        """
        return self._cached_data.get(key, None)

    def get_total_allocated_memory(self):
        return self._total_memory_allocated

    def get_max_malloc_size(self) -> int:
        """
        Returns CL_DEVICE_MAX_MEM_ALLOC_SIZE of the device.
        """
        size = CL.cl_ulong()
        clr = CL.clGetDeviceInfo(self._device_id, CL.CL_DEVICE_MAX_MEM_ALLOC_SIZE, CL.sizeof(size), CL.byref(size), None)
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clGetDeviceInfo error: {clr}')
        return size.value

    def _get_build_log(self, prog) -> str:
        # Fetch the compiler output of a failed clBuildProgram:
        # first query the log size, then read the log itself.
        build_log_size = CL.c_size_t()
        clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, 0, None, CL.byref(build_log_size) )
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clGetProgramBuildInfo,error: {clr}')

        build_log = CL.create_string_buffer(build_log_size.value)
        clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, build_log_size.value, build_log, None )
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clGetProgramBuildInfo error: {clr}')

        return str(build_log.value, 'utf-8')

    def _compile_kernel(self, key, kernel_text) -> CL.cl_kernel:
        """
        compile or get cached kernel

        Raises Exception if compilation fails or if kernel_text does not
        contain exactly one __kernel.
        """
        compiled_krn, prog = self._compiled_kernels.get(key, (None, None) )
        if compiled_krn is not None:
            return compiled_krn

        clr = CL.CLRESULT()
        prog = CL.clCreateProgramWithSource(self._get_ctx(), 1, CL.c_char_p(kernel_text.encode()), None, clr )
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clCreateProgramWithSource error {clr}, with kernel_text:\n{kernel_text}')

        try:
            clr = CL.clBuildProgram(prog, 1, (CL.cl_device_id*1)(self._device_id), CL.c_char_p('-cl-std=CL1.2 -cl-single-precision-constant'.encode()), None, None )
            if clr != CL.CLERROR.SUCCESS:
                build_log = self._get_build_log(prog)
                raise Exception(f'clBuildProgram error:\n\n{build_log}')

            # First call only counts the kernels in the program.
            num_kernels = CL.cl_uint()
            clr = CL.clCreateKernelsInProgram(prog, 0, None, CL.byref(num_kernels))
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clCreateKernelsInProgram error: {clr}')

            if num_kernels.value != 1:
                raise Exception(f'Kernel must contain only one __kernel:\n\n{kernel_text}')

            kernels = (CL.cl_kernel * num_kernels.value)()
            clr = CL.clCreateKernelsInProgram(prog, num_kernels.value, kernels, None)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clCreateKernelsInProgram error: {clr}')
        except Exception:
            # The program is not cached on failure, so nobody else would
            # ever release it.
            CL.clReleaseProgram(prog)
            raise

        compiled_krn = kernels[0]
        self._compiled_kernels[key] = (compiled_krn, prog)
        return compiled_krn

    def _cl_mem_alloc(self, size) -> CL.cl_mem:
        """
        Allocates a new cl_mem of given size.

        Returns None if the buffer cannot be created or the probe fill fails.
        """
        clr = CL.CLRESULT()
        mem = CL.clCreateBuffer(self._get_ctx(), CL.CL_MEM_READ_WRITE, size, None, clr)
        if clr != CL.CLERROR.SUCCESS:
            return None

        # Fill one byte to check memory availability
        ev = CL.cl_event()
        clr = CL.clEnqueueFillBuffer (self._get_ctx_q(), mem, (CL.c_char * 1)(), 1, 0, 1, 0, None, ev )
        if clr != CL.CLERROR.SUCCESS:
            # The buffer object exists but is unusable; release it so the
            # failed attempt does not leak it.
            CL.clReleaseMemObject(mem)
            return None

        CL.clReleaseEvent(ev)
        self._total_memory_allocated += size
        self._total_buffers_allocated += 1
        return mem

    def _cl_mem_free(self, mem : CL.cl_mem):
        """
        Releases cl_mem and updates the allocation counters.
        """
        size = CL.c_size_t()
        clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clGetMemObjectInfo error: {clr}')
        size = size.value

        self._total_memory_allocated -= size
        self._total_buffers_allocated -= 1
        clr = CL.clReleaseMemObject(mem)
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clReleaseMemObject error: {clr}')

    def _cl_mem_pool_alloc(self, size):
        """
        allocate or get cl_mem from pool

        Raises Exception if allocation fails even after every pooled
        buffer has been released.
        """
        pool = self._pooled_buffers

        # First try to get pooled buffer
        ar = pool.get(size, None)
        if ar is not None and len(ar) != 0:
            mem = ar.pop(-1)
            self._total_memory_pooled -= size
            self._total_buffers_pooled -= 1
            return mem

        # No pooled buffer, try to allocate new
        while True:
            mem = self._cl_mem_alloc(size)
            if mem is not None:
                return mem

            # MemoryError. Finding largest pooled buffer to release
            buf_to_release = None
            released_size = 0
            for size_key in sorted(pool.keys(), reverse=True):
                ar = pool[size_key]
                if len(ar) != 0:
                    buf_to_release = ar.pop(-1)
                    released_size = size_key
                    break

            if buf_to_release is None:
                raise Exception(f'Unable to allocate {size // 1024**2}Mb on {str(self)}')

            # The buffer leaves the pool, keep pool statistics in sync.
            self._total_memory_pooled -= released_size
            self._total_buffers_pooled -= 1

            # Release pooled buffer and try to allocate again
            self._cl_mem_free(buf_to_release)

    def _cl_mem_pool_free(self, mem : CL.cl_mem):
        """
        Put cl_mem to pool for reuse in future.
        """
        size = CL.c_size_t()
        clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clGetMemObjectInfo error: {clr}')
        size = size.value

        pool = self._pooled_buffers
        ar = pool.get(size, None)
        if ar is None:
            ar = pool[size] = []
        ar.append(mem)

        self._total_memory_pooled += size
        self._total_buffers_pooled += 1

    def print_stat(self):
        """
        Prints allocation / pool / cache counters of this Device.
        """
        s = f'''
Total memory allocated:  {self._total_memory_allocated}
Total buffers allocated: {self._total_buffers_allocated}
Total memory pooled:     {self._total_memory_pooled}
Total buffers pooled:    {self._total_buffers_pooled}
N of compiled kernels:   {len(self._compiled_kernels)}
N of cacheddata:         {len(self._cached_data)}
'''
        print(s)

    @staticmethod
    def _to_size_t_array(values, work_dim):
        # Pack a sequence of ints into a C array of size_t with work_dim slots.
        ar = (CL.c_size_t*work_dim)()
        for i,v in enumerate(values):
            ar[i] = v
        return ar

    def run_kernel(self, kernel : Kernel, *args, global_shape=None, local_shape=None, global_shape_offsets=None, wait=False):
        """
        Run kernel on Device

        Arguments

            *args           arguments will be passed to OpenCL kernel
                            allowed types:
                            Buffer
                            np single value

            global_shape(None)  tuple of ints, up to 3 dims
                                amount of parallel kernel executions.
                                in OpenCL kernel,
                                id can be obtained via get_global_id(dim)

            local_shape(None)   tuple of ints, up to 3 dims
                                specifies local groups of every dim of global_shape.
                                in OpenCL kernel,
                                id can be obtained via get_local_id(dim)

            global_shape_offsets(None)  tuple of ints
                                        offsets for global_shape

            wait(False)     wait execution to complete

        If global_shape / local_shape are None, the defaults stored in
        the kernel are used.

        Raises ValueError for missing/mismatched shapes or unsupported
        argument types, Exception if an OpenCL call fails.
        """
        ckernel = self._compile_kernel(kernel, kernel.get_kernel_text())

        if global_shape is None:
            global_shape = kernel.get_global_shape()
        if global_shape is None:
            raise ValueError('global_shape must be defined.')

        work_dim = len(global_shape)
        global_shape_ar = self._to_size_t_array(global_shape, work_dim)

        local_shape_ar = None
        if local_shape is None:
            local_shape = kernel.get_local_shape()
        if local_shape is not None:
            if len(local_shape) != work_dim:
                raise ValueError('len of local_shape must match global_shape')
            local_shape_ar = self._to_size_t_array(local_shape, work_dim)

        global_shape_offsets_ar = None
        if global_shape_offsets is not None:
            if len(global_shape_offsets) != work_dim:
                raise ValueError('len of global_shape_offsets must match global_shape')
            # The offsets array must be filled from global_shape_offsets
            # (it used to be filled from local_shape by mistake).
            global_shape_offsets_ar = self._to_size_t_array(global_shape_offsets, work_dim)

        for i, arg in enumerate(args):
            if isinstance(arg, Buffer):
                arg = arg.get_cl_mem()
            else:
                # numpy scalar -> matching ctypes OpenCL scalar
                cl_type = _np_dtype_to_cl.get(arg.__class__, None)
                if cl_type is None:
                    raise ValueError(f'Cannot convert type {arg.__class__} to OpenCL type.')
                arg = cl_type(arg)

            clr = CL.clSetKernelArg(ckernel, i, CL.sizeof(arg), CL.byref(arg))
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clSetKernelArg error: {clr}')

        # An event is only requested when the caller wants to block on it.
        ev = CL.cl_event() if wait else None

        clr = CL.clEnqueueNDRangeKernel(self._get_ctx_q(), ckernel, work_dim, global_shape_offsets_ar, global_shape_ar, local_shape_ar, 0, None, ev)
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clEnqueueNDRangeKernel error: {clr}')

        if wait:
            CL.clWaitForEvents(1, (CL.cl_event*1)(ev) )
            CL.clReleaseEvent(ev)

    def wait(self):
        """
        Wait to finish all queued operations on this Device
        """
        clr = CL.clFinish(self._get_ctx_q())
        if clr != CL.CLERROR.SUCCESS:
            raise Exception(f'clFinish error: {clr}')

    def cleanup(self):
        """
        Frees all resources from this Device.

        Raises Exception if some Buffers still hold memory of this device,
        or if an OpenCL release call fails.
        """
        self._cached_data = {}

        for pooled_mems in self._pooled_buffers.values():
            for mem in pooled_mems:
                self._cl_mem_free(mem)
        self._pooled_buffers = {}
        self._total_memory_pooled = 0
        self._total_buffers_pooled = 0

        if self._total_memory_allocated != 0:
            raise Exception('Unable to cleanup CLDevice, while not all Buffers are deallocated.')

        for kernel, prog in self._compiled_kernels.values():
            clr = CL.clReleaseKernel(kernel)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clReleaseKernel error: {clr}')

            clr = CL.clReleaseProgram(prog)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clReleaseProgram error: {clr}')
        self._compiled_kernels = {}

        if self._ctx_q is not None:
            clr = CL.clReleaseCommandQueue(self._ctx_q)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clReleaseCommandQueue error: {clr}')
            self._ctx_q = None

        if self._ctx is not None:
            clr = CL.clReleaseContext(self._ctx)
            if clr != CL.CLERROR.SUCCESS:
                raise Exception(f'clReleaseContext error: {clr}')
            self._ctx = None
def _get_opencl_device_ids() -> List[CL.cl_device_id]:
    """
    Returns the list of cl_device_id's of all CPU / GPU / accelerator
    devices reporting OpenCL version 1.2 or newer.

    The platforms are enumerated only on the first call; the resulting list
    is stored in the module and returned by every later call.
    """
    global _opencl_device_ids
    if _opencl_device_ids is not None:
        return _opencl_device_ids

    # Publish the list first and fill it in place.
    found_ids = _opencl_device_ids = []
    ok = CL.CLERROR.SUCCESS
    wanted_types = CL.CL_DEVICE_TYPE_CPU | CL.CL_DEVICE_TYPE_ACCELERATOR | CL.CL_DEVICE_TYPE_GPU

    # Count the platforms, then fetch their ids.
    platform_count = CL.cl_uint()
    if CL.clGetPlatformIDs(0, None, platform_count) != ok:
        return found_ids
    if platform_count.value == 0:
        return found_ids

    platforms = (CL.cl_platform_id * platform_count.value) ()
    if CL.clGetPlatformIDs(platform_count.value, platforms, None) != ok:
        return found_ids

    for platform_idx in range(platform_count.value):
        platform = platforms[platform_idx]

        # Same two-step query for the devices of this platform.
        device_count = CL.cl_uint(0)
        if CL.clGetDeviceIDs(platform, wanted_types, 0, None, device_count) != ok:
            continue
        if device_count.value == 0:
            continue

        platform_devices = (CL.cl_device_id * device_count.value)()
        if CL.clGetDeviceIDs(platform, wanted_types, device_count.value, platform_devices, None) != ok:
            continue

        for device_idx in range(device_count.value):
            device_id = platform_devices[device_idx]
            if device_id is None:
                continue

            # Check OpenCL version.
            version_size = CL.c_size_t()
            if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, 0, None, version_size) != ok:
                continue

            version_buf = CL.create_string_buffer(version_size.value)
            if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, version_size.value, version_buf, None) != ok:
                continue

            # The second space-separated token is parsed as "<major>.<minor>".
            version_text = str(version_buf.value, 'ascii')
            major, minor = version_text.split(' ')[1].split('.')
            if int(major)*10 + int(minor) >= 12:
                found_ids.append(device_id)

    return found_ids
def get_available_devices_info() -> List[DeviceInfo]:
    """
    returns a list of available picklable DeviceInfo's

    Every query is best-effort: a failed name query leaves 'undefined',
    a failed memory query leaves 0.
    """
    ok = CL.CLERROR.SUCCESS
    result = []

    for index, dev_id in enumerate(_get_opencl_device_ids()):
        # Device name: query the size, then the string itself.
        name = 'undefined'
        name_len = CL.c_size_t()
        if CL.clGetDeviceInfo(dev_id, CL.CL_DEVICE_NAME, 0, None, name_len) == ok:
            name_buf = CL.create_string_buffer(name_len.value)
            if CL.clGetDeviceInfo(dev_id, CL.CL_DEVICE_NAME, name_len.value, name_buf, None) == ok:
                name = str(name_buf.value, 'ascii')

        # Total global memory.
        total_memory = 0
        mem_size = CL.cl_ulong()
        if CL.clGetDeviceInfo(dev_id, CL.CL_DEVICE_GLOBAL_MEM_SIZE, CL.sizeof(mem_size), CL.byref(mem_size), None) == ok:
            total_memory = mem_size.value

        # Vendor id and compute units; results of these calls are not checked.
        vendor = CL.cl_uint()
        CL.clGetDeviceInfo(dev_id, CL.CL_DEVICE_VENDOR_ID, CL.sizeof(vendor), CL.byref(vendor), None)

        compute_units = CL.cl_uint()
        CL.clGetDeviceInfo(dev_id, CL.CL_DEVICE_MAX_COMPUTE_UNITS, CL.sizeof(compute_units), CL.byref(compute_units), None)

        # Performance level is the compute unit count,
        # pushed far down for Intel devices.
        level = compute_units.value
        if vendor.value == 0x8086: # Intel device
            level = level - 1000

        result.append( DeviceInfo(index=index, name=name, total_memory=total_memory, performance_level=level ) )

    return result
def get_default_device() -> Union[Device, None]:
    """
    Returns the default device.

    If none has been set yet, get_device(0) is stored as the default.
    """
    global _default_device
    if _default_device is not None:
        return _default_device

    _default_device = get_device(0)
    return _default_device
def set_default_device(device : Device):
    """
    Makes device the one returned by get_default_device().

    Raises ValueError if device is not a Device instance.
    """
    global _default_device
    if isinstance(device, Device):
        _default_device = device
        return
    raise ValueError('device must be an instance of Device')
def get_device(arg : Union[None, int, Device, DeviceInfo]) -> Union[Device, None]:
    """
    get physical TensorCL device.

        arg     None        - get best device
                int         - by index
                DeviceInfo  - by device info
                Device      - returns the same

    Returns None when an int index is past the end of the available devices.
    Raises ValueError for any other type of arg.
    """
    global _devices

    if arg is None:
        return get_best_device()
    if isinstance(arg, Device):
        return arg

    if isinstance(arg, int):
        # Translate the index into its DeviceInfo.
        infos = get_available_devices_info()
        if not (arg < len(infos)):
            return None
        arg = infos[arg]
    elif not isinstance(arg, DeviceInfo):
        raise ValueError(f'Unknown type of arg {arg.__class__}')

    # One Device instance per DeviceInfo, created on first request.
    device = _devices.get(arg, None)
    if device is not None:
        return device

    device = Device(arg, _check=1)
    _devices[arg] = device
    return device
def get_best_device() -> Union[Device, None]:
    """
    returns best device from available.

    The first device with the highest performance level wins;
    None is returned when no device qualifies.
    """
    best_info = None
    best_level = -999999

    for info in get_available_devices_info():
        level = info.get_performance_level()
        if level > best_level:
            best_info, best_level = info, level

    if best_info is None:
        return None
    return get_device(best_info)
def cleanup_devices():
    """
    Calls cleanup() on every Device created by get_device()
    and forgets all of them.
    """
    global _devices

    # Snapshot the values before iterating.
    created = list(_devices.values())
    for dev in created:
        dev.cleanup()

    _devices = {}

View file

@ -0,0 +1,44 @@
class DeviceInfo:
    """
    Represents picklable OpenCL device info

    Two DeviceInfo's are equal (and hash equally) when their index is equal.
    """

    def __init__(self, index : int = None, name : str = None, total_memory : int = None, performance_level : int = None):
        self._index = index
        self._name = name
        self._total_memory = total_memory
        self._performance_level = performance_level

    def __getstate__(self):
        return self.__dict__.copy()

    def __setstate__(self, d):
        # Start from defaults, then overwrite with the pickled fields.
        self.__init__()
        self.__dict__.update(d)

    def get_index(self) -> int:
        return self._index

    def get_name(self) -> str:
        return self._name

    def get_total_memory(self) -> int:
        return self._total_memory

    def get_performance_level(self) -> int:
        return self._performance_level

    def __eq__(self, other):
        if isinstance(other, DeviceInfo):
            return self._index == other._index
        return False

    def __hash__(self):
        # hash() keeps int indexes unchanged and also works while the index
        # is still None (returning None from __hash__ raises TypeError).
        return hash(self._index)

    def __str__(self):
        # total_memory is None for a default-constructed instance;
        # report it as zero instead of failing on the division.
        total_memory = self._total_memory if self._total_memory is not None else 0
        return f"[{self._index}] {self._name} [{(total_memory / 1024**3) :.3}Gb]"

    def __repr__(self):
        return f'{self.__class__.__name__} object: ' + self.__str__()

View file

@ -0,0 +1,26 @@
class Kernel:
    """
    TensorCL kernel.

    A plain holder of kernel source and default launch shapes.
    It does not allocate any resources, thus can be used as static variable within class.

    arguments

        kernel_text    OpenCL text of kernel. Must contain only one __kernel

        global_shape   default global_shape for .run()

        local_shape    default local_shape for .run()
    """

    def __init__(self, kernel_text, global_shape=None, local_shape=None):
        self._local_shape = local_shape
        self._global_shape = global_shape
        self._kernel_text = kernel_text

    def get_kernel_text(self) -> str:
        """OpenCL source passed to the constructor."""
        return self._kernel_text

    def get_global_shape(self):
        """Default global shape, or None if not specified."""
        return self._global_shape

    def get_local_shape(self):
        """Default local shape, or None if not specified."""
        return self._local_shape

    def __str__(self):
        text = self._kernel_text
        return f'Kernel: \n{text}'

    def __repr__(self):
        return self.__str__()

View file

@ -0,0 +1,278 @@
"""
Minimal OpenCL 1.2 low level ctypes API.
"""
import ctypes
from ctypes import POINTER, create_string_buffer, sizeof, c_char_p, c_char, c_size_t, c_void_p, byref
from ctypes.util import find_library
from enum import IntEnum
# Loaded libraries by the name they were requested with.
dlls_by_name = {}

def dll_import(dll_name):
    """
    Returns a decorator that binds a declared function stub to the function
    of the same name exported by the dll_name shared library.

    The annotations of the stub define the ctypes signature: every
    annotation except the last one is an argument type, the last one
    (the return annotation) is the result type. The body of the stub
    is never executed.

    The library is located with find_library() and loaded once;
    later calls with the same dll_name reuse the loaded library.

    Raises RuntimeError if the library cannot be found or loaded.
    """
    dll = dlls_by_name.get(dll_name, None)
    if dll is None:
        dll_path = find_library(dll_name)
        if dll_path is None:
            # Do not pass None to LoadLibrary: on POSIX that loads the
            # running program itself instead of failing.
            raise RuntimeError(f'Unable to load {dll_name} library.')

        try:
            dll = ctypes.cdll.LoadLibrary(dll_path)
        except OSError as e:
            raise RuntimeError(f'Unable to load {dll_name} library.') from e

        dlls_by_name[dll_name] = dll

    def decorator(func):
        dll_func = getattr(dll, func.__name__)

        # Annotation values in declaration order, the return annotation last.
        anno = list(func.__annotations__.values())
        dll_func.argtypes = anno[:-1]
        dll_func.restype = anno[-1]

        def wrapper(*args):
            return dll_func(*args)
        return wrapper

    return decorator
# OpenCL scalar types expressed as ctypes types of the same width.
class cl_char(ctypes.c_int8):
    """8-bit signed integer."""

class cl_uchar(ctypes.c_uint8):
    """8-bit unsigned integer."""

class cl_short(ctypes.c_int16):
    """16-bit signed integer."""

class cl_ushort(ctypes.c_uint16):
    """16-bit unsigned integer."""

class cl_int(ctypes.c_int32):
    """32-bit signed integer."""

class cl_uint(ctypes.c_uint32):
    """32-bit unsigned integer."""

class cl_long(ctypes.c_int64):
    """64-bit signed integer."""

class cl_ulong(ctypes.c_uint64):
    """64-bit unsigned integer."""

class cl_half(ctypes.c_uint16):
    """Half float, carried as 16 raw bits."""

class cl_float(ctypes.c_float):
    """Single precision float."""

class cl_double(ctypes.c_double):
    """Double precision float."""

class cl_bool(cl_uint):
    """Boolean, stored as cl_uint."""


class cl_bitfield(cl_ulong):
    """
    64-bit set of flags.

    Supports |, &, ^ and ~ ; binary operators and `in` require the other
    operand to be an instance of the same class as self. Instances compare
    and hash by value.
    """

    def __or__(self, other):
        assert isinstance(other, self.__class__)
        return self.__class__(self.value | other.value)

    def __and__(self, other):
        assert isinstance(other, self.__class__)
        return self.__class__(self.value & other.value)

    def __xor__(self, other):
        assert isinstance(other, self.__class__)
        return self.__class__(self.value ^ other.value)

    def __invert__(self):
        # Python calls __invert__ for the ~ operator.
        return self.__class__(~self.value)

    # Python has no __not__ special method; the name is kept only so that
    # existing explicit calls keep working.
    __not__ = __invert__

    def __contains__(self, other):
        """True if every bit set in other is also set in self."""
        assert isinstance(other, self.__class__)
        return (self.value & other.value) == other.value

    def __hash__(self):
        return self.value.__hash__()

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        else:
            return self.value == other.value

    def __ne__(self, other):
        return not(self == other)

    def __repr__(self):
        return f'cl_bitfield: {self.value}'
class CLERROR(IntEnum):
    """
    OpenCL status codes as returned by the API calls.
    """
    SUCCESS                                   = 0

    # runtime / build errors
    DEVICE_NOT_FOUND                          = -1
    DEVICE_NOT_AVAILABLE                      = -2
    COMPILER_NOT_AVAILABLE                    = -3
    MEM_OBJECT_ALLOCATION_FAILURE             = -4
    OUT_OF_RESOURCES                          = -5
    OUT_OF_HOST_MEMORY                        = -6
    PROFILING_INFO_NOT_AVAILABLE              = -7
    MEM_COPY_OVERLAP                          = -8
    IMAGE_FORMAT_MISMATCH                     = -9
    IMAGE_FORMAT_NOT_SUPPORTED                = -10
    BUILD_PROGRAM_FAILURE                     = -11
    MAP_FAILURE                               = -12
    MISALIGNED_SUB_BUFFER_OFFSET              = -13
    EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST = -14

    # invalid argument errors
    INVALID_VALUE                             = -30
    INVALID_DEVICE_TYPE                       = -31
    INVALID_PLATFORM                          = -32
    INVALID_DEVICE                            = -33
    INVALID_CONTEXT                           = -34
    INVALID_QUEUE_PROPERTIES                  = -35
    INVALID_COMMAND_QUEUE                     = -36
    INVALID_HOST_PTR                          = -37
    INVALID_MEM_OBJECT                        = -38
    INVALID_IMAGE_FORMAT_DESCRIPTOR           = -39
    INVALID_IMAGE_SIZE                        = -40
    INVALID_SAMPLER                           = -41
    INVALID_BINARY                            = -42
    INVALID_BUILD_OPTIONS                     = -43
    INVALID_PROGRAM                           = -44
    INVALID_PROGRAM_EXECUTABLE                = -45
    INVALID_KERNEL_NAME                       = -46
    INVALID_KERNEL_DEFINITION                 = -47
    INVALID_KERNEL                            = -48
    INVALID_ARG_INDEX                         = -49
    INVALID_ARG_VALUE                         = -50
    INVALID_ARG_SIZE                          = -51
    INVALID_KERNEL_ARGS                       = -52
    INVALID_WORK_DIMENSION                    = -53
    INVALID_WORK_GROUP_SIZE                   = -54
    INVALID_WORK_ITEM_SIZE                    = -55
    INVALID_GLOBAL_OFFSET                     = -56
    INVALID_EVENT_WAIT_LIST                   = -57
    INVALID_EVENT                             = -58
    INVALID_OPERATION                         = -59
    INVALID_GL_OBJECT                         = -60
    INVALID_BUFFER_SIZE                       = -61
    INVALID_MIP_LEVEL                         = -62
    INVALID_GLOBAL_WORK_SIZE                  = -63
    INVALID_PROPERTY                          = -64

    # KHR extension errors
    INVALID_GL_SHAREGROUP_REFERENCE_KHR       = -1000
    PLATFORM_NOT_FOUND_KHR                    = -1001
class CLRESULT(cl_int):
    """
    Status value returned by OpenCL calls.

    Compares equal to plain ints (therefore to CLERROR members) and to other
    CLRESULT's holding the same value. str() shows the CLERROR name when the
    value is a known error code.
    """

    def __eq__(self, other):
        if isinstance(other, int):
            return self.value == other
        if isinstance(other, self.__class__):
            return self.value == other.value
        return False

    def __ne__(self, other):
        return not(self == other)

    def __hash__(self):
        return self.value.__hash__()

    def __str__(self):
        try:
            # Build the symbolic form from .name explicitly: str() of an
            # IntEnum member is just the number on Python 3.11+, which
            # would drop the error name from every message.
            return f'CLRESULT (CLERROR.{CLERROR(self.value).name})'
        except ValueError:
            # Value is not a known CLERROR code.
            return f'CLRESULT ({self.value})'

    def __repr__(self):
        return self.__str__()
# Handle types are pointer-sized (c_void_p) values,
# *_info selectors are cl_uint, flag sets are cl_bitfield.

class cl_platform_id(c_void_p):
    """Platform handle."""

class cl_platform_info(cl_uint):
    """Selector for clGetPlatformInfo."""

class cl_device_id(c_void_p):
    """Device handle."""

class cl_device_type(cl_bitfield):
    """Device type flags."""

class cl_device_info(cl_uint):
    """Selector for clGetDeviceInfo."""

class cl_context(c_void_p):
    """Context handle."""

class cl_context_properties(c_void_p):
    """Context properties argument of clCreateContext."""

class cl_command_queue(c_void_p):
    """Command queue handle."""

class cl_command_queue_properties(cl_bitfield):
    """Command queue property flags."""

class cl_event(c_void_p):
    """Event handle."""

class cl_mem(c_void_p):
    """Memory object handle."""

class cl_mem_info(cl_uint):
    """Selector for clGetMemObjectInfo."""

class cl_mem_flags(cl_bitfield):
    """Memory object creation flags."""

class cl_program(c_void_p):
    """Program handle."""

class cl_program_build_info(cl_uint):
    """Selector for clGetProgramBuildInfo."""

class cl_kernel(c_void_p):
    """Kernel handle."""
# Values are taken from
# https://github.com/KhronosGroup/OpenCL-Headers/blob/master/CL/cl.h

# cl_platform_info
CL_PLATFORM_PROFILE = cl_platform_info(0x0900)
CL_PLATFORM_VERSION = cl_platform_info(0x0901)
CL_PLATFORM_NAME = cl_platform_info(0x0902)
CL_PLATFORM_VENDOR = cl_platform_info(0x0903)
CL_PLATFORM_EXTENSIONS = cl_platform_info(0x0904)

# cl_device_type
CL_DEVICE_TYPE_DEFAULT = cl_device_type(1 << 0)
CL_DEVICE_TYPE_CPU = cl_device_type(1 << 1)
CL_DEVICE_TYPE_GPU = cl_device_type(1 << 2)
CL_DEVICE_TYPE_ACCELERATOR = cl_device_type(1 << 3)
CL_DEVICE_TYPE_ALL = cl_device_type(0xFFFFFFFF)

# cl_device_info
CL_DEVICE_TYPE = cl_device_info(0x1000)
CL_DEVICE_VENDOR_ID = cl_device_info(0x1001)
CL_DEVICE_MAX_COMPUTE_UNITS = cl_device_info(0x1002)
CL_DEVICE_GLOBAL_MEM_SIZE = cl_device_info(0x101F)
CL_DEVICE_NAME = cl_device_info(0x102B)
CL_DEVICE_VERSION = cl_device_info(0x102F)
CL_DEVICE_MAX_MEM_ALLOC_SIZE = cl_device_info(0x1010)
CL_DEVICE_MAX_WORK_GROUP_SIZE = cl_device_info(0x1004)
CL_DRIVER_VERSION = cl_device_info(0x102D)
CL_DEVICE_EXTENSIONS = cl_device_info(0x1030)

# cl_mem_flags
CL_MEM_READ_WRITE = cl_mem_flags(1 << 0)
CL_MEM_WRITE_ONLY = cl_mem_flags(1 << 1)
CL_MEM_READ_ONLY = cl_mem_flags(1 << 2)
CL_MEM_USE_HOST_PTR = cl_mem_flags(1 << 3)
CL_MEM_ALLOC_HOST_PTR = cl_mem_flags(1 << 4)
CL_MEM_COPY_HOST_PTR = cl_mem_flags(1 << 5)

# cl_mem_info
CL_MEM_SIZE = cl_mem_info(0x1102)

# cl_program_build_info
CL_PROGRAM_BUILD_STATUS = cl_program_build_info(0x1181)
CL_PROGRAM_BUILD_OPTIONS = cl_program_build_info(0x1182)
CL_PROGRAM_BUILD_LOG = cl_program_build_info(0x1183)
# Bindings of the OpenCL functions used by the package.
# Each stub only declares the ctypes signature through its annotations
# (argument types, then the result type); dll_import replaces it with a
# wrapper around the function of the same name from the OpenCL library.

# Platforms and devices

@dll_import('OpenCL')
def clGetPlatformIDs (num_entries : cl_uint, platforms : POINTER(cl_platform_id), num_platforms : POINTER(cl_uint) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clGetPlatformInfo (platform : cl_platform_id, param_name : cl_platform_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...

@dll_import('OpenCL')
def clGetDeviceIDs (platform : cl_platform_id, device_type : cl_device_type, num_entries : cl_uint, devices : POINTER(cl_device_id), num_devices : POINTER(cl_uint)) -> CLRESULT: ...

@dll_import('OpenCL')
def clGetDeviceInfo(device : cl_device_id, param_name : cl_device_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...

# Contexts and command queues

@dll_import('OpenCL')
def clCreateContext(properties : cl_context_properties, num_devices : cl_uint, devices : POINTER(cl_device_id), pfn_notify : c_void_p, user_data : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_context: ...

@dll_import('OpenCL')
def clReleaseContext(context : cl_context) -> CLRESULT: ...

@dll_import('OpenCL')
def clCreateCommandQueue(context : cl_context, device : cl_device_id, properties : cl_command_queue_properties, errcode_ret : POINTER(CLRESULT) ) -> cl_command_queue: ...

@dll_import('OpenCL')
def clReleaseCommandQueue(command_queue : cl_command_queue) -> CLRESULT: ...

@dll_import('OpenCL')
def clFinish(command_queue : cl_command_queue) -> CLRESULT: ...

# Events

@dll_import('OpenCL')
def clWaitForEvents(num_events : cl_uint, event_list : POINTER(cl_event) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clReleaseEvent(event : cl_event) -> CLRESULT: ...

# Memory objects

@dll_import('OpenCL')
def clCreateBuffer(context : cl_context, flags : cl_mem_flags, size : c_size_t, host_ptr : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_mem: ...

@dll_import('OpenCL')
def clGetMemObjectInfo(memobj : cl_mem, param_name : cl_mem_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clReleaseMemObject(memobj : cl_mem) -> CLRESULT: ...

@dll_import('OpenCL')
def clEnqueueReadBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_read : cl_bool, offset : c_size_t, cb : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clEnqueueWriteBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_write : cl_bool, offset : c_size_t, size : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...

# event is an output parameter (cl_event*), like in the other clEnqueue*
# calls; declaring it as a by-value cl_event would pass the handle itself.
@dll_import('OpenCL')
def clEnqueueCopyBuffer (command_queue : cl_command_queue, src_buffer : cl_mem, dst_buffer : cl_mem, src_offset : c_size_t, dst_offset : c_size_t, cb : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clEnqueueFillBuffer (command_queue : cl_command_queue, buffer : cl_mem, pattern : c_void_p, pattern_size : c_size_t, offset : c_size_t, size : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...

# Programs and kernels

@dll_import('OpenCL')
def clCreateProgramWithSource (context : cl_context, count : cl_uint, strings : POINTER(c_char_p), lengths : POINTER(c_size_t), errcode_ret : POINTER(CLRESULT) ) -> cl_program: ...

@dll_import('OpenCL')
def clReleaseProgram (program : cl_program) -> CLRESULT: ...

@dll_import('OpenCL')
def clBuildProgram (program : cl_program, num_devices : cl_uint, device_list : POINTER(cl_device_id), options : c_char_p, pfn_notify : c_void_p, user_data : c_void_p) -> CLRESULT: ...

@dll_import('OpenCL')
def clGetProgramBuildInfo (program : cl_program, device : cl_device_id, param_name : cl_program_build_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clCreateKernelsInProgram (program : cl_program, num_kernels : cl_uint, kernels : POINTER(cl_kernel), num_kernels_ret : POINTER(cl_uint) ) -> CLRESULT: ...

@dll_import('OpenCL')
def clReleaseKernel (kernel : cl_kernel) -> CLRESULT: ...

@dll_import('OpenCL')
def clSetKernelArg (kernel : cl_kernel, arg_index : cl_uint, arg_size : c_size_t, arg_value : c_void_p) -> CLRESULT: ...

@dll_import('OpenCL')
def clEnqueueNDRangeKernel (command_queue : cl_command_queue, kernel : cl_kernel, work_dim : cl_uint, global_work_offset : POINTER(c_size_t), global_work_size : POINTER(c_size_t), local_work_size : POINTER(c_size_t), num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...

View file

@ -0,0 +1,37 @@
"""
Minimal OpenCL 1.2 low level ctypes API.
"""
from .OpenCL import (CL_DEVICE_EXTENSIONS, CL_DEVICE_GLOBAL_MEM_SIZE,
CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
CL_DEVICE_MAX_WORK_GROUP_SIZE, CL_DEVICE_NAME,
CL_DEVICE_TYPE, CL_DEVICE_TYPE_ACCELERATOR,
CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU,
CL_DEVICE_TYPE_DEFAULT, CL_DEVICE_TYPE_GPU,
CL_DEVICE_VENDOR_ID, CL_DEVICE_VERSION, CL_DRIVER_VERSION,
CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR,
CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_SIZE,
CL_MEM_USE_HOST_PTR, CL_MEM_WRITE_ONLY,
CL_PLATFORM_EXTENSIONS, CL_PLATFORM_NAME,
CL_PLATFORM_PROFILE, CL_PLATFORM_VENDOR,
CL_PLATFORM_VERSION, CL_PROGRAM_BUILD_LOG,
CL_PROGRAM_BUILD_OPTIONS, CL_PROGRAM_BUILD_STATUS,
CLERROR, CLRESULT, byref, c_char, c_char_p, c_size_t,
c_void_p, cl_bitfield, cl_bool, cl_char, cl_command_queue,
cl_command_queue_properties, cl_context,
cl_context_properties, cl_device_id, cl_device_info,
cl_device_type, cl_double, cl_event, cl_float, cl_half,
cl_int, cl_kernel, cl_long, cl_mem, cl_mem_info,
cl_platform_id, cl_platform_info, cl_program,
cl_program_build_info, cl_short, cl_uchar, cl_uint,
cl_ulong, cl_ushort, clBuildProgram, clCreateBuffer,
clCreateCommandQueue, clCreateContext,
clCreateKernelsInProgram, clCreateProgramWithSource,
clEnqueueCopyBuffer, clEnqueueFillBuffer,
clEnqueueNDRangeKernel, clEnqueueReadBuffer,
clEnqueueWriteBuffer, clFinish, clGetDeviceIDs,
clGetDeviceInfo, clGetMemObjectInfo, clGetPlatformIDs,
clGetPlatformInfo, clGetProgramBuildInfo,
clReleaseCommandQueue, clReleaseContext, clReleaseEvent,
clReleaseKernel, clReleaseMemObject, clReleaseProgram,
clSetKernelArg, clWaitForEvents, create_string_buffer,
ctypes, sizeof)

View file

@ -0,0 +1,6 @@
from .Buffer import Buffer
from .Device import (Device, cleanup_devices, get_available_devices_info,
get_best_device, get_default_device, get_device,
set_default_device)
from .DeviceInfo import DeviceInfo
from .Kernel import Kernel