add xlib.avecl

This commit is contained in:
iperov 2021-09-30 18:21:30 +04:00
commit 0058474da7
56 changed files with 5569 additions and 0 deletions

31
xlib/avecl/__init__.py Normal file
View file

@ -0,0 +1,31 @@
"""
AveCL ! Make OpenCL great again.
Lightweight ndarray library using OpenCL 1.2 written in pure python.
Applicable for high-performance general purpose n-dim array computations for every device that supports OpenCL 1.2.
Works in python 3.5+. Dependencies: numpy.
This lib uses relative import, thus you can place it in any subfolder.
made by @iperov from scratch.
"""
from ._internal.AAxes import AAxes
from ._internal.AShape import AShape
from ._internal.backend import (Device, DeviceInfo, Kernel,
get_available_devices_info, get_best_device,
get_default_device, get_device,
set_default_device)
from ._internal.HArgs import HArgs
from ._internal.HKernel import HKernel
from ._internal.HTensor import HTensor
from ._internal.HType import HType
from ._internal.initializer import (InitCoords2DArange, Initializer,
InitRandomUniform)
from ._internal.NCore import NCore
from ._internal.NTest import NTest
from ._internal.op import *
from ._internal.SCacheton import SCacheton
from ._internal.Tensor import Tensor
from ._internal.TensorImpl import *

View file

@ -0,0 +1,169 @@
from collections import Iterable
class AAxes(Iterable):
__slots__ = ['axes','ndim','_inversed']
def __init__(self, axes, shape_ndim=None):
"""
Constructs AAxes from user argument
arguments
axes AAxes
Int
Iterable of ints
None
shape_ndim(None) provide shape_ndim if axes contain negative values
can raise an errors during the construction
AAxes supports:
A+B : concat A_axes with B_axes
A-B : removes B_axes from A_axes
"""
if isinstance(axes, AAxes):
self.axes = axes.axes
self.ndim = axes.ndim
self._inversed = axes._inversed
elif axes is None:
self.axes = None
self.ndim = None
self._inversed = None
else:
if not isinstance(axes, Iterable):
axes = (axes,)
if isinstance(axes, Iterable):
valid_axes = []
for x in axes:
if x is None:
raise ValueError(f'Incorrent value {x} in axes {axes}')
x = int(x)
if x < 0:
if shape_ndim is None:
raise ValueError(f'Incorrent value {x} in axes {axes}, or provide shape_ndim')
x = shape_ndim + x
if x in valid_axes:
raise ValueError(f'Axes must contain unique values.')
valid_axes.append(x)
self.axes = tuple(valid_axes)
self.ndim = len(self.axes)
self._inversed = None
def is_none_axes(self):
"""
returns True if AAxes is constructed with (None) argument, i.e. all-axes
"""
return self.axes is None
def sorted(self) -> 'AAxes':
"""
returns sorted AAxes
"""
return AAxes(sorted(self.axes))
def swapped_axes(self, axis_a, axis_b) -> 'AAxes':
x = list(self.axes)
if axis_a < 0:
axis_a = len(x) + axis_a
if axis_b < 0:
axis_b = len(x) + axis_b
x[axis_b], x[axis_a] = x[axis_a], x[axis_b]
return AAxes( tuple(x) )
def inversed(self) -> 'AAxes':
"""
Returns inversed axes order
Example:
for (0,2,3,1) returns (0,3,1,2)
"""
if self.is_none_axes():
raise Exception(f'none-axes does not support inversed(). Handle none-axes by calling .is_none_axes()')
if self._inversed is None:
x = { axis:i for i,axis in enumerate(self.axes) }
t = []
for i in range(self.ndim):
axis = x.get(i, None)
if axis is None:
raise Exception(f'axes {self.axes} are inconsistent to do inverse order.')
t.append(axis)
self._inversed = AAxes(t)
return self._inversed
def __hash__(self): return self.axes.__hash__()
def __eq__(self, other):
if isinstance(other, AAxes):
return self.axes == other.axes
elif isinstance(other, Iterable):
return self.axes == tuple(other)
return False
def __iter__(self):
if self.is_none_axes():
raise Exception(f'none-axes does not support iteration. Handle none-axes by calling .is_none_axes()')
return self.axes.__iter__()
def __len__(self): return self.ndim
def __getitem__(self,key):
if self.is_none_axes():
raise Exception(f'none-axes does not support indexing. Handle none-axes by calling .is_none_axes()')
elif isinstance(key, slice):
return AAxes(self.axes[key])
return self.axes[key]
def __radd__(self, o):
if isinstance(o, Iterable):
return AAxes( tuple(o) + self.axes)
else:
raise ValueError(f'unable to use type {o.__class__} in AAxes append')
def __add__(self, o):
if isinstance(o, Iterable):
return AAxes( self.axes + tuple(o) )
else:
raise ValueError(f'unable to use type {o.__class__} in AAxes append')
def __rsub__(self, o):
if isinstance(o, Iterable):
new_axes = []
for axis in o:
if axis not in self.axes:
new_axes.append(axis)
return AAxes(new_axes)
else:
raise ValueError(f'unable to use type {o.__class__} in AAxes substraction')
def __sub__(self, o):
if isinstance(o, Iterable):
new_axes = []
o_axes = tuple(o)
for axis in self.axes:
if axis not in o_axes:
new_axes.append(axis)
return AAxes(new_axes)
else:
raise ValueError(f'unable to use type {o.__class__} in AAxes substraction')
def __str__(self):
if self.is_none_axes():
return '(None)'
return str(self.axes)
def __repr__(self): return 'AAxes' + self.__str__()
__all__ = ['AAxes']

View file

@ -0,0 +1,103 @@
from collections import Iterable
from .AAxes import AAxes
class AShape(Iterable):
__slots__ = ['shape','size','ndim']
def __init__(self, shape):
"""
Constructs valid shape from user argument
arguments
shape AShape
Iterable
can raise ValueError during the construction
"""
if isinstance(shape, AShape):
self.shape = shape.shape
self.size = shape.size
self.ndim = shape.ndim
else:
if isinstance(shape, (int,float) ):
shape = (int(shape),)
if isinstance(shape, Iterable):
size = 1
valid_shape = []
for x in shape:
if x is None:
raise ValueError(f'Incorrent value {x} in shape {shape}')
x = int(x)
if x < 1:
raise ValueError(f'Incorrent value {x} in shape {shape}')
valid_shape.append(x)
size *= x # Faster than np.prod()
self.shape = tuple(valid_shape)
self.ndim = len(self.shape)
if self.ndim == 0:
# Force (1,) shape for scalar shape
self.ndim = 1
self.shape = (1,)
self.size = size
else:
raise ValueError('Invalid type to create AShape')
def axes_arange(self) -> AAxes:
"""
Returns tuple of axes arange.
Example (0,1,2) for ndim 3
"""
return AAxes(range(self.ndim))
def transpose_by_axes(self, axes) -> 'AShape':
"""
Same as AShape[axes]
Returns AShape transposed by axes.
axes AAxes
Iterable(list,tuple,set,generator)
"""
return AShape(self.shape[axis] for axis in AAxes(axes) )
def __hash__(self): return self.shape.__hash__()
def __eq__(self, other):
if isinstance(other, AShape):
return self.shape == other.shape
elif isinstance(other, Iterable):
return self.shape == tuple(other)
return False
def __iter__(self): return self.shape.__iter__()
def __len__(self): return len(self.shape)
def __getitem__(self,key):
if isinstance(key, Iterable):
if isinstance(key, AAxes):
if key.is_none_axes():
return self
return self.transpose_by_axes(key)
elif isinstance(key, slice):
return AShape(self.shape[key])
return self.shape[key]
def __radd__(self, o):
if isinstance(o, Iterable):
return AShape( tuple(o) + self.shape)
else:
raise ValueError(f'unable to use type {o.__class__} in AShape append')
def __add__(self, o):
if isinstance(o, Iterable):
return AShape( self.shape + tuple(o) )
else:
raise ValueError(f'unable to use type {o.__class__} in AShape append')
def __str__(self): return str(self.shape)
def __repr__(self): return 'AShape' + self.__str__()
__all__ = ['AShape']

View file

@ -0,0 +1,106 @@
from typing import List
import numpy as np
from .backend import Device
from .HTensor import HTensor
from .HType import HType
from .Tensor import Tensor
from .AShape import AShape
class HArgs:
"""
Helper functions for list of arguments
"""
@staticmethod
def decompose(args):
"""
decompose list of args of Tensor and supported numeric values
returns ( shape_list, # if scalar value -> shape is None
dtype_list, #
kernel_args_list #
)
"""
shape_list = []
dtype_list = []
kernel_args_list = []
for arg in args:
if isinstance(arg, Tensor):
shape_list.append(arg.shape)
dtype_list.append(arg.dtype)
kernel_args_list.append(arg.get_buffer())
else:
if isinstance(arg, int):
dtype, arg = np.int32, np.int32(arg)
elif isinstance(arg, float):
dtype, arg = np.float32, np.float32(arg)
elif HType.is_obj_of_np_scalar_type(arg):
dtype = arg.__class__
else:
raise ValueError(f'Unsupported type of arg: {arg.__class__} Use Tensor or number type.')
shape_list.append(None)
dtype_list.append(dtype)
kernel_args_list.append(arg)
return tuple(shape_list), tuple(dtype_list), tuple(kernel_args_list)
@staticmethod
def get_shapes(args : List[Tensor]) -> List[AShape]:
"""
"""
return tuple(t.shape for t in args)
@staticmethod
def check_zero_get_length(args) -> int:
"""
raises an error if len(args) == 0, otherwise returns len
"""
args_len = len(args)
if len(args) == 0:
raise ValueError('args must be specified')
return args_len
@staticmethod
def check_get_same_device(args : List[Tensor]) -> Device:
"""
check all device of tensors are the same and return the device
"""
result = HTensor.all_same_device(args)
if not result:
raise ValueError('all Tensors must have the same device')
return args[0].get_device()
@staticmethod
def check_all_tensors(args : List[Tensor]):
"""
"""
if not all (isinstance(tensor, Tensor) for tensor in args):
raise ValueError('All values must have type of Tensor')
@staticmethod
def check_get_same_shape(args : List[Tensor]) -> AShape:
"""
check all shapes of tensors are the same and return the shape
"""
shape = args[0].shape
if not all (t.shape == shape for t in args):
raise ValueError('All tensors must have the same shape')
return shape
@staticmethod
def filter_tensor(args, raise_on_empty : bool):
"""
get only tensors from the list
"""
tensor_args = [arg for arg in args if isinstance(arg, Tensor) ]
if raise_on_empty and len(tensor_args) == 0:
raise ValueError('At least one arg must be a Tensor')
return tensor_args
__all__ = ['HArgs']

View file

@ -0,0 +1,496 @@
import numpy as np
class HKernel:
"""
Helper functions for Kernels
"""
_np_dtype_to_cl = { np.bool_ : 'bool',
np.int8 : 'char',
np.uint8 : 'uchar',
np.int16 : 'short',
np.uint16 : 'ushort',
np.int32 : 'int',
np.uint32 : 'uint',
np.int64 : 'long',
np.uint64 : 'ulong',
np.float16 : 'half',
np.float32 : 'float',
np.float64 : 'double'
}
@staticmethod
def np_dtype_to_cl(dtype : np.dtype):
"""
returns string opencl type from numpy dtype
example np.float32 -> 'float'
np.uint8 -> 'unsigned char'
"""
return HKernel._np_dtype_to_cl[np.dtype(dtype).type]
@staticmethod
def define_scalar_func_arg(name, dtype : np.dtype):
"""
"""
return f'{HKernel._np_dtype_to_cl[np.dtype(dtype).type]} {name}'
@staticmethod
def define_tensor_type(name, dtype : np.dtype):
"""
Returns a definitions for operations with tensor
example for 'O', np.float16:
#define O_PTR_NAME p_O
#define O_PTR_TYPE half
#define O_PTR_TYPE2 half2
#define O_PTR_TYPE3 half3
#define O_PTR_TYPE4 half4
#define O_PTR_TYPE8 half8
#define O_PTR_TYPE16 half16
#define O_TYPE float
#define O_TYPE2 float2
#define O_TYPE3 float3
#define O_TYPE4 float4
#define O_TYPE8 float8
#define O_TYPE16 float16
#define O_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_TO_FLOATX(x) ((float)x)
"""
name_upper = name.upper()
dtype = np.dtype(dtype).type
out = [f'#define {name.upper()}_PTR_NAME p_{name.upper()}']
if dtype == np.float16:
out += [f'#define {name_upper}_PTR_TYPE half']
out += [f'#define {name_upper}_PTR_TYPE2 half2']
out += [f'#define {name_upper}_PTR_TYPE3 half3']
out += [f'#define {name_upper}_PTR_TYPE4 half4']
out += [f'#define {name_upper}_PTR_TYPE8 half8']
out += [f'#define {name_upper}_PTR_TYPE16 half16']
out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(np.float32)}']
out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(np.float32)}2']
out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(np.float32)}3']
out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(np.float32)}4']
out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(np.float32)}8']
out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(np.float32)}16']
out += [f'#define {name_upper}_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
else:
out += [f'#define {name_upper}_PTR_TYPE {HKernel.np_dtype_to_cl(dtype)}']
out += [f'#define {name_upper}_PTR_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2']
out += [f'#define {name_upper}_PTR_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3']
out += [f'#define {name_upper}_PTR_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4']
out += [f'#define {name_upper}_PTR_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8']
out += [f'#define {name_upper}_PTR_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(dtype)}']
out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2']
out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3']
out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4']
out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8']
out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
out += [f'#define {name_upper}_GLOBAL_LOAD(offset) {name_upper}_PTR_NAME[(offset)]']
out += [f'#define {name_upper}_GLOBAL_LOAD2(offset) {name_upper}_PTR_NAME[(offset)]']
out += [f'#define {name_upper}_GLOBAL_LOAD3(offset) {name_upper}_PTR_NAME[(offset)]']
out += [f'#define {name_upper}_GLOBAL_LOAD4(offset) {name_upper}_PTR_NAME[(offset)]']
out += [f'#define {name_upper}_GLOBAL_LOAD8(offset) {name_upper}_PTR_NAME[(offset)]']
out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) {name_upper}_PTR_NAME[(offset)]']
out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
if dtype in [np.float32, np.float64]:
out += [f'#define {name_upper}_TO_FLOATX(x) x']
elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
elif dtype in [np.int64,np.uint64]:
out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)']
return '\n'.join(out)
@staticmethod
def define_tensor_shape(name, shape, axes_symbols=None):
"""
Returns a definitions for operations with tensor
example for 'O', (7,3),
#define O0 7
#define O1 3
#define Om1 3
#define Om2 7
#define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
#define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
"""
shape = tuple(shape)
ndim = len(shape)
name_upper = name.upper()
name_lower = name.lower()
if axes_symbols is None:
axes_symbols = "".join([str(i) for i in range(ndim)])
axes_symbols = axes_symbols.upper()
out = []
for i in range(ndim):
out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}']
for i in range(1,ndim+1):
out += [f'#define {name_upper}m{i} {shape[-i]}']
line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) '
for i in range(ndim):
if i == 0:
line += f'( (size_t)({name_lower}{i}) )'
else:
line += f'( {name_lower}{i} )'
for j in range(i+1,ndim):
line += f'*{shape[j]} '
if i != ndim-1:
line += '+'
out += [line]
line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) '
for i in range(ndim):
if i == 0:
line += f'( (size_t)({name_lower}{i}) % {shape[i]} )'
else:
line += f'( ({name_lower}{i}) % {shape[i]} )'
for j in range(i+1,ndim):
line += f'*{shape[j]} '
if i != ndim-1:
line += '+'
out += [line,'']
return '\n'.join(out)
@staticmethod
def define_tensor(name, shape, dtype : np.dtype, axes_symbols=None):
"""
Returns a definitions for operations with tensor
arguments
name text
shape Iterable
dtype np.dtype
axes_symbols(None) string of symbols.
None -> numeric symbols will be used
example for 'O', (2,4), np.float16
#define O_PTR_NAME p_O
#define O_PTR_TYPE half
#define O_PTR_TYPE2 half2
#define O_PTR_TYPE3 half3
#define O_PTR_TYPE4 half4
#define O_PTR_TYPE8 half8
#define O_PTR_TYPE16 half16
#define O_TYPE float
#define O_TYPE2 float2
#define O_TYPE3 float3
#define O_TYPE4 float4
#define O_TYPE8 float8
#define O_TYPE16 float16
#define O_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
#define O_TO_FLOATX(x) ((float)x)
#define O0 2
#define O1 4
#define Om1 4
#define Om2 2
#define O_IDX(o0,o1) ( (size_t)(o0) )*4 +( o1 )
#define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 2 )*4 +( (o1) % 4 )
"""
return'\n'.join([ HKernel.define_tensor_type(name, dtype),
HKernel.define_tensor_shape(name, shape, axes_symbols)
])
@staticmethod
def define_axes_sizes(axis_letter, axes_sizes):
"""
Returns definitions of axes sizes
example for 'O', (4,512,512)
#define O0 4
#define O1 512
#define O2 512
"""
out = ""
axes_sizes = tuple(axes_sizes)
ndim = len(axes_sizes)
for i in range(ndim):
out += f'#define {axis_letter.upper()}{i} {axes_sizes[i]}\n'
return out
@staticmethod
def decompose_idx_to_axes_idxs(var_name, tensor_name, ndim):
"""
decompose a size_t variable to axes indexes.
Keeps original variable untouched.
Example for 'gid','O',2
size_t gid_original = gid;
size_t o1 = gid % O1; gid /= O1;
#define om1 o1
size_t o0 = gid % O0;
#define om2 o0
gid = gid_original;
"""
name_lower = tensor_name.lower()
name_upper = tensor_name.upper()
out = [f'size_t {var_name}_original = {var_name};']
for i in range(ndim-1,-1,-1):
line = f'size_t {name_lower}{i} = {var_name} % {name_upper}{i};'
if i > 0:
line += f' {var_name} /= {name_upper}{i};'
out += [line]
out += [f'#define {name_lower}m{ndim-i} {name_lower}{i}']
out += [f'{var_name} = {var_name}_original;']
return '\n'.join(out)
@staticmethod
def axes_order_enum(tensor_name, axes_order):
"""
returns axis enumeration with given order
Example
('I', (1,2,0)) returns 'i1,i2,i0'
('I', 'HW') return 'ih,iw'
"""
if isinstance(axes_order, str):
axes_order = axes_order.lower()
else:
axes_order = tuple(axes_order)
name_lower = tensor_name.lower()
return ','.join( [ f'{name_lower}{axes_order[axis]}' for axis in range(len(axes_order)) ])
@staticmethod
def axes_seq_enum(tensor_name, ndim, new_axis=None, zero_axes=None, suffix=None):
"""
returns axis sequental enumeration with given ndim
Example
('I', 4) returns 'i0,i1,i2,i3'
('I', 4, new_axis=('name',1) ) returns 'i0,name,i1,i2,i3'
('I', 3, zero_axes=(1,) ) returns 'i0,0,i2'
('I', 2, suffix='ih,iw' ) returns 'i0,i1,ih,iw'
"""
name_lower = tensor_name.lower()
if zero_axes is not None:
axes = [ '0' if axis in zero_axes else f'{name_lower}{axis}' for axis in range(ndim) ]
else:
axes = [ f'{name_lower}{axis}' for axis in range(ndim) ]
if suffix is None:
suffix = []
else:
suffix = [suffix]
if new_axis is not None:
name, axis = new_axis
return','.join(axes[:axis] + [name] + axes[axis:] + suffix)
else:
return ','.join(axes+ suffix)
@staticmethod
def include_constants_pi():
"""
defines PI constants
PI_F
PI_2_F
PI_4_F
"""
return f"""
#define PI_F 3.14159274101257f
#define PI_2_F 1.57079637050629f
#define PI_4_F 0.78539818525314f
"""
@staticmethod
def include_hash():
"""
returns hash functions:
uint hash_uint_uint(uint v)
ulong hash_ulong_from_ulong(ulong x)
float hash_float_from_uint(uint x)
double hash_double_from_ulong(ulong x)
"""
return f"""
#define UIF (1.0 / (float)(0xffffffffU))
//from Chris Wellons https://nullprogram.com/blog/2018/07/31/ https://www.shadertoy.com/view/WttXWX
uint hash_uint_from_uint(uint x)
{{
x ^= x >> 17;
x *= 0xed5ad4bbU;
x ^= x >> 11;
x *= 0xac4c1b51U;
x ^= x >> 15;
x *= 0x31848babU;
x ^= x >> 14;
return x;
}}
ulong hash_ulong_from_ulong(ulong x)
{{
x ^= x >> 32;
x *= 0xd6e8feb86659fd93U;
x ^= x >> 32;
x *= 0xd6e8feb86659fd93U;
x ^= x >> 32;
return x;
}}
float hash_float_from_uint(uint x)
{{
return hash_uint_from_uint(x) / (float)(0xffffffffU);
}}
double hash_double_from_ulong(ulong x)
{{
return (double)hash_ulong_from_ulong(x) / (double)(0xffffffffffffffffU);
}}
/*****************************
UNUSED CODE
//---------- PCG hashes from https://www.shadertoy.com/view/XlGcRh
uint hash_uint_uint(uint v)
{{
uint state = v * 747796405u + 2891336453u;
uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
return (word >> 22u) ^ word;
}}
uint2 hash_uint2_uint2 (uint2 v)
{{
v = v * 1664525u + 1013904223u;
v.x += v.y * 1664525u;
v.y += v.x * 1664525u;
v ^= v>>16u;
v.x += v.y * 1664525u;
v.y += v.x * 1664525u;
v ^= v>>16u;
return v;
}}
uint3 hash_uint3_uint3(uint3 v)
{{
v = v * 1664525u + 1013904223u;
v.x += v.y*v.z;
v.y += v.z*v.x;
v.z += v.x*v.y;
v ^= v >> 16u;
v.x += v.y*v.z;
v.y += v.z*v.x;
v.z += v.x*v.y;
return v;
}}
float hash_float_uint(uint v)
{{
return (float)( hash_uint_uint(v) ) * UIF;
}}
float2 hash_float2_uint (uint v)
{{
uint2 q = hash_uint2_uint2( (uint2)(v, 1) );
return (float2)(q.x, q.y) * UIF;
}}
float3 hash_float3_uint (uint v)
{{
uint3 q = hash_uint3_uint3( (uint3)(v, 1, 1) );
return (float3)(q.x, q.y, q.z) * UIF;
}}
//---------- Classic hashes used in shaders
float hash_float_float(float p)
{{
float x = sin(p*12.9898)*43758.5453;
return x - floor(x);
}}
float hash_float_float2(float2 p)
{{
float x = sin( dot(p, (float2)(12.9898, 78.233)) )*43758.5453;
return x - floor(x);
}}
****************************/
"""
__all__ = ['HKernel']

View file

@ -0,0 +1,22 @@
from typing import List, Union
from .Tensor import *
class HTensor:
"""
Helper functions for Tensor
"""
@staticmethod
def all_same_device( tensor_or_list : Union[Tensor, List[Tensor] ] ) -> bool:
"""
check if all tensors in a list use the same device
"""
if not isinstance(tensor_or_list, (list,tuple)):
tensor_or_list = (tensor_or_list,)
device = tensor_or_list[0].get_device()
return all( device == tensor.get_device() for tensor in tensor_or_list )
__all__ = ['HTensor']

View file

@ -0,0 +1,82 @@
import operator
from typing import Iterable, List
import numpy as np
scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
np.float16, np.float32, np.float64, np.bool_]
np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
np.float16, np.float32, np.float64, np.bool_]
_np_dtype_to_cl = {
np.bool_ : 'bool',
np.uint8 : 'uchar',
np.int8 : 'char',
np.uint16 : 'ushort',
np.int16 : 'short',
np.uint32 : 'uint',
np.int32 : 'int',
np.uint64 : 'ulong',
np.int64 : 'long',
np.float16 : 'half',
np.float32 : 'float',
np.float64 : 'double',
}
_np_dtype_weight = {
np.bool_ : 1,
np.uint8 : 2,
np.int8 : 3,
np.uint16 : 4,
np.int16 : 5,
np.uint32 : 6,
np.int32 : 7,
np.uint64 : 8,
np.int64 : 9,
np.float16 : 10,
np.float32 : 11,
np.float64 : 12,
}
class HType:
"""
Helper functions for types.
"""
def is_scalar_type(value):
return value.__class__ in scalar_types
def get_np_scalar_types() -> List:
return np_scalar_types
def is_obj_of_np_scalar_type(obj) -> bool:
return obj.__class__ in np_scalar_types
def np_dtype_to_cl(dtype : np.dtype) -> str:
return _np_dtype_to_cl[ np.dtype(dtype).type ]
def get_most_weighted_dtype( dtype_list ):
dtype_list = [ np.dtype(dtype) for dtype in dtype_list ]
dtype_list = [ (_np_dtype_weight[dtype.type], dtype) for dtype in dtype_list ]
dtype_list = sorted(dtype_list, key=operator.itemgetter(0), reverse=True)
return dtype_list[0][1]
def hashable_slices(slices):
"""
Convert list of slice to hashable arg.
"""
if not isinstance(slices, Iterable):
slices = (slices,)
normalized_slices = []
for x in slices:
if isinstance(x, slice):
normalized_slices.append( (x.start, x.stop, x.step) )
elif x is Ellipsis or x is None:
normalized_slices.append(x)
else:
normalized_slices.append(int(x))
return tuple(normalized_slices)
__all__ = ['HType']

View file

@ -0,0 +1,24 @@
from .SCacheton import SCacheton
from .Tensor import Tensor
from .backend import *
class NCore:
"""
core functions
"""
@staticmethod
def cleanup():
"""
try to cleanup all resources consumed by TensorCL.
can raise Exception
"""
SCacheton.cleanup()
if Tensor._object_count != 0:
raise Exception(f'Unable to cleanup while {Tensor._object_count} Tensor objects exist.')
cleanup_devices()
__all__ = ['NCore']

View file

@ -0,0 +1,754 @@
import traceback
import numpy as np
from .HType import HType
from .NCore import NCore
from .backend import get_device, get_default_device, set_default_device
from .Tensor import Tensor
from . import op
from .initializer import InitRandomUniform, InitCoords2DArange
from .info import Conv2DInfo
class NTest():
def test_all():
NCore.cleanup()
prev_device = get_default_device()
device = get_device(0)
print(f'Using {device.get_description()}')
set_default_device(device)
test_funcs = [
InitRandomUniform_test,
InitCoords2DArange_test,
cast_test,
transpose_test,
pad_test,
concat_test,
tile_test,
stack_test,
slice_test,
slice_set_test,
reduce_test,
matmul_test,
any_wise_op_test,
depthwise_conv2d_test,
remap_np_affine_test,
remap_test,
warp_affine_test,
gaussian_blur_test,
binary_erode_circle_test,
binary_dilate_circle_test,
binary_morph_test,
]
for test_func in test_funcs:
print(f'{test_func.__name__}()')
test_func()
device.cleanup()
device.print_stat()
NCore.cleanup()
set_default_device(prev_device)
print('Done.')
def _all_close(x,y, atol=1, btol=1):
return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol )
def cast_test():
for in_dtype in HType.get_np_scalar_types():
for out_dtype in HType.get_np_scalar_types():
shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) )
print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='')
val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype)
cast_n = val_n.astype(out_dtype)
val_t = Tensor.from_value(val_n)
cast_t = op.cast(val_t, out_dtype)
if not _all_close(cast_t.np(), cast_n):
raise Exception(f'data is not equal')
print('pass')
def binary_morph_test():
for shape_len in range(2,4):
for dtype in HType.get_np_scalar_types():
shape = np.random.randint( 1, 64, size=(shape_len,) )
erode_dilate = np.random.randint( -16, 16 )
blur = np.random.rand()*16 - 8
input_n = np.random.randint( 2, size=shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='')
op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True)
print('pass')
def binary_erode_circle_test():
for shape_len in range(2,4):
for dtype in HType.get_np_scalar_types():
shape = np.random.randint( 1, 64, size=(shape_len,) )
radius = np.random.randint( 1, 16 )
iterations = np.random.randint( 1, 4 )
input_n = np.random.randint( 2, size=shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
op.binary_erode_circle(input_t, radius=radius, iterations=iterations)
print('pass')
def binary_dilate_circle_test():
for shape_len in range(2,4):
for dtype in HType.get_np_scalar_types():
shape = np.random.randint( 1, 64, size=(shape_len,) )
radius = np.random.randint( 1, 16 )
iterations = np.random.randint( 1, 4 )
input_n = np.random.randint( 2, size=shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
op.binary_dilate_circle(input_t, radius=radius, iterations=iterations)
print('pass')
def gaussian_blur_test():
for shape_len in range(2,5):
for dtype in [np.float16, np.float32, np.float64]:
shape = np.random.randint( 1, 64, size=(shape_len,) )
sigma = np.random.rand() * 10
print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='')
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
val_t = Tensor.from_value(val_n)
op.gaussian_blur(val_t, sigma)
print('pass')
def pad_test():
for iteration in range(1):
for shape_len in range(5,1,-1):
for mode in ['constant']:
for dtype in HType.get_np_scalar_types():
while True:
shape = np.random.randint( 1, 8, size=(shape_len,) )
paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) )
print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='')
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
pad_n = np.pad(val_n, paddings, mode=mode)
val_t = Tensor.from_value(val_n)
pad_t = op.pad(val_t, paddings, mode=mode)
print(f'{pad_n.shape} == {pad_t.shape} ... ', end='')
if pad_n.shape != pad_t.shape:
raise Exception(f'shape is not equal')
if not _all_close(pad_t.np(), pad_n):
raise Exception(f'data is not equal')
print('pass')
break
def slice_set_test():
for iteration in [0,1]:
for shape_len in range(5,1,-1):
for dtype in HType.get_np_scalar_types():
while True:
shape = np.random.randint( 1, 8, size=(shape_len,) )
if iteration == 0:
slices = [ slice(None,None,None), ] * shape_len
axis = np.random.randint(shape_len)
shape[axis] = 1
slices[axis] = 0
else:
slices = []
for i in range (shape_len):
axis_size = shape[i]
b = np.random.randint(axis_size)
e = np.random.randint(axis_size)
if b == e:
slices.append(b)
else:
if b < e:
s = 1
if b == 0:
b = None
if e == axis_size-1:
e = None
else:
s = -1
if b == axis_size-1:
b = None
if e == 0:
e = None
slices.append ( slice(b,e,s) )
if np.random.randint(2) == 0:
axis = np.random.randint(shape_len)
slices[axis] = Ellipsis
shape = tuple(shape)
slices = tuple(slices)
print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
val_t = Tensor.from_value(val_n)
sliced_n = val_n[slices]
v = [0] if sliced_n.ndim > 0 else 0
val_n[slices] = v
val_t[slices] = v
if not _all_close(val_t.np(), val_n):
raise Exception(f'data is not equal')
print('pass')
break
def depthwise_conv2d_test():
def _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=1, DILATION=1, padding='same', dtype=np.float32):
N, IC, IH, IW = input_n.shape
KI, KH, KW = kernel_n.shape
ci = Conv2DInfo(IH, IW, KH, KW, STRIDE, DILATION, padding)
PADL, PADT = ci.PADL, ci.PADT
OC, OH, OW = IC, ci.OH, ci.OW
O_IK_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(OH*OW) }
K_IO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(KH*KW) }
I_KO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(IH*IW) }
for ow in range(OW):
for oh in range(OH):
O_idx = oh*OW + ow
for kh in range(KH):
for kw in range(KW):
iw = -PADL + kw*DILATION + ow*STRIDE
ih = -PADT + kh*DILATION + oh*STRIDE
if (iw >= 0) & (ih >= 0) & (iw < IW) & (ih < IH):
O_IK_idxs[O_idx][0][0].append (ih)
O_IK_idxs[O_idx][0][1].append (iw)
O_IK_idxs[O_idx][1][0].append (kh)
O_IK_idxs[O_idx][1][1].append (kw)
K_idx = kh*KW + kw
K_IO_idxs[K_idx][0][0].append (ih)
K_IO_idxs[K_idx][0][1].append (iw)
K_IO_idxs[K_idx][1][0].append (oh)
K_IO_idxs[K_idx][1][1].append (ow)
I_idx = ih*IW + iw
I_KO_idxs[I_idx][0][0].append (kh)
I_KO_idxs[I_idx][0][1].append (kw)
I_KO_idxs[I_idx][1][0].append (oh)
I_KO_idxs[I_idx][1][1].append (ow)
output_shape = (N, OC, OH, OW)
output = np.empty( output_shape, dtype)
for n in range(N):
for oc in range(OC):
for oh in range(OH):
for ow in range(OW):
O_idx = oh*OW + ow
I_idxs = O_IK_idxs[O_idx][0]
K_idxs = O_IK_idxs[O_idx][1]
v = ( input_n[ n,oc][..., I_idxs[0], I_idxs[1]] *
kernel_n [oc][..., K_idxs[0], K_idxs[1]] ).sum()
output[n,oc,oh,ow] = v
return output
for padding in ['same','valid',2]:
for dilation in [1,2]:
for stride in [1,2]:
for ks in [1,3]:
for n in [1,4]:
for ic in [1,4]:
for ih,iw in zip(*[[4,16]]*2):
if padding == 'valid' and iw < ks:
continue
for dtype in [np.int16, np.float16, np.float32]:
input_shape = (n, ic, ih, iw)
kernel_shape = (ic, ks, ks)
print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='')
input_n = np.random.randint( 64, size=input_shape ).astype(dtype)
kernel_n = np.ones(shape=kernel_shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
kernel_t = Tensor.from_value(kernel_n)
conved_t = op.depthwise_conv2D(input_t, kernel_t, stride=stride, dilation=dilation, padding=padding)
conved_n = _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=stride, DILATION=dilation, padding=padding, dtype=dtype)
if conved_n.shape != conved_t.shape:
raise Exception(f'shape is not equal')
if not all ( np.ndarray.flatten( conved_t.np() == conved_n) ):
raise Exception(f'data is not equal')
print('pass')
def warp_affine_test():
for dtype in HType.get_np_scalar_types():
if dtype == np.bool_:
continue
H = np.random.randint(8, 64)
W = np.random.randint(8, 64)
print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
affine_t = Tensor.from_value ( [[1,0,0],
[0,1,0]], dtype)
result_t = op.warp_affine(input_t, affine_t)
if not _all_close(input_t.np(), result_t.np() ):
raise Exception(f'data is not equal')
print('pass')
def remap_np_affine_test():
for dtype in HType.get_np_scalar_types():
if dtype == np.bool_:
continue
H = np.random.randint(8, 64)
W = np.random.randint(8, 64)
print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
affine_n = np.array ( [[1,0,0],
[0,1,0]], dtype)
result_t = op.remap_np_affine(input_t, affine_n)
if not _all_close(input_t.np(), result_t.np() ):
raise Exception(f'data is not equal')
print('pass')
def remap_test():
for dtype in HType.get_np_scalar_types():
if dtype == np.bool_:
continue
H = np.random.randint(8, 64)
W = np.random.randint(8, 64)
print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
coords_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) )
result_t = op.remap(input_t, coords_t)
if not _all_close(input_t.np(), result_t.np() ):
raise Exception(f'data is not equal')
print('pass')
def tile_test():
for _ in range(3):
for shape_len in range(3, 5):
for dtype in HType.get_np_scalar_types():
shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1)
print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='')
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
tiled_n = np.tile(val_n, tiles)
val_t = Tensor.from_value(val_n)
tiled_t = op.tile(val_t, tiles)
print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='')
if tiled_n.shape != tiled_t.shape:
raise Exception(f'shape is not equal')
if not _all_close(tiled_t.np(), tiled_n):
raise Exception(f'data is not equal')
print('pass')
def stack_test():
for _ in range(3):
for shape_len in range(1, 4):
for dtype in HType.get_np_scalar_types():
shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
axis = np.random.randint(shape_len+1)
stack_count = np.random.randint(4)+1
print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='')
vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ]
stack_n = np.stack(vals_n, axis)
vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ]
stack_t = op.stack(vals_t, axis)
print(f'{stack_n.shape} == {stack_t.shape} ... ', end='')
if stack_n.shape != stack_t.shape:
raise Exception('shape is not equal')
if not _all_close(stack_t.np(), stack_n):
raise Exception(f'data is not equal')
print('pass')
def reduce_test():
for op_type in ['sum', 'mean', 'min', 'max']:
for dtype in HType.get_np_scalar_types():
if dtype != np.bool_:
for shape_len in range(2, 5):
shape = np.random.randint( 8, size=(shape_len,) )+1
reduction_axes = np.array([*range(shape_len)])
np.random.shuffle(reduction_axes)
# Cut random amount of reduction_axes
reduction_axes = tuple(reduction_axes [:np.random.randint(shape_len+1)])
if len(reduction_axes) == 0:
reduction_axes = None
keepdims = np.random.randint(2) == 0
print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='')
if dtype in [np.float16, np.float32, np.float64]:
value_n = np.random.uniform(size=shape).astype(dtype)
else:
value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype )
value_t = Tensor.from_value(value_n)
if op_type == 'sum':
reducted_n = value_n.sum(reduction_axes, keepdims=keepdims)
reducted_t = value_t.sum(reduction_axes, keepdims=keepdims)
elif op_type == 'mean':
reducted_n = value_n.mean(reduction_axes, keepdims=keepdims)
reducted_t = value_t.mean(reduction_axes, keepdims=keepdims)
elif op_type == 'max':
reducted_n = value_n.max(reduction_axes, keepdims=keepdims)
reducted_t = value_t.max(reduction_axes, keepdims=keepdims)
elif op_type == 'min':
reducted_n = value_n.min(reduction_axes, keepdims=keepdims)
reducted_t = value_t.min(reduction_axes, keepdims=keepdims)
print(f'{reducted_n.shape} == {reducted_t.shape} ... ')
if not _all_close(reducted_t.np(), reducted_n):
raise Exception(f'data is not equal')
print('pass')
def InitRandomUniform_test():
for dtype in HType.get_np_scalar_types():
for shape_len in range(1, 5):
shape = np.random.randint( 8, size=(shape_len,) )+1
print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='')
Tensor(shape, dtype, initializer=InitRandomUniform()).np()
print('pass')
def InitCoords2DArange_test():
for dtype in HType.get_np_scalar_types():
for shape_len in range(2, 5):
shape = np.random.randint( 1, 60, size=(shape_len,) ).tolist()
shape = shape + ([2] if np.random.randint(2) == 0 else [3])
h_start = np.random.randint(80)
h_stop = h_start + np.random.randint(80)
w_start = np.random.randint(80)
w_stop = w_start + np.random.randint(80)
print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='')
Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np()
print('pass')
def concat_test():
for shape_len in range(2, 5):
for dtype in HType.get_np_scalar_types():
shape = (np.random.randint( 8, size=(shape_len,) )+1).tolist()
axis = np.random.randint(shape_len)
count = np.random.randint(4)+1
shapes = tuple( tuple( dim if i != axis else np.random.randint(8)+1
for i,dim in enumerate(shape) )
for shape in ([shape] * count) )
print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='')
V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ]
O_n = np.concatenate(V_n, axis)
print(f'{O_n.shape} == ', end='')
V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ]
O_t = op.concat(V_t, axis)
print(f'{O_t.shape} ... ', end='')
if O_n.shape != O_t.shape:
raise Exception('shape is not equal')
if not all ( np.ndarray.flatten( O_t.np() == O_n ) ):
raise Exception(f'data is not equal')
print('pass')
def matmul_test():
for _ in range(100):
for dtype in [np.float32]:
BATCH = np.random.randint(8)+1
M = np.random.randint(8)+1
N = np.random.randint(32768)+1
K = np.random.randint(32768)+1
while K*N > ( 8000000 // BATCH ):
K = max(1, K // 2)
N = max(1, N // 2)
if np.random.randint(2) == 0:
size = [2,4,8,16][np.random.randint(4)]
M = max(1, M // size) * size
N = max(1, N // size) * size
K = max(1, K // size) * size
if BATCH == 1:
A_shape = (M, K)
B_shape = (K, N)
else:
A_shape = (BATCH, M, K)
B_shape = (BATCH, K, N)
print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='')
A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype)
B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype)
O_n = np.matmul(A_n, B_n)
print(f'{O_n.shape} == ', end='')
A_t = Tensor.from_value(A_n)
B_t = Tensor.from_value(B_n)
O_t = op.matmul(A_t, B_t)
print(f'{O_t.shape} ... ', end='')
if O_n.shape != O_t.shape:
raise Exception('shape is not equal')
if not _all_close (O_t.np(), O_n):
raise Exception(f'data is not equal')
print('pass')
def slice_test():
for iteration in [0,1]:
for shape_len in range(5,1,-1):
for dtype in HType.get_np_scalar_types():
while True:
shape = np.random.randint( 1, 8, size=(shape_len,) )
if iteration == 0:
slices = [ slice(None,None,None), ] * shape_len
axis = np.random.randint(shape_len)
shape[axis] = 1
slices[axis] = 0
else:
slices = []
for i in range (shape_len):
axis_size = shape[i]
b = np.random.randint(axis_size)
e = np.random.randint(axis_size)
if b == e:
slices.append(b)
else:
if b < e:
s = 1
if b == 0:
b = None
if e == axis_size-1:
e = None
else:
s = -1
if b == axis_size-1:
b = None
if e == 0:
e = None
slices.append ( slice(b,e,s) )
if np.random.randint(2) == 0:
axis = np.random.randint(shape_len)
slices[axis] = Ellipsis
shape = tuple(shape)
slices = tuple(slices)
print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
sliced_n = val_n[slices]
print(f'{sliced_n.shape} ... ', end='')
sliced_t = Tensor.from_value(val_n)[slices]
print(f'{sliced_t.shape} ... ', end='')
if 0 in sliced_n.shape:
# some cases like 0:1:-1 will produce zero shape and invalid array on numpy
# but nn.slice has no such behaviour, thus we have to generate new slice again
print('pass (bad case)')
continue
if np.prod(sliced_n.shape) != sliced_t.shape.size:
raise Exception(f'shape is not equal')
if not _all_close(sliced_t.np(), sliced_n):
raise Exception(f'data is not equal')
print('pass')
break
def transpose_test():
for dtype in HType.get_np_scalar_types():
for shape_len in range(2, 5):
shape = np.random.randint( 8, size=(shape_len,) )+1
axes_order = np.array([*range(shape_len)])
np.random.shuffle(axes_order)
print(f'transpose: {shape} {axes_order} ... ', end='')
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
transposed_n = np.transpose(val_n, axes_order)
print(f'{transposed_n.shape} ... ', end='')
val_t = Tensor.from_value(val_n)
transposed_t = op.transpose (val_t, axes_order )
print(f'{transposed_t.shape} ... ', end='')
if transposed_n.shape != transposed_t.shape:
raise Exception('shape is not equal')
if not all ( np.ndarray.flatten( transposed_t.np() == transposed_n ) ):
raise Exception(f'data is not equal {shape} {axes_order}')
print('pass')
def any_wise_op_test():
for op_type in ['square', '+', '-', '*', '/', 'min', 'max']:
for dtype in HType.get_np_scalar_types():
if dtype != np.bool_:
shape_gen = range(1, 5)
for shape_len in shape_gen:
a_shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
if np.random.randint(2) == 0:
b_shape = tuple(a_shape[np.random.randint(len(a_shape)):])
b_shape = (1,) if len(b_shape) == 0 else b_shape
else:
b_shape = list(a_shape)
b_shape[ np.random.randint(len(b_shape)) ] = 1
b_shape = tuple(b_shape)
shapes = [a_shape, b_shape]
if np.random.randint(2) == 0:
shapes = shapes[::-1]
a_shape, b_shape = shapes
print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='')
a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)
a_t = Tensor.from_value(a_n)
b_t = Tensor.from_value(b_n)
if op_type == '+':
r_t = a_t + b_t
elif op_type == '-':
r_t = a_t - b_t
elif op_type == '*':
r_t = a_t * b_t
elif op_type == '/':
r_t = a_t / b_t
elif op_type == 'min':
r_t = op.min_(a_t, b_t)
elif op_type == 'max':
r_t = op.max_(a_t, b_t)
elif op_type == 'square':
r_t = op.square(a_t)
if op_type in ['+','-','*','/']:
r_n = eval(f'a_n {op_type} b_n')
r_n = r_n.astype(dtype)
elif op_type == 'min':
r_n = np.minimum(a_n, b_n)
elif op_type == 'max':
r_n = np.maximum(a_n, b_n)
elif op_type == 'square':
r_n = np.square(a_n)
if r_n.shape != r_t.shape:
raise Exception(f'shapes are not equal')
if not _all_close(r_t.np(), r_n):
raise Exception(f'data is not equal')
print('pass')

View file

@ -0,0 +1,55 @@
class SCacheton:
"""
Static class for caching classes and vars by hashable arguments
"""
cachetons = {}
cachevars = {}
@staticmethod
def get(cls, *args, **kwargs):
"""
Get class cached by args/kwargs
If it does not exist, creates new with *args,**kwargs
All cached data will be freed with cleanup()
You must not to store Tensor in SCacheton, use per-device cache vars
"""
cls_multitons = SCacheton.cachetons.get(cls, None)
if cls_multitons is None:
cls_multitons = SCacheton.cachetons[cls] = {}
key = (args, tuple(kwargs.items()) )
data = cls_multitons.get(key, None)
if data is None:
data = cls_multitons[key] = cls(*args, **kwargs)
return data
@staticmethod
def set_var(var_name, value):
"""
Set data cached by var_name
All cached data will be freed with cleanup()
You must not to store Tensor in SCacheton, use per-device cache vars
"""
SCacheton.cachevars[var_name] = value
@staticmethod
def get_var(var_name):
"""
Get data cached by var_name
All cached data will be freed with cleanup()
You must not to store Tensor in SCacheton, use per-device cache vars
"""
return SCacheton.cachevars.get(var_name, None)
@staticmethod
def cleanup():
"""
Free all cached objects
"""
SCacheton.cachetons = {}
SCacheton.cachevars = {}

View file

@ -0,0 +1,124 @@
import weakref
from typing import Iterable, Union
import numpy as np
from .AShape import *
from .backend import *
from .HType import *
class Tensor:
### CONSTRUCTORS
def __init__(self, shape : Union[AShape, Iterable],
dtype : np.dtype,
device : Union[None, int, Device, DeviceInfo] = None,
initializer : 'Initializer' = None,
):
Tensor._object_count += 1
self._shape = shape = AShape(shape)
self._dtype = dtype = np.dtype(dtype)
self._device = device = get_device(device)
if device is None:
raise Exception('No device.')
self._seq_id = Tensor._seq_id = Tensor._seq_id + 1
self._buffer = Buffer(device, size=shape.size*dtype.itemsize,
on_initialize= lambda initializer=initializer, wself=weakref.ref(self): initializer.initialize_tensor( wself() ) if initializer is not None else None )
@staticmethod
def from_value(value, dtype=None, device:Union[None,int,Device,DeviceInfo]=None) -> 'Tensor':
if isinstance(value, Tensor):
device = value.get_device()
elif not isinstance(value, np.ndarray):
if HType.is_scalar_type(value):
if dtype is None:
raise ValueError('dtype must be specified for single value')
value = np.array([value], dtype=dtype)
elif isinstance(value, Iterable):
if dtype is None:
raise ValueError('dtype must be specified for Iterable of values')
value = np.array(value, dtype=dtype)
else:
raise ValueError(f'Unsupported value type {value.__class__}')
return Tensor(shape=value.shape, dtype=value.dtype, device=device).set(value)
### PROPERTIES
@property
def shape(self): return self._shape
@property
def ndim(self): return len(self._shape)
@property
def dtype(self) -> np.dtype: return self._dtype
@property
def name(self) -> str: return f'#{self.get_seq_id()}'
### GETTERS/SETTERS
def get_buffer(self) -> Buffer: return self._buffer
def get_device(self) -> Device: return self._device
def get_seq_id(self) -> int: return self._seq_id
def set(self, value : Union['Tensor',np.ndarray,int,float]) -> 'Tensor':
if isinstance(value, Tensor):
if self.shape.size != value.shape.size:
raise ValueError('Unable to set the data from other tensor: shape.size is not the same.')
self.get_buffer().set(value.get_buffer())
else:
if isinstance(value, np.ndarray):
value = value.astype(self.dtype)
else:
if HType.is_scalar_type(value):
value = np.array([value], dtype=self.dtype)
elif isinstance(value, Iterable):
value = np.array(value, dtype=self.dtype)
else:
raise ValueError(f'Unsupported value type {value.__class__}')
self.get_buffer().set(value)
return self
def np(self):
"""Returns numpy value of a Tensor"""
return self.get_buffer().np(self.shape, self.dtype)
### OPERATORS
def __add__(self, value) -> 'Tensor': ...
def __radd__(self, value) -> 'Tensor': ...
def __sub__(self, value) -> 'Tensor': ...
def __rsub__(self, value) -> 'Tensor': ...
def __mul__(self, value) -> 'Tensor': ...
def __rmul__(self, value) -> 'Tensor': ...
def __truediv__(self, value) -> 'Tensor': ...
def __rtruediv__(self, value) -> 'Tensor': ...
def __neg__(self) -> 'Tensor': ...
def __getitem__(self, slices) -> 'Tensor': ...
def __setitem__(self, slices, value): ...
def as_shape(self, shape) -> 'Tensor': ...
def copy(self) -> 'Tensor': ...
def max(self, axes=None, keepdims=False) -> 'Tensor': ...
def mean(self, axes=None, keepdims=False) -> 'Tensor': ...
def min(self, axes=None, keepdims=False) -> 'Tensor': ...
def reshape(self, new_shape) -> 'Tensor': ...
def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
@property
def T(self): return self.transpose( tuple(range(self.shape.ndim-1,-1,-1)) )
### INTERNAL METHODS
def __del__(self):
Tensor._object_count -= 1
_object_count = 0
_seq_id = 0
__all__ = ['Tensor']

View file

@ -0,0 +1,92 @@
from .op import *
from .AShape import *
from .backend import *
from .Tensor import Tensor
def Tensor__str__(self : Tensor): return f"T {self.name} {self.shape} {self.dtype.name}"
Tensor.__str__ = Tensor__str__
def Tensor__repr__(self : Tensor):
s = self.__str__() + '\n'
s += str(self.np()) + '\n'
s += self.__str__()
return s
Tensor.__repr__ = Tensor__repr__
def Tensor__add__(self : Tensor, value) -> Tensor:
return add(self, value)
Tensor.__add__ = Tensor__add__
def Tensor__radd__(self : Tensor, value) -> Tensor:
return add(value, self)
Tensor.__radd__ = Tensor__radd__
def Tensor__sub__(self : Tensor, value) -> Tensor:
return sub(self, value)
Tensor.__sub__ = Tensor__sub__
def Tensor__rsub__(self : Tensor, value) -> Tensor:
return sub(value, self)
Tensor.__rsub__ = Tensor__rsub__
def Tensor__mul__(self : Tensor, value) -> Tensor:
if self == value:
return square(self)
return mul(self, value)
Tensor.__mul__ = Tensor__mul__
def Tensor__rmul__(self : Tensor, value) -> Tensor:
if self == value:
return square(self)
return mul(value, self)
Tensor.__rmul__ = Tensor__rmul__
def Tensor__truediv__(self : Tensor, value) -> Tensor:
return div(self, value)
Tensor.__truediv__ = Tensor__truediv__
def Tensor__rtruediv__(self : Tensor, value) -> Tensor:
return div(value, self)
Tensor.__rtruediv__ = Tensor__rtruediv__
def Tensor___neg__(self : Tensor):
raise NotImplementedError()
Tensor.___neg__ = Tensor___neg__
Tensor.__getitem__ = slice_
Tensor.__setitem__ = slice_set
def Tensor_as_shape(self : Tensor, shape) -> Tensor:
return TensorRef(self, shape)
Tensor.as_shape = Tensor_as_shape
def Tensor_copy(self : Tensor) -> Tensor:
return Tensor.from_value(self)
Tensor.copy = Tensor_copy
Tensor.max = reduce_max
Tensor.mean = reduce_mean
Tensor.min = reduce_min
Tensor.reshape = reshape
Tensor.sum = reduce_sum
Tensor.transpose = transpose
class TensorRef(Tensor):
"""
TensorRef used to interpret existing Tensor with different shape.
use Tensor.as_ref() method
"""
def __init__(self, t : Tensor, shape):
shape = AShape(shape)
if t.shape.size != shape.size:
raise ValueError(f'Cannot interpet shape {t.shape} as ref shape {shape}')
super().__init__(shape, t.dtype, device=t.get_device())
self._t = t
# Forward methods to original tensor
def get_seq_id(self) -> int: return self._t.get_seq_id()
def get_buffer(self) -> Buffer: return self._t.get_buffer()
def get_device(self) -> Device: return self._t.get_device()
__all__ = []

View file

@ -0,0 +1,109 @@
from typing import Iterable, Union
import numpy as np
from . import OpenCL as CL
class Buffer:
__slots__ = ['_device','_cl_mem','_size','_on_initialize']
def __init__(self, device : 'Device', size : int, on_initialize = None):
"""
represents physical buffer associated with physical device
device Device
size int
"""
Buffer._object_count += 1
self._device = device
self._size = size
self._cl_mem = None
self._on_initialize = on_initialize
def __del__(self):
#print('Buffer.__del__')
Buffer._object_count -= 1
self.free_cl_mem()
def get_device(self) -> 'Device': return self._device
def get_size(self) -> int: return self._size
def has_cl_mem(self) -> bool: return self._cl_mem is not None
def get_cl_mem(self) -> CL.cl_mem:
if self._cl_mem is None:
self._cl_mem = self._device._cl_mem_pool_alloc(self._size)
if self._on_initialize is not None:
self._on_initialize()
return self._cl_mem
def free_cl_mem(self):
if self._cl_mem is not None:
self._device._cl_mem_pool_free(self._cl_mem)
self._cl_mem = None
def set(self, value : Union['Buffer', np.ndarray]):
"""
Parameters
value Buffer copy data from other Buffer.
np.ndarray copies values from ndarray
to Buffer's memory
"""
if isinstance(value, Buffer):
if self != value:
if self._size != value._size:
raise Exception(f'Unable to copy from Buffer with {value._size} size to buffer with {self._size} size.')
if self._device == value._device:
CL.clEnqueueCopyBuffer(self._device._get_ctx_q(), value.get_cl_mem(), self.get_cl_mem(), 0, 0, self._size, 0, None, None)
else:
# Transfer between devices will cause low performance
raise NotImplementedError()
else:
if not isinstance(value, np.ndarray):
raise ValueError (f'Invalid type {value.__class__}. Must be np.ndarray.')
if value.nbytes != self._size:
raise ValueError(f'Value size {value.nbytes} does not match Buffer size {self._size}.')
if not value.flags.contiguous:
value = value.reshape(-1)
if not value.flags.contiguous:
raise ValueError ("Unable to write from non-contiguous np array.")
ev = CL.cl_event()
clr = CL.clEnqueueWriteBuffer(self._device._get_ctx_q(), self.get_cl_mem(), False, 0, value.nbytes, value.ctypes.data_as(CL.c_void_p), 0, None, ev)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clEnqueueWriteBuffer error: {clr}')
CL.clWaitForEvents(1, ( CL.cl_event * 1 )(ev) )
CL.clReleaseEvent(ev)
def np(self, shape : Iterable, dtype : np.dtype):
"""
Returns data of buffer as np.ndarray with specified shape and dtype
"""
out_np_value = np.empty (shape, dtype)
if out_np_value.nbytes != self._size:
raise ValueError(f'Unable to represent Buffer with size {self._size} as shape {shape} with dtype {dtype}')
clr = CL.clEnqueueReadBuffer(self._device._get_ctx_q(), self.get_cl_mem(), True, 0, self._size, out_np_value.ctypes.data, 0, None, None)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clEnqueueReadBuffer error: {clr}')
return out_np_value
def __str__(self):
return f'Buffer [{self._size} bytes][{f"{self._cl_mem.value}" if self._cl_mem is not None else "unallocated"}] on {str(self._device)}'
def __repr__(self):
return self.__str__()
_object_count = 0

View file

@ -0,0 +1,522 @@
from typing import List, Union
import numpy as np
from . import OpenCL as CL
from .Buffer import Buffer
from .DeviceInfo import DeviceInfo
from .Kernel import Kernel
_np_dtype_to_cl = { np.uint8: CL.cl_uchar,
np.int8: CL.cl_char,
np.uint16: CL.cl_ushort,
np.int16: CL.cl_short,
np.uint32: CL.cl_uint,
np.int32: CL.cl_int,
np.uint64: CL.cl_ulong,
np.int64: CL.cl_long,
np.float16: CL.cl_half,
np.float32: CL.cl_float,
np.float64: CL.cl_double }
_opencl_device_ids = None
_default_device = None
_devices = {}
class Device:
"""
Represents physical TensorCL device
"""
def __init__(self, device_info : DeviceInfo, **kwargs):
if kwargs.get('_check', None) is None:
raise Exception('You should not to create Device from constructor. Use get_device()')
self._cached_data = {} # cached data (per device) by key
self._pooled_buffers = {} # Pool of cached device buffers.
self._compiled_kernels = {} # compiled kernels by key
self._ctx_q = None # CL command queue
self._ctx = None # CL context
self._total_memory_allocated = 0
self._total_buffers_allocated = 0
self._total_memory_pooled = 0
self._total_buffers_pooled = 0
self._device_info = device_info
self._device_id = _get_opencl_device_ids()[device_info.get_index()]
def __del__(self):
self.cleanup()
def __eq__(self, other):
if self is not None and other is not None and isinstance(self, Device) and isinstance(other, Device):
return self._device_id.value == other._device_id.value
return False
def __hash__(self):
return self._device_id.value
def _get_ctx(self) -> CL.cl_context:
# Create OpenCL context on demand
if self._ctx is None:
clr = CL.CLRESULT()
ctx = CL.clCreateContext( None, 1, (CL.cl_device_id * 1)( self._device_id ), None, None, clr)
if clr != CL.CLERROR.SUCCESS:
raise Exception('Unable to create OpenCL context.')
self._ctx = ctx
return self._ctx
def _get_ctx_q(self) -> CL.cl_command_queue:
# Create CommandQueue on demand
if self._ctx_q is None:
clr = CL.CLRESULT()
ctx_q = CL.clCreateCommandQueue(self._get_ctx(), self._device_id, CL.cl_command_queue_properties(0), clr)
if clr != CL.CLERROR.SUCCESS:
raise Exception('Unable to create OpenCL CommandQueue.')
self._ctx_q = ctx_q
return self._ctx_q
def get_description(self) -> str:
return f"{self._device_info.get_name()} [{(self._device_info.get_total_memory() / 1024**3) :.3}Gb]"
def __str__(self):
return self.get_description()
def __repr__(self):
return f'{self.__class__.__name__} object: ' + self.__str__()
def set_cached_data(self, key, value):
"""
All cached data will be freed with cleanup()
"""
self._cached_data[key] = value
def get_cached_data(self, key):
return self._cached_data.get(key, None)
def get_total_allocated_memory(self):
return self._total_memory_allocated
def get_max_malloc_size(self) -> int:
size = CL.cl_ulong()
clr = CL.clGetDeviceInfo(self._device_id, CL.CL_DEVICE_MAX_MEM_ALLOC_SIZE, CL.sizeof(size), CL.byref(size), None)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clGetDeviceInfo error: {clr}')
return size.value
def _compile_kernel(self, key, kernel_text) -> CL.cl_kernel:
"""
compile or get cached kernel
"""
compiled_krn, prog = self._compiled_kernels.get(key, (None, None) )
if compiled_krn is None:
clr = CL.CLRESULT()
prog = CL.clCreateProgramWithSource(self._get_ctx(), 1, CL.c_char_p(kernel_text.encode()), None, clr )
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clCreateProgramWithSource error {clr}, with kernel_text:\n{kernel_text}')
clr = CL.clBuildProgram(prog, 1, (CL.cl_device_id*1)(self._device_id), CL.c_char_p('-cl-std=CL1.2 -cl-single-precision-constant'.encode()), None, None )
if clr != CL.CLERROR.SUCCESS:
build_log_size = CL.c_size_t()
clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, 0, None, CL.byref(build_log_size) )
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clGetProgramBuildInfo,error: {clr}')
build_log = CL.create_string_buffer(build_log_size.value)
clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, build_log_size.value, build_log, None )
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clGetProgramBuildInfo error: {clr}')
build_log = str(build_log.value, 'utf-8')
raise Exception(f'clBuildProgram error:\n\n{build_log}')
num_kernels = CL.cl_uint()
clr = CL.clCreateKernelsInProgram(prog, 0, None, CL.byref(num_kernels))
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clCreateKernelsInProgram error: {clr}')
if num_kernels.value != 1:
raise Exception(f'Kernel must contain only one __kernel:\n\n{kernel_text}')
kernels = (CL.cl_kernel * num_kernels.value)()
clr = CL.clCreateKernelsInProgram(prog, num_kernels.value, kernels, None)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clCreateKernelsInProgram error: {clr}')
compiled_krn = kernels[0]
self._compiled_kernels[key] = (compiled_krn, prog)
return compiled_krn
def _cl_mem_alloc(self, size) -> CL.cl_mem:
clr = CL.CLRESULT()
mem = CL.clCreateBuffer(self._get_ctx(), CL.CL_MEM_READ_WRITE, size, None, clr)
if clr == CL.CLERROR.SUCCESS:
# Fill one byte to check memory availability
ev = CL.cl_event()
clr = CL.clEnqueueFillBuffer (self._get_ctx_q(), mem, (CL.c_char * 1)(), 1, 0, 1, 0, None, ev )
if clr == CL.CLERROR.SUCCESS:
CL.clReleaseEvent(ev)
self._total_memory_allocated += size
self._total_buffers_allocated += 1
return mem
return None
def _cl_mem_free(self, mem : CL.cl_mem):
size = CL.c_size_t()
clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clGetMemObjectInfo error: {clr}')
size = size.value
self._total_memory_allocated -= size
self._total_buffers_allocated -= 1
clr = CL.clReleaseMemObject(mem)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clReleaseMemObject error: {clr}')
def _cl_mem_pool_alloc(self, size):
"""
allocate or get cl_mem from pool
"""
pool = self._pooled_buffers
# First try to get pooled buffer
ar = pool.get(size, None)
if ar is not None and len(ar) != 0:
mem = ar.pop(-1)
self._total_memory_pooled -= size
self._total_buffers_pooled -= 1
else:
# No pooled buffer, try to allocate new
while True:
mem = self._cl_mem_alloc(size)
if mem is None:
# MemoryError. Finding largest pooled buffer to release
buf_to_release = None
for size_key in sorted(list(pool.keys()), reverse=True):
ar = pool[size_key]
if len(ar) != 0:
buf_to_release = ar.pop(-1)
break
if buf_to_release is not None:
# Release pooled buffer and try to allocate again
self._cl_mem_free(buf_to_release)
continue
raise Exception(f'Unable to allocate {size // 1024**2}Mb on {str(self)}')
break
return mem
def _cl_mem_pool_free(self, mem : CL.cl_mem):
"""
Put cl_mem to pool for reuse in future.
"""
size = CL.c_size_t()
clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clGetMemObjectInfo error: {clr}')
size = size.value
pool = self._pooled_buffers
ar = pool.get(size, None)
if ar is None:
ar = pool[size] = []
ar.append(mem)
self._total_memory_pooled += size
self._total_buffers_pooled += 1
def print_stat(self):
s = f'''
Total memory allocated: {self._total_memory_allocated}
Total buffers allocated: {self._total_buffers_allocated}
Total memory pooled: {self._total_memory_pooled}
Total buffers pooled: {self._total_buffers_pooled}
N of compiled kernels: {len(self._compiled_kernels)}
N of cacheddata: {len(self._cached_data)}
'''
print(s)
def run_kernel(self, kernel : Kernel, *args, global_shape=None, local_shape=None, global_shape_offsets=None, wait=False):
"""
Run kernel on Device
Arguments
*args arguments will be passed to OpenCL kernel
allowed types:
Buffer
np single value
global_shape(None) tuple of ints, up to 3 dims
amount of parallel kernel executions.
in OpenCL kernel,
id can be obtained via get_global_id(dim)
local_shape(None) tuple of ints, up to 3 dims
specifies local groups of every dim of global_shape.
in OpenCL kernel,
id can be obtained via get_local_id(dim)
global_shape_offsets(None) tuple of ints
offsets for global_shape
wait(False) wait execution to complete
"""
ckernel = self._compile_kernel(kernel, kernel.get_kernel_text())
if global_shape is None:
global_shape = kernel.get_global_shape()
if global_shape is None:
raise ValueError('global_shape must be defined.')
work_dim = len(global_shape)
global_shape_ar = (CL.c_size_t*work_dim)()
for i,v in enumerate(global_shape):
global_shape_ar[i] = v
local_shape_ar = None
if local_shape is None:
local_shape = kernel.get_local_shape()
if local_shape is not None:
if len(local_shape) != work_dim:
raise ValueError('len of local_shape must match global_shape')
local_shape_ar = (CL.c_size_t*work_dim)()
for i,v in enumerate(local_shape):
local_shape_ar[i] = v
global_shape_offsets_ar = None
if global_shape_offsets is not None:
if len(global_shape_offsets) != work_dim:
raise ValueError('len of global_shape_offsets must match global_shape')
global_shape_offsets_ar = (CL.c_size_t*work_dim)()
for i,v in enumerate(local_shape):
global_shape_offsets_ar[i] = v
for i, arg in enumerate(args):
if isinstance(arg, Buffer):
arg = arg.get_cl_mem()
else:
cl_type = _np_dtype_to_cl.get(arg.__class__, None)
if cl_type is None:
raise ValueError(f'Cannot convert type {arg.__class__} to OpenCL type.')
arg = cl_type(arg)
clr = CL.clSetKernelArg(ckernel, i, CL.sizeof(arg), CL.byref(arg))
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clSetKernelArg error: {clr}')
ev = CL.cl_event() if wait else None
clr = CL.clEnqueueNDRangeKernel(self._get_ctx_q(), ckernel, work_dim, global_shape_offsets_ar, global_shape_ar, local_shape_ar, 0, None, ev)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clEnqueueNDRangeKernel error: {clr}')
if wait:
CL.clWaitForEvents(1, (CL.cl_event*1)(ev) )
CL.clReleaseEvent(ev)
def wait(self):
"""
Wait to finish all queued operations on this Device
"""
clr = CL.clFinish(self._get_ctx_q())
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clFinish error: {clr}')
def cleanup(self):
"""
Frees all resources from this Device.
"""
self._cached_data = {}
pool = self._pooled_buffers
for size_key in pool.keys():
for mem in pool[size_key]:
self._cl_mem_free(mem)
self._pooled_buffers = {}
self._total_memory_pooled = 0
self._total_buffers_pooled = 0
if self._total_memory_allocated != 0:
raise Exception('Unable to cleanup CLDevice, while not all Buffers are deallocated.')
for kernel, prog in self._compiled_kernels.values():
clr = CL.clReleaseKernel(kernel)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clReleaseKernel error: {clr}')
clr = CL.clReleaseProgram(prog)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clReleaseProgram error: {clr}')
self._compiled_kernels = {}
if self._ctx_q is not None:
clr = CL.clReleaseCommandQueue(self._ctx_q)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clReleaseCommandQueue error: {clr}')
self._ctx_q = None
if self._ctx is not None:
clr = CL.clReleaseContext(self._ctx)
if clr != CL.CLERROR.SUCCESS:
raise Exception(f'clReleaseContext error: {clr}')
self._ctx = None
def _get_opencl_device_ids() -> List[CL.cl_device_id]:
global _opencl_device_ids
if _opencl_device_ids is None:
_opencl_device_ids = []
device_types = CL.CL_DEVICE_TYPE_CPU | CL.CL_DEVICE_TYPE_ACCELERATOR | CL.CL_DEVICE_TYPE_GPU
while True:
num_platforms = CL.cl_uint()
if CL.clGetPlatformIDs(0, None, num_platforms) != CL.CLERROR.SUCCESS or \
num_platforms.value == 0:
break
platforms = (CL.cl_platform_id * num_platforms.value) ()
if CL.clGetPlatformIDs(num_platforms.value, platforms, None) != CL.CLERROR.SUCCESS:
break
for i_platform in range(num_platforms.value):
platform = platforms[i_platform]
num_devices = CL.cl_uint(0)
if CL.clGetDeviceIDs(platform, device_types, 0, None, num_devices) != CL.CLERROR.SUCCESS or \
num_devices.value == 0:
continue
device_ids = (CL.cl_device_id * num_devices.value)()
if CL.clGetDeviceIDs(platform, device_types, num_devices.value, device_ids, None) != CL.CLERROR.SUCCESS:
continue
for i in range(num_devices.value):
device_id = device_ids[i]
# Check OpenCL version.
if device_id is not None:
device_version_size = CL.c_size_t()
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, 0, None, device_version_size) == CL.CLERROR.SUCCESS:
device_version = CL.create_string_buffer(device_version_size.value)
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, device_version_size.value, device_version, None) == CL.CLERROR.SUCCESS:
device_version = str(device_version.value, 'ascii')
major, minor = device_version.split(' ')[1].split('.')
opencl_version = int(major)*10+int(minor)
if opencl_version >= 12:
_opencl_device_ids.append(device_id)
break
return _opencl_device_ids
def get_available_devices_info() -> List[DeviceInfo]:
"""
returns a list of available picklable DeviceInfo's
"""
devices = []
for device_index, device_id in enumerate(_get_opencl_device_ids()):
device_name = 'undefined'
device_total_memory = 0
name_size = CL.c_size_t()
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, 0, None, name_size) == CL.CLERROR.SUCCESS:
name_value = CL.create_string_buffer(name_size.value)
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, name_size.value, name_value, None) == CL.CLERROR.SUCCESS:
device_name = str(name_value.value, 'ascii')
global_mem_size = CL.cl_ulong()
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_GLOBAL_MEM_SIZE, CL.sizeof(global_mem_size), CL.byref(global_mem_size), None) == CL.CLERROR.SUCCESS:
device_total_memory = global_mem_size.value
vendor_id = CL.cl_uint()
CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VENDOR_ID, CL.sizeof(vendor_id), CL.byref(vendor_id), None)
vendor_id = vendor_id.value
max_compute_units = CL.cl_uint()
CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_MAX_COMPUTE_UNITS, CL.sizeof(max_compute_units), CL.byref(max_compute_units), None)
max_compute_units = max_compute_units.value
performance_level = max_compute_units
if vendor_id == 0x8086: # Intel device
performance_level -= 1000
devices.append( DeviceInfo(index=device_index, name=device_name, total_memory=device_total_memory, performance_level=performance_level ) )
return devices
def get_default_device() -> Union[Device, None]:
global _default_device
if _default_device is None:
_default_device = get_device(0)
return _default_device
def set_default_device(device : Device):
if not isinstance(device, Device):
raise ValueError('device must be an instance of Device')
global _default_device
_default_device = device
def get_device(arg : Union[None, int, Device, DeviceInfo]) -> Union[Device, None]:
"""
get physical TensorCL device.
arg None - get best device
int - by index
DeviceInfo - by device info
Device - returns the same
"""
global _devices
if arg is None:
return get_best_device()
elif isinstance(arg, int):
devices_info = get_available_devices_info()
if arg < len(devices_info):
arg = devices_info[arg]
else:
return None
elif isinstance(arg, Device):
return arg
elif not isinstance(arg, DeviceInfo):
raise ValueError(f'Unknown type of arg {arg.__class__}')
device = _devices.get(arg, None)
if device is None:
device = _devices[arg] = Device(arg, _check=1)
return device
def get_best_device() -> Union[Device, None]:
"""
returns best device from available.
"""
perf_level = -999999
result = None
for device_info in get_available_devices_info():
dev_perf_level = device_info.get_performance_level()
if perf_level < dev_perf_level:
perf_level = dev_perf_level
result = device_info
if result is not None:
result = get_device(result)
return result
def cleanup_devices():
global _devices
for device in list(_devices.values()):
device.cleanup()
_devices = {}

View file

@ -0,0 +1,44 @@
class DeviceInfo:
"""
Represents picklable OpenCL device info
"""
def __init__(self, index : int = None, name : str = None, total_memory : int = None, performance_level : int = None):
self._index = index
self._name = name
self._total_memory = total_memory
self._performance_level = performance_level
def __getstate__(self):
return self.__dict__.copy()
def __setstate__(self, d):
self.__init__()
self.__dict__.update(d)
def get_index(self) -> int:
return self._index
def get_name(self) -> str:
return self._name
def get_total_memory(self) -> int:
return self._total_memory
def get_performance_level(self) -> int:
return self._performance_level
def __eq__(self, other):
if self is not None and other is not None and isinstance(self, DeviceInfo) and isinstance(other, DeviceInfo):
return self._index == other._index
return False
def __hash__(self):
return self._index
def __str__(self):
return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]"
def __repr__(self):
return f'{self.__class__.__name__} object: ' + self.__str__()

View file

@ -0,0 +1,26 @@
class Kernel:
"""
TensorCL kernel.
It does not allocate any resources, thus can be used as static variable within class.
arguments
kernel_text OpenCL text of kernel. Must contain only one __kernel
global_shape default global_shape for .run()
local_shape default local_shape for .run()
"""
def __init__(self, kernel_text, global_shape=None, local_shape=None):
self._kernel_text = kernel_text
self._global_shape = global_shape
self._local_shape = local_shape
def get_kernel_text(self) -> str: return self._kernel_text
def get_global_shape(self): return self._global_shape
def get_local_shape(self): return self._local_shape
def __str__(self): return f'Kernel: \n{self._kernel_text}'
def __repr__(self): return self.__str__()

View file

@ -0,0 +1,278 @@
"""
Minimal OpenCL 1.2 low level ctypes API.
"""
import ctypes
from ctypes import POINTER, create_string_buffer, sizeof, c_char_p, c_char, c_size_t, c_void_p, byref
from ctypes.util import find_library
from enum import IntEnum
dlls_by_name = {}
def dll_import(dll_name):
dll = dlls_by_name.get(dll_name, None)
if dll is None:
try:
dll = ctypes.cdll.LoadLibrary(find_library(dll_name))
except:
pass
if dll is None:
raise RuntimeError(f'Unable to load {dll_name} library.')
dlls_by_name[dll_name] = dll
def decorator(func):
dll_func = getattr(dll, func.__name__)
anno = list(func.__annotations__.values())
dll_func.argtypes = anno[:-1]
dll_func.restype = anno[-1]
def wrapper(*args):
return dll_func(*args)
return wrapper
return decorator
class cl_char(ctypes.c_int8): pass
class cl_uchar(ctypes.c_uint8): pass
class cl_short(ctypes.c_int16): pass
class cl_ushort(ctypes.c_uint16): pass
class cl_int(ctypes.c_int32): pass
class cl_uint(ctypes.c_uint32): pass
class cl_long(ctypes.c_int64): pass
class cl_ulong(ctypes.c_uint64): pass
class cl_half(ctypes.c_uint16): pass
class cl_float(ctypes.c_float): pass
class cl_double(ctypes.c_double): pass
class cl_bool(cl_uint): pass
class cl_bitfield(cl_ulong):
def __or__(self, other):
assert isinstance(other, self.__class__)
return self.__class__(self.value | other.value)
def __and__(self, other):
assert isinstance(other, self.__class__)
return self.__class__(self.value & other.value)
def __xor__(self, other):
assert isinstance(other, self.__class__)
return self.__class__(self.value ^ other.value)
def __not__(self):
return self.__class__(~self.value)
def __contains__(self, other):
assert isinstance(other, self.__class__)
return (self.value & other.value) == other.value
def __hash__(self):
return self.value.__hash__()
def __eq__(self, other):
if not isinstance(other, self.__class__):
return False
else:
return self.value == other.value
def __ne__(self, other):
return not(self == other)
def __repr__(self):
return f'cl_bitfield: {self.value}'
class CLERROR(IntEnum):
SUCCESS = 0
DEVICE_NOT_FOUND = -1
DEVICE_NOT_AVAILABLE = -2
COMPILER_NOT_AVAILABLE = -3
MEM_OBJECT_ALLOCATION_FAILURE = -4
OUT_OF_RESOURCES = -5
OUT_OF_HOST_MEMORY = -6
PROFILING_INFO_NOT_AVAILABLE = -7
MEM_COPY_OVERLAP = -8
IMAGE_FORMAT_MISMATCH = -9
IMAGE_FORMAT_NOT_SUPPORTED = -10
BUILD_PROGRAM_FAILURE = -11
MAP_FAILURE = -12
MISALIGNED_SUB_BUFFER_OFFSET = -13
EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST = -14
INVALID_VALUE = -30
INVALID_DEVICE_TYPE = -31
INVALID_PLATFORM = -32
INVALID_DEVICE = -33
INVALID_CONTEXT = -34
INVALID_QUEUE_PROPERTIES = -35
INVALID_COMMAND_QUEUE = -36
INVALID_HOST_PTR = -37
INVALID_MEM_OBJECT = -38
INVALID_IMAGE_FORMAT_DESCRIPTOR = -39
INVALID_IMAGE_SIZE = -40
INVALID_SAMPLER = -41
INVALID_BINARY = -42
INVALID_BUILD_OPTIONS = -43
INVALID_PROGRAM = -44
INVALID_PROGRAM_EXECUTABLE = -45
INVALID_KERNEL_NAME = -46
INVALID_KERNEL_DEFINITION = -47
INVALID_KERNEL = -48
INVALID_ARG_INDEX = -49
INVALID_ARG_VALUE = -50
INVALID_ARG_SIZE = -51
INVALID_KERNEL_ARGS = -52
INVALID_WORK_DIMENSION = -53
INVALID_WORK_GROUP_SIZE = -54
INVALID_WORK_ITEM_SIZE = -55
INVALID_GLOBAL_OFFSET = -56
INVALID_EVENT_WAIT_LIST = -57
INVALID_EVENT = -58
INVALID_OPERATION = -59
INVALID_GL_OBJECT = -60
INVALID_BUFFER_SIZE = -61
INVALID_MIP_LEVEL = -62
INVALID_GLOBAL_WORK_SIZE = -63
INVALID_PROPERTY = -64
INVALID_GL_SHAREGROUP_REFERENCE_KHR = -1000
PLATFORM_NOT_FOUND_KHR = -1001
class CLRESULT(cl_int):
def __eq__(self, other):
if isinstance(other, int):
return self.value == other
elif isinstance(other, self.__class__):
return self.value == other.value
else:
return False
def __ne__(self, other):
return not(self == other)
def __hash__(self):
return self.value.__hash__()
def __str__(self):
try:
return f'CLRESULT ({str(CLERROR(self.value))})'
except:
return f'CLRESULT ({self.value})'
def __repr__(self):
return self.__str__()
class cl_platform_id(c_void_p): ...
class cl_platform_info(cl_uint): ...
class cl_device_id(c_void_p): ...
class cl_device_type(cl_bitfield): ...
class cl_device_info(cl_uint): ...
class cl_context(c_void_p): ...
class cl_context_properties(c_void_p): ...
class cl_command_queue(c_void_p): ...
class cl_command_queue_properties(cl_bitfield): ...
class cl_event(c_void_p): ...
class cl_mem(c_void_p): ...
class cl_mem_info(cl_uint): ...
class cl_mem_flags(cl_bitfield): ...
class cl_program(c_void_p): ...
class cl_program_build_info(cl_uint): ...
class cl_kernel(c_void_p): ...
# https://github.com/KhronosGroup/OpenCL-Headers/blob/master/CL/cl.h
CL_PLATFORM_PROFILE = cl_platform_info(0x0900)
CL_PLATFORM_VERSION = cl_platform_info(0x0901)
CL_PLATFORM_NAME = cl_platform_info(0x0902)
CL_PLATFORM_VENDOR = cl_platform_info(0x0903)
CL_PLATFORM_EXTENSIONS = cl_platform_info(0x0904)
CL_DEVICE_TYPE_DEFAULT = cl_device_type( (1 << 0) )
CL_DEVICE_TYPE_CPU = cl_device_type( (1 << 1) )
CL_DEVICE_TYPE_GPU = cl_device_type( (1 << 2) )
CL_DEVICE_TYPE_ACCELERATOR = cl_device_type( (1 << 3) )
CL_DEVICE_TYPE_ALL = cl_device_type( 0xFFFFFFFF )
CL_DEVICE_TYPE = cl_device_info (0x1000)
CL_DEVICE_VENDOR_ID = cl_device_info (0x1001)
CL_DEVICE_MAX_COMPUTE_UNITS = cl_device_info (0x1002)
CL_DEVICE_GLOBAL_MEM_SIZE = cl_device_info (0x101F)
CL_DEVICE_NAME = cl_device_info (0x102B)
CL_DEVICE_VERSION = cl_device_info (0x102F)
CL_DEVICE_MAX_MEM_ALLOC_SIZE = cl_device_info (0x1010)
CL_DEVICE_MAX_WORK_GROUP_SIZE = cl_device_info (0x1004)
CL_DRIVER_VERSION = cl_device_info (0x102D)
CL_DEVICE_EXTENSIONS = cl_device_info (0x1030)
# cl_mem_flags
CL_MEM_READ_WRITE = cl_mem_flags( (1 << 0) )
CL_MEM_WRITE_ONLY = cl_mem_flags( (1 << 1) )
CL_MEM_READ_ONLY = cl_mem_flags( (1 << 2) )
CL_MEM_USE_HOST_PTR = cl_mem_flags( (1 << 3) )
CL_MEM_ALLOC_HOST_PTR = cl_mem_flags( (1 << 4) )
CL_MEM_COPY_HOST_PTR = cl_mem_flags( (1 << 5) )
# cl_mem_info
CL_MEM_SIZE = cl_mem_info(0x1102)
# cl_program_build_info
CL_PROGRAM_BUILD_STATUS = cl_program_build_info(0x1181)
CL_PROGRAM_BUILD_OPTIONS = cl_program_build_info(0x1182)
CL_PROGRAM_BUILD_LOG = cl_program_build_info(0x1183)
@dll_import('OpenCL')
def clGetPlatformIDs (num_entries : cl_uint, platforms : POINTER(cl_platform_id), num_platforms : POINTER(cl_uint) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clGetPlatformInfo (platform : cl_platform_id, param_name : cl_platform_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...
@dll_import('OpenCL')
def clGetDeviceIDs (platform : cl_platform_id, device_type : cl_device_type, num_entries : cl_uint, devices : POINTER(cl_device_id), num_devices : POINTER(cl_uint)) -> CLRESULT: ...
@dll_import('OpenCL')
def clGetDeviceInfo(device : cl_device_id, param_name : cl_device_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...
@dll_import('OpenCL')
def clCreateContext(properties : cl_context_properties, num_devices : cl_uint, devices : POINTER(cl_device_id), pfn_notify : c_void_p, user_data : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_context: ...
@dll_import('OpenCL')
def clReleaseContext(context : cl_context) -> CLRESULT: ...
@dll_import('OpenCL')
def clCreateCommandQueue(context : cl_context, device : cl_device_id, properties : cl_command_queue_properties, errcode_ret : POINTER(CLRESULT) ) -> cl_command_queue: ...
@dll_import('OpenCL')
def clReleaseCommandQueue(command_queue : cl_command_queue) -> CLRESULT: ...
@dll_import('OpenCL')
def clFinish(command_queue : cl_command_queue) -> CLRESULT: ...
@dll_import('OpenCL')
def clWaitForEvents(num_events : cl_uint, event_list : POINTER(cl_event) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clReleaseEvent(event : cl_event) -> CLRESULT: ...
@dll_import('OpenCL')
def clCreateBuffer(context : cl_context, flags : cl_mem_flags, size : c_size_t, host_ptr : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_mem: ...
@dll_import('OpenCL')
def clGetMemObjectInfo(memobj : cl_mem, param_name : cl_mem_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clReleaseMemObject(memobj : cl_mem) -> CLRESULT: ...
@dll_import('OpenCL')
def clEnqueueReadBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_read : cl_bool, offset : c_size_t, cb : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clEnqueueWriteBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_write : cl_bool, offset : c_size_t, size : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clEnqueueCopyBuffer (command_queue : cl_command_queue, src_buffer : cl_mem, dst_buffer : cl_mem, src_offset : c_size_t, dst_offset : c_size_t, cb : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : cl_event) -> CLRESULT: ...
@dll_import('OpenCL')
def clEnqueueFillBuffer (command_queue : cl_command_queue, buffer : cl_mem, pattern : c_void_p, pattern_size : c_size_t, offset : c_size_t, size : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clCreateProgramWithSource (context : cl_context, count : cl_uint, strings : POINTER(c_char_p), lengths : POINTER(c_size_t), errcode_ret : POINTER(CLRESULT) ) -> cl_program: ...
@dll_import('OpenCL')
def clReleaseProgram (program : cl_program) -> CLRESULT: ...
@dll_import('OpenCL')
def clBuildProgram (program : cl_program, num_devices : cl_uint, device_list : POINTER(cl_device_id), options : c_char_p, pfn_notify : c_void_p, user_data : c_void_p) -> CLRESULT: ...
@dll_import('OpenCL')
def clGetProgramBuildInfo (program : cl_program, device : cl_device_id, param_name : cl_program_build_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clCreateKernelsInProgram (program : cl_program, num_kernels : cl_uint, kernels : POINTER(cl_kernel), num_kernels_ret : POINTER(cl_uint) ) -> CLRESULT: ...
@dll_import('OpenCL')
def clReleaseKernel (program : cl_kernel) -> CLRESULT: ...
@dll_import('OpenCL')
def clSetKernelArg (kernel : cl_kernel, arg_index : cl_uint, arg_size : c_size_t, arg_value : c_void_p) -> CLRESULT: ...
@dll_import('OpenCL')
def clEnqueueNDRangeKernel (command_queue : cl_command_queue, kernel : cl_kernel, work_dim : cl_uint, global_work_offset : POINTER(c_size_t), global_work_size : POINTER(c_size_t), local_work_size : POINTER(c_size_t), num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...

View file

@ -0,0 +1,37 @@
"""
Minimal OpenCL 1.2 low level ctypes API.
"""
from .OpenCL import (CL_DEVICE_EXTENSIONS, CL_DEVICE_GLOBAL_MEM_SIZE,
CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
CL_DEVICE_MAX_WORK_GROUP_SIZE, CL_DEVICE_NAME,
CL_DEVICE_TYPE, CL_DEVICE_TYPE_ACCELERATOR,
CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU,
CL_DEVICE_TYPE_DEFAULT, CL_DEVICE_TYPE_GPU,
CL_DEVICE_VENDOR_ID, CL_DEVICE_VERSION, CL_DRIVER_VERSION,
CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR,
CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_SIZE,
CL_MEM_USE_HOST_PTR, CL_MEM_WRITE_ONLY,
CL_PLATFORM_EXTENSIONS, CL_PLATFORM_NAME,
CL_PLATFORM_PROFILE, CL_PLATFORM_VENDOR,
CL_PLATFORM_VERSION, CL_PROGRAM_BUILD_LOG,
CL_PROGRAM_BUILD_OPTIONS, CL_PROGRAM_BUILD_STATUS,
CLERROR, CLRESULT, byref, c_char, c_char_p, c_size_t,
c_void_p, cl_bitfield, cl_bool, cl_char, cl_command_queue,
cl_command_queue_properties, cl_context,
cl_context_properties, cl_device_id, cl_device_info,
cl_device_type, cl_double, cl_event, cl_float, cl_half,
cl_int, cl_kernel, cl_long, cl_mem, cl_mem_info,
cl_platform_id, cl_platform_info, cl_program,
cl_program_build_info, cl_short, cl_uchar, cl_uint,
cl_ulong, cl_ushort, clBuildProgram, clCreateBuffer,
clCreateCommandQueue, clCreateContext,
clCreateKernelsInProgram, clCreateProgramWithSource,
clEnqueueCopyBuffer, clEnqueueFillBuffer,
clEnqueueNDRangeKernel, clEnqueueReadBuffer,
clEnqueueWriteBuffer, clFinish, clGetDeviceIDs,
clGetDeviceInfo, clGetMemObjectInfo, clGetPlatformIDs,
clGetPlatformInfo, clGetProgramBuildInfo,
clReleaseCommandQueue, clReleaseContext, clReleaseEvent,
clReleaseKernel, clReleaseMemObject, clReleaseProgram,
clSetKernelArg, clWaitForEvents, create_string_buffer,
ctypes, sizeof)

View file

@ -0,0 +1,6 @@
from .Buffer import Buffer
from .Device import (Device, cleanup_devices, get_available_devices_info,
get_best_device, get_default_device, get_device,
set_default_device)
from .DeviceInfo import DeviceInfo
from .Kernel import Kernel

View file

@ -0,0 +1,74 @@
from typing import List
import numpy as np
from ..AShape import AShape
from ..AAxes import AAxes
class BroadcastInfo:
"""
Broadcast info for multi shapes.
arguments
shapes List[AShape]
errors during the construction:
ValueError
Example:
shapes = ( 3,),
(3, 1,)
#Example | comment
o_shape #(3, 3) broadcasted together o_shape
br_shapes #(1, 3) shape[0] broadcasted to o_shape ndim
#(3, 1) shape[1] broadcasted to o_shape ndim
shapes_tiles #(3, 1) tiles to make o_shape from br_shapes[0]
#(1, 3) tiles to make o_shape from br_shapes[1]
shapes_reduction_axes #(0,) reduction_axes to make shapes[0] from o_shape
#(1,) reduction_axes to make shapes[1] from o_shape
"""
__slots__ = ['o_shape', 'br_shapes', 'shapes_tiles', 'shapes_reduction_axes']
def __init__(self, shapes : List[AShape]):
highest_rank = sorted([shape.ndim for shape in shapes])[-1]
# Broadcasted all shapes to highest ndim with (1,)-s in from left side
br_shapes = [ (1,)*(highest_rank-shape.ndim) + shape for shape in shapes ]
# Determine o_shape from all shapes
o_shape = AShape( np.max( [ [ br_shape.shape[axis] for br_shape in br_shapes] for axis in range(highest_rank) ], axis=1 ) )
shapes_tiles = []
shapes_reduction_axes = []
for br_shape in br_shapes:
shape_tile = []
shape_reduction_axes = []
for axis, (x,y) in enumerate(zip(br_shape.shape, o_shape.shape)):
if x != y:
if x == 1 and y != 1:
shape_tile.append(y)
shape_reduction_axes.append(axis)
elif x != 1 and y == 1:
shape_tile.append(1)
else:
raise ValueError(f'operands could not be broadcast together with shapes {br_shape} {o_shape}')
else:
shape_tile.append(1)
shapes_tiles.append(shape_tile)
shapes_reduction_axes.append( AAxes(shape_reduction_axes) )
self.o_shape : AShape = AShape(o_shape)
self.br_shapes : List[AShape] = br_shapes
self.shapes_tiles : List[List] = shapes_tiles
self.shapes_reduction_axes : List [AAxes] = shapes_reduction_axes

View file

@ -0,0 +1,62 @@
from ..AShape import AShape
class ConcatInfo:
__slots__ = ['o_shape', 'axis', 'axis_sizes', 'axis_offsets']
def __init__(self, shapes, axis):
"""
Concat info
arguments
shapes Iterable of shapes
errors during the construction:
ValueError
result
.o_shape AShape
.axis Int fixed axis argument
.axis_sizes List[Int] axis sizes for every shape in shapes
.axis_offsets List[Int] axis offset in o_shape
for every shape in shapes
"""
shapes = tuple(shapes)
if len(shapes) == 0:
raise ValueError('shapes is empty')
shape = shapes[0]
if axis < 0:
axis = shape.ndim + axis
if axis < 0 or axis >= shape.ndim:
raise ValueError(f'Wrong axis {axis}')
fixed_shapes = [ tuple(a for i,a in enumerate(shape) if i != axis) for shape in shapes ]
req_shape = fixed_shapes[0]
if not all(shape == req_shape for shape in fixed_shapes[1:]):
raise ValueError(f'All shapes must match shape {tuple(a if i != axis else "*" for i,a in enumerate(shape))}')
axis_sizes = [ shape[axis] for shape in shapes ]
axis_offset = 0
axis_offsets = []
for axis_size in axis_sizes:
axis_offsets.append(axis_offset)
axis_offset += axis_size
self.o_shape = AShape( tuple(shape)[0:axis] + (sum(axis_sizes),) + tuple(shape)[axis+1:] )
self.axis = axis
self.axis_sizes = axis_sizes
self.axis_offsets = tuple(axis_offsets)

View file

@ -0,0 +1,78 @@
from collections import Iterable
import math
class Conv2DInfo:
"""
Conv2DInfo
arguments
H, W int axes sizes
KH, KW int kernel sizes
stride int
dilation int
padding 'valid' no padding
'same' output size will be the same
or divided by stride
int padding value for all sides
Iterable of 4 ints
paddings for left,top,right,bottom sides
errors during the construction:
ValueError
result:
.PADL .PADR paddings for W axis
.PADT .PADB paddings for H axis
.OH .OW result axes
.OH_T .OW_T result transposed axes.
it is None if padding != 'valid','same'
"""
__slots__ = ['PADL', 'PADR', 'PADT', 'PADB', 'OH', 'OW', 'OH_T', 'OW_T']
def __init__(self, H, W, KH, KW, stride, dilation, padding):
# Effective kernel sizes with dilation
EKH = (KH-1)*dilation + 1
EKW = (KW-1)*dilation + 1
# Determine pad size of sides
OH_T = OW_T = None
if padding == 'valid':
PADL = PADT = PADR = PADB = 0
OH_T = H * stride + max(EKH - stride, 0)
OW_T = W * stride + max(EKW - stride, 0)
elif padding == 'same':
PADL = int(math.floor((EKW - 1)/2))
PADT = int(math.floor((EKH - 1)/2))
PADR = int(math.ceil((EKW - 1)/2))
PADB = int(math.ceil((EKH - 1)/2))
OH_T = H * stride
OW_T = W * stride
elif isinstance(padding, int):
PADL = PADT = PADR = PADB = padding
elif isinstance(padding, Iterable):
padding = tuple(int(x) for x in padding)
if len(padding) != 4:
raise ValueError("Invalid paddings list length.")
PADL, PADT, PADR, PADB = padding
else:
raise ValueError("Invalid padding value.")
self.PADL = PADL
self.PADT = PADT
self.PADR = PADR
self.PADB = PADB
self.OH = max(1, int((H + PADT + PADB - EKH) / stride + 1) )
self.OW = max(1, int((W + PADL + PADR - EKW) / stride + 1) )
self.OH_T = OH_T
self.OW_T = OW_T

View file

@ -0,0 +1,56 @@
from typing import List
import numpy as np
from ..AShape import AShape
class PadInfo:
"""
Pad info.
arguments
shape AShape
axes_paddings list of (l_pad, r_pad)
if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes
if [-1] == ... , same for ride-side
errors during the construction:
ValueError
result:
.o_shape AShape
"""
__slots__ = ['o_shape','axes_paddings']
def __init__(self, shape, axes_paddings : List):
if Ellipsis in axes_paddings:
if sum(1 if x == Ellipsis else 0 for x in axes_paddings) > 1:
raise ValueError('only 1 ...(ellipsis) allowed in axes_paddings')
if axes_paddings[0] == Ellipsis:
axes_paddings = ((0,0),)*(shape.ndim-(len(axes_paddings)-1))+ axes_paddings[1:]
elif axes_paddings[-1] == Ellipsis:
axes_paddings = axes_paddings[:-1] + ((0,0),)*(shape.ndim-(len(axes_paddings)-1))
else:
raise ValueError('...(ellipsis) must be at the begin or the end of axes_paddings')
if len(axes_paddings) != shape.ndim:
raise ValueError(f'axes_paddings should match shape.ndim {shape.ndim}')
self.axes_paddings = axes_paddings
o_shape = []
for axis, (axis_size, (l_pad, r_pad)) in enumerate(zip(shape, axes_paddings)):
new_axis_size = l_pad + axis_size + r_pad
o_shape.append(new_axis_size)
self.o_shape = AShape(o_shape)

View file

@ -0,0 +1,50 @@
from ..AShape import AShape
class ReductionInfo:
"""
Reduction info
arguments
shape AShape
axes AAxes
keepdims bool
can raise ValueError, TypeError during the construction
"""
__slots__ = [
'reduction_axes', # sorted reduction AAxes
'o_axes', # remain AAxes after reduction
'o_shape', # result AShape of reduction
'o_shape_kd', # result AShape of reduction with keepdims
]
def __init__(self, shape, axes, keepdims):
shape_axes = shape.axes_arange()
if axes.is_none_axes():
axes = shape_axes
# Check correctness of axes
for axis in axes:
if axis not in shape_axes:
raise ValueError(f'Wrong axis {axis} not in {shape_axes}')
self.reduction_axes = reduction_axes = axes.sorted()
# Output axes. Remove axes from shape_axes
self.o_axes = o_axes = shape_axes - axes
if o_axes.is_none_axes():
o_shape = AShape( (1,) )
else:
o_shape = shape[o_axes]
self.o_shape = o_shape
self.o_shape_kd = AShape( 1 if axis in reduction_axes else shape[axis] for axis in range(shape.ndim))
if keepdims:
self.o_shape = self.o_shape_kd

View file

@ -0,0 +1,44 @@
from ..AShape import AShape
class ReshapeInfo:
"""
Reshape info.
can raise ValueError,TypeError during the construction
arguments
shape AShape
target_shape Iterable of ints
can be any len and contains only one '-1'
Example
shape (2, 512, 8, 8, 64)
target_shape (2, 512, -1)
o_shape (2, 512, 4096)
"""
__slots__ = ['o_shape']
def __init__(self, shape, target_shape):
o_shape = []
remain_size = shape.size
unk_axis = None
for t_size in target_shape:
t_size = int(t_size)
if t_size != -1:
mod = remain_size % t_size
if mod != 0:
raise ValueError(f'Cannot reshape {shape} to {target_shape}.')
remain_size /= t_size
else:
if unk_axis is not None:
raise ValueError('Can specify only one unknown dimension.')
unk_axis = len(o_shape)
o_shape.append( t_size )
if unk_axis is not None:
o_shape[unk_axis] = int(remain_size)
self.o_shape = AShape(o_shape)

View file

@ -0,0 +1,148 @@
import math
import numpy as np
from ..AShape import AShape
class SliceInfo:
__slots__ = ['o_shape', 'o_shape_kd', 'just_reshaped','axes_bes','axes_abs_bes']
def __init__(self, shape : AShape, slices):
"""
Slice info.
can raise ValueError,TypeError during the construction
arguments
slices
Example
o_shape result shape after slice
axes_bes list of (begin,step,end) per axis
if s==0, then single axis element is fetched from position b
s can be negative.
"""
# Validate slices argument for given shape.
new_slices = []
before_ellipsis = None
for s in slices:
if s is Ellipsis:
before_ellipsis = new_slices
new_slices = []
continue
elif s is not None and not isinstance(s, (int,tuple) ):
raise ValueError(f'unknown slice argument {s} of type {s.__class__}')
new_slices.append(s)
if before_ellipsis is not None:
# Process Ellipsis separator
new_slices_n_axes = sum([ 1 for x in new_slices if x != None])
before_ellipsis_n_axes = sum([ 1 for x in before_ellipsis if x != None])
# Expand slices by filling intermediate (None,None,None) for each remaining axis
new_slices = before_ellipsis + \
[(None,None,None)]*max(0, shape.ndim-before_ellipsis_n_axes-new_slices_n_axes) + \
new_slices
new_slices_n_axes = sum([ 1 for x in new_slices if x != None])
if new_slices_n_axes > shape.ndim:
raise ValueError('slices arguments more than shape axes')
elif new_slices_n_axes < shape.ndim:
# Fill remaining axes
new_slices += [(None,None,None)]*( shape.ndim - new_slices_n_axes )
slices = tuple(new_slices)
# Compute shapes
output_is_reshaped = True # Flag determines that output_tensor
# can be just reshaped without any computation
o_shape = [] # output tensor shape
o_shape_kd = [] # output shape used in kernel, must match input shape
axes_bes = []
axes_abs_bes = []
i_axis = 0
# Process slices arguments
for v in slices:
if v is None:
# None is new axis
# We can add unlimited number of (1,) axes at any place of shape
o_shape.append(1)
continue
i_axis_size = shape[i_axis]
i_axis += 1
if isinstance(v, int):
if v < 0:
v += i_axis_size
if v < 0 or v >= i_axis_size:
raise ValueError(f'index {v} is out of bounds for axis {i_axis} with size {i_axis_size}')
b,e,s = v,v,0
else:
b,e,s = v
if s == 0:
raise ValueError(f'slice step cannot be zero')
# Fix begin, end, step values
if s is None:
s = 1
if b is None:
b = 0 if s >= 0 else i_axis_size-1
if e is None:
e = i_axis_size if s >= 0 else -1
elif e < 0:
e += i_axis_size
if b < 0:
b += i_axis_size
if s >= 0:
b = np.clip(b, 0, i_axis_size)
e = np.clip(e, 0, i_axis_size)
if b > e:
raise ValueError('for positive step, begin cannot be > end.')
abs_b, abs_e, abs_s = b,e,s
else:
b = np.clip(b, 0, i_axis_size-1)
e = np.clip(e, -1, i_axis_size)
if b <= e:
raise ValueError('for negative step, begin cannot be <= end.')
abs_s = -s
abs_e = b + 1
abs_b = b - (math.ceil( (b-e) / abs_s ) -1) * abs_s
# for every o_shape_kd axis
# we have exact begin,step values to fetch value from input
axes_bes.append( (b,e,s))
axes_abs_bes.append( (abs_b, abs_e, abs_s))
if i_axis_size != 1 and not (b == 0 and e == i_axis_size and s == 1):
# Such params of axis slice will change input, thus output cannot be as just reshaped input
output_is_reshaped = False
# Compute output_axis_size based on begin,end,step
o_axis_size = max(0, math.ceil ( (e-b) / (s if s != 0 else 1) ) )
if o_axis_size >= 1:
# >= 1 : select range of indexes, axis will remain
o_shape.append(o_axis_size)
# ^ othwerwise axis will be supressed
# o_shape with keepdims, must match ndim of input shape
o_shape_kd.append( max(1,o_axis_size) )
self.just_reshaped = output_is_reshaped
self.o_shape = AShape(o_shape)
self.o_shape_kd = AShape(o_shape_kd)
self.axes_bes = axes_bes
self.axes_abs_bes = axes_abs_bes

View file

@ -0,0 +1,37 @@
from ..AShape import AShape
class StackInfo:
__slots__ = ['o_shape', 'axis']
def __init__(self, shape, axis, stack_count):
"""
Stack info
arguments
shape AShape
axis Int
stack_count Int
errors during the construction:
ValueError
result:
.o_shape AShape
.axis Int positive axis argument
"""
if axis < 0:
axis = shape.ndim + 1 + axis
if axis < 0 or axis > shape.ndim:
raise ValueError(f'Wrong axis {axis}')
if stack_count <= 0:
raise ValueError(f'Invalid stack_count {stack_count}')
self.o_shape = AShape( tuple(shape)[0:axis] + (stack_count,) + tuple(shape)[axis:] )
self.axis = axis

View file

@ -0,0 +1,52 @@
import numpy as np
from ..AShape import AShape, AShape
class TileInfo:
"""
Tile info.
arguments
shape AShape
tiles Iterable of ints
errors during the construction:
ValueError
result:
.o_shape AShape
.axes_slices list of slice() to fetch original shape
from o_shape for each tile
"""
__slots__ = ['o_shape', 'axes_slices']
def __init__(self, shape, tiles):
if len(tiles) != shape.ndim:
raise ValueError(f'tiles should match shape.ndim {shape.ndim}')
self.o_shape = AShape(dim*tiles[i] for i,dim in enumerate(shape))
c = [0]*shape.ndim
axes_offsets = []
for n in range(np.prod(tiles)):
axes_offsets.append( c.copy() )
for i in range(shape.ndim-1,-1,-1):
c[i] += 1
if c[i] < tiles[i]:
break
c[i] = 0
axes_slices = []
for axes_offset in axes_offsets:
sl = []
for axis,tile in enumerate(axes_offset):
axis_size = shape[axis]
sl.append( slice(axis_size*tile, axis_size*(tile+1)) )
axes_slices.append(tuple(sl))
self.axes_slices = tuple(axes_slices)

View file

@ -0,0 +1,37 @@
from ..AShape import AShape
from ..AAxes import AAxes
class TransposeInfo:
"""
TransposeInfo
arguments
shape AShape
axes_order AAxes
errors during the construction:
ValueError
result
.o_shape AShape
.no_changes bool transpose changes nothing
"""
__slots__ = ['no_changes', 'o_shape']
def __init__(self, shape : AShape, axes_order : AAxes):
if shape.ndim != axes_order.ndim:
raise ValueError('axes must match the shape')
# Axes order changes nothing?
self.o_shape = shape[axes_order]
self.no_changes = axes_order == shape.axes_arange()

View file

@ -0,0 +1,10 @@
from .BroadcastInfo import BroadcastInfo
from .ConcatInfo import ConcatInfo
from .PadInfo import PadInfo
from .ReductionInfo import ReductionInfo
from .ReshapeInfo import ReshapeInfo
from .SliceInfo import SliceInfo
from .StackInfo import StackInfo
from .TileInfo import TileInfo
from .TransposeInfo import TransposeInfo
from .Conv2DInfo import Conv2DInfo

View file

@ -0,0 +1,89 @@
import numpy as np
from ..backend import Kernel
from ..HKernel import HKernel
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .Initializer import Initializer
class InitCoords2DArange(Initializer):
def __init__(self, h_start, h_stop, w_start, w_stop):
"""
Initialize (...,H,W,D) tensor with coords arange
D == 2(x,y) or 3 (x,y,1)
arguments
h_start float height start value (inclusive)
h_stop float height stop value (inclusive)
w_start float width start value (inclusive)
w_stop float width stop value (inclusive)
"""
super().__init__()
self._h_start = h_start
self._h_stop = h_stop
self._w_start = w_start
self._w_stop = w_stop
def initialize_tensor(self, tensor : Tensor):
shape = tensor.shape
dtype = tensor.dtype
if shape.ndim < 3:
raise ValueError(f'tensor.shape.ndim must be >= 3 (...,H,W,D)')
OH,OW,OD = shape[-3:]
if OD not in [2,3]:
raise ValueError(f'last dim D {OD} must == 2 or 3')
if OH > 1:
h_step = (self._h_stop-self._h_start) / (OH-1)
else:
h_step = 0
if OW > 1:
w_step = (self._w_stop-self._w_start) / (OW-1)
else:
w_step = 0
key = (InitCoords2DArange, dtype)
kernel = SCacheton.get_var(key)
if kernel is None:
kernel = Kernel(kernel_text=f"""
{HKernel.define_tensor_type('O', tensor.dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME , float h_start, float h_step
, float w_start, float w_step,
uint O0, uint O1, uint O2)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', 3)}
O_TYPE v;
if (o2 == 0)
v = w_start+o1*w_step;
else
if (o2 == 1)
v = h_start+o0*h_step;
else
v = 1;
O_GLOBAL_STORE(gid, v);
}}
""")
SCacheton.set_var(key, kernel)
tensor.get_device().run_kernel( kernel, tensor.get_buffer(),
np.float32(self._h_start), np.float32(h_step),
np.float32(self._w_start), np.float32(w_step),
np.uint32(OH), np.uint32(OW), np.uint32(OD),
global_shape=(shape.size,) )
def __str__(self): return f'CoordsArange'

View file

@ -0,0 +1,62 @@
import numpy as np
from ..backend import Kernel
from ..HKernel import HKernel
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .Initializer import Initializer
class InitRandomUniform(Initializer):
def __init__(self, low=0, high=1):
"""
arguments
low(0) low value
high(1) high value (exclusive)
"""
super().__init__()
self._low = low
self._high = high
def initialize_tensor(self, tensor : Tensor):
key = (InitRandomUniform, self._low, self._high, tensor.dtype)
kernel = SCacheton.get_var(key)
if kernel is None:
hl = self._high-self._low
l = self._low
if tensor.dtype in [np.bool_,np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32]:
gen_expression = f'hash_uint_from_uint(gid+seed32) % {int(hl)} + {int(l)}'
elif tensor.dtype in [np.int64]:
gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
elif tensor.dtype in [np.uint64]:
gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
elif tensor.dtype in [np.float16, np.float32]:
gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}'
elif tensor.dtype in [np.float64]:
gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}'
kernel = Kernel(kernel_text=f"""
{HKernel.include_hash()}
{HKernel.define_tensor('O', (tensor.shape.size,), tensor.dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, uint seed32, ulong seed64)
{{
size_t gid = get_global_id(0);
O_GLOBAL_STORE(gid, {gen_expression} );
}}
""")
SCacheton.set_var(key, kernel)
tensor.get_device().run_kernel( kernel, tensor.get_buffer(),
np.uint32(np.random.randint(np.iinfo(np.uint32).max, dtype=np.uint32)),
np.uint64(np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64)),
global_shape=(tensor.shape.size,) )
def __str__(self): return f'InitRandomUniform low={self._low}, high={self._high}'

View file

@ -0,0 +1,17 @@
from ..Tensor import Tensor
class Initializer:
"""
Base class for tensor inititalizers
"""
def initialize_tensor(self, tensor : Tensor):
"""
Implement initialization of tensor
You can compute the data using python and then call buffer.set(numpy_value)
or call kernel.run( tensor.get_device(), tensor.get_buffer, ...) to initialize the data using OpenCL
"""
raise NotImplementedError()
def __str__(self): return 'Initializer'
def __repr__(self): return self.__str__()

View file

@ -0,0 +1,3 @@
from .InitCoords2DArange import InitCoords2DArange
from .Initializer import Initializer
from .InitRandomUniform import InitRandomUniform

View file

@ -0,0 +1,21 @@
from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .binary_morph import binary_morph
from .cast import cast
from .concat import concat
from .depthwise_conv2D import depthwise_conv2D
from .gaussian_blur import gaussian_blur
from .matmul import matmul, matmulc
from .pad import pad
from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
reduce_sum, reduce_variance)
from .remap import remap
from .remap_np_affine import remap_np_affine
from .reshape import reshape
from .slice_ import slice_
from .slice_set import slice_set
from .stack import stack
from .tile import tile
from .transpose import transpose
from .warp_affine import warp_affine

View file

@ -0,0 +1,111 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def any_wise(op_text : str,
*args,
dtype : np.dtype = None,
output_t:Tensor=None) -> Tensor:
"""
operator for N-wise ops with N inputs
arguments
op_text example: O=(2*I0*I1)+I2
*args List[ Tensor | number ]
dtype
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size.
"""
HArgs.check_zero_get_length(args)
tensor_args = HArgs.filter_tensor(args, raise_on_empty=True)
device = HArgs.check_get_same_device(tensor_args)
shape_list, dtype_list, krn_args = HArgs.decompose(args)
op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
if output_t is None:
output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args)
return output_t
class _AnyWiseOp:
def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
if len(shape_list) != len(dtype_list):
raise ValueError('len(shape_list) != len(dtype_list)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
if len(shape_list) == 1:
# element-wise.
i_shape, i_dtype = shape_list[0], dtype_list[0]
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('IN', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
{{
size_t gid = get_global_id(0);
O_TYPE O = O_GLOBAL_LOAD(gid);
IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
else:
# Multi arg.
self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
self.o_shape = o_shape = info.o_shape
defs, arg_defs, impls = [], [], []
for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
t_name = f'I{i}'
if t_shape is not None:
defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
else:
arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{defs}
{HKernel.define_tensor('O', o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
{impls}
O_TYPE O;
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t)
def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t)
def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t)
def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t)
def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def square(a_t : Tensor) -> Tensor: return any_wise('O=I0*I0', a_t)
def sqrt(a_t : Tensor) -> Tensor: return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t)

View file

@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def binary_dilate_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
"""
Binary dilate operator using circle kernel with radius.
input_t Tensor (...,H,W)
per-element of H,W, set 1 if any neighbor elements inside circle with radius != 0.
otherwise set 0.
"""
op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)
device = input_t.get_device()
if radius <= 0 or iterations <= 0:
return input_t.copy()
else:
for i in range(iterations):
if i == 0:
buf_in = input_t
else:
buf_in, buf_out = buf_out, buf_in
if i <= 1:
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
return buf_out
class _BinaryDilateOp():
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
KS = radius*2+1
IH,IW = i_shape[-2:]
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define RADIUS {radius}
#define KS {KS}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
{'#pragma unroll' if KS <= 16 else ''}
for (int kh=0; kh<KS; ++kh)
{'#pragma unroll' if KS <= 16 else ''}
for (int kw=0; kw<KS; ++kw)
{{
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
{{
int im2 = -PADT + kh + om2;
int im1 = -PADL + kw + om1;
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
: 0;
if (i_val != (I_TYPE)0)
{{
O_GLOBAL_STORE(gid, (O_TYPE) 1);
return;
}}
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
}}
""")

View file

@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def binary_erode_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
"""
Binary erode operator using circle kernel with radius.
input_t Tensor (...,H,W)
per-element of H,W, set 1 if all neighbor elements inside circle with radius != 0.
otherwise set 0.
"""
op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype)
device = input_t.get_device()
if radius <= 0 or iterations <= 0:
return input_t.copy()
else:
for i in range(iterations):
if i == 0:
buf_in = input_t
else:
buf_in, buf_out = buf_out, buf_in
if i <= 1:
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
return buf_out
class _BinaryErodeOp():
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
KS = radius*2+1
IH,IW = i_shape[-2:]
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define RADIUS {radius}
#define KS {KS}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
{'#pragma unroll' if KS <= 16 else ''}
for (int kh=0; kh<KS; ++kh)
{'#pragma unroll' if KS <= 16 else ''}
for (int kw=0; kw<KS; ++kw)
{{
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
{{
int im2 = -PADT + kh + om2;
int im1 = -PADL + kw + om1;
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
: 0;
if (i_val == (I_TYPE)0)
{{
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
return;
}}
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) 1 );
}}
""")

View file

@ -0,0 +1,43 @@
from ..Tensor import Tensor
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .gaussian_blur import gaussian_blur
from .pad import pad
def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor:
"""
Apply optional binary erode/dilate and optional blur.
input_t (...,H,W) tensor. Non zero values will be treated as 1.
erode_dilate int >= 0 amount of pixels to dilate
blur float >= 0 amount of pixels to blur
fade_to_border(False) clip the image in order
to fade smoothly to the border with specified blur amount
"""
x = input_t
H,W = input_t.shape[-2:]
x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)
if erode_dilate > 0:
x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
elif erode_dilate < 0:
x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )
if fade_to_border:
h_clip_size = H + blur // 2
w_clip_size = W + blur // 2
x[...,:h_clip_size,:] = 0
x[...,-h_clip_size:,:] = 0
x[...,:,:w_clip_size] = 0
x[...,:,-w_clip_size:] = 0
if blur > 0:
x = gaussian_blur(x, blur * 0.250, dtype=dtype)
return x[...,H:-H,W:-W]

View file

@ -0,0 +1,17 @@
from ..Tensor import Tensor
from .any_wise import any_wise
def cast(input_t : Tensor, dtype, output_t:Tensor=None) -> Tensor:
"""
cast operator
arguments
input_t
dtype
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size.
"""
return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)

View file

@ -0,0 +1,70 @@
from ..backend import Kernel
from ..HArgs import HArgs
from ..HType import HType
from ..HKernel import HKernel
from ..info import ConcatInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments
tensor_list Iterable
axis Int
dtype np.dtype
output_t compute result to this Tensor.
Tensor may be with different shape,
but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
tensor_list = tuple(tensor_list)
HArgs.check_zero_get_length(tensor_list)
HArgs.check_all_tensors(tensor_list)
device = HArgs.check_get_same_device(tensor_list)
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
for forward_krn,t in zip(op.forward_krns,tensor_list):
device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) )
return output_t
class _ConcatOp:
def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = ConcatInfo(shape_list, axis)
self.forward_krns = forward_krns = []
for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)):
forward_krn = Kernel(f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', shape, dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)}
i{info.axis} += {info.axis_offsets[i]};
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) );
}}
""")
forward_krns.append(forward_krn)

View file

@ -0,0 +1,107 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo, Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def depthwise_conv2D (input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None):
"""
Depthwise Conv2D operator.
input_t Tensor (...,H,W)
kernel_t Tensor (...,H,W)
stride(1) int
dilation(1) int
padding(same) 'valid' no padding
'same' output size will be the same
or divided by stride
int padding value for all sides
Iterable of 4 ints
paddings for left,top,right,bottom sides
...-head part of shapes will be broadcasted to each other
"""
op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())
return output_t
class _DepthwiseConv2DOp():
def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
if k_shape.ndim < 2:
raise ValueError(f'k_shape.ndim must be >= 2')
IH,IW = i_shape[-2:]
KH,KW = k_shape[-2:]
ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)
if i_shape.ndim == 2 and k_shape.ndim == 2:
# nothing to broadcast
i_br_shape = i_shape
k_br_shape = k_shape
o_shape = AShape([ci.OH, ci.OW])
else:
op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])
i_br_shape = op.br_shapes[0] + i_shape[-2:]
k_br_shape = op.br_shapes[1] + k_shape[-2:]
o_shape = op.o_shape + [ci.OH, ci.OW]
self.o_shape = o_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('K', k_br_shape, k_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define STRIDE {stride}
#define DILATION {dilation}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
float v = 0.0;
{'#pragma unroll' if KH <= 9 else ''}
for (int km2=0; km2<Km2; ++km2)
{{
int im2 = -PADT + km2*DILATION + om2*STRIDE;
if (im2 >= 0 & im2 < Im2)
{'#pragma unroll' if KW <= 9 else ''}
for (int km1=0; km1<Km1; ++km1)
{{
int im1 = -PADL + km1*DILATION + om1*STRIDE;
if (im1 >= 0 & im1 < Im1)
v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
*K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) v);
}}
""")

View file

@ -0,0 +1,39 @@
import numpy as np
from ..Tensor import Tensor
from .depthwise_conv2D import depthwise_conv2D
def gaussian_blur (input_t : Tensor, sigma, dtype=None) -> Tensor:
"""
arguments
input_t Tensor(...,H,W)
sigma float
"""
if sigma <= 0.0:
return input_t.copy() #TODO
device = input_t.get_device()
key = (gaussian_blur, sigma)
kernel_t = device.get_cached_data(key)
if kernel_t is None:
kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
device.set_cached_data(key, kernel_t)
output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
return output_t
def _make_gaussian_kernel(sigma : float, dtype):
kernel_size = max(3, int(2 * 2 * sigma))
if kernel_size % 2 == 0:
kernel_size += 1
mean = np.floor(0.5 * kernel_size)
kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)])
np_kernel = np.outer(kernel_1d, kernel_1d)
kernel = np_kernel / np.sum(np_kernel)
return kernel.astype(dtype)

View file

@ -0,0 +1,158 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def matmul(a_t : Tensor, b_t : Tensor, output_t: Tensor=None, is_add_to_output=False) -> Tensor:
"""
matmul operator in row-major format
A(...,M,K) x
B(...,K,N) =
(...,M,N)
arguments
output_t compute result to this Tensor.
Tensor may be with different shape,
but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output)
def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
"""
matmul operator in col-major format
A(...,K,M) x
B(...,N,K) =
(...,N,M)
arguments
output_t compute result to this Tensor.
Tensor may be with different shape,
but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
device = HArgs.check_get_same_device([a_t, b_t])
op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=device )
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer(), )
return output_t
class _MatmulOp:
def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output):
a_dtype = np.dtype(a_dtype).type
b_dtype = np.dtype(b_dtype).type
if a_dtype != np.float32 or b_dtype != np.float32:
raise ValueError('matmul works only with float32 tensors.')
if a_shape.ndim != b_shape.ndim:
raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}')
ndim = a_shape.ndim
if ndim < 2:
raise ValueError('Tensors ndim must be at least 2.')
K, M = a_shape[-2], a_shape[-1]
N, B_COLS = b_shape[-2], b_shape[-1]
if K != B_COLS:
raise ValueError('A_ROWS != B_COLS')
BATCH = a_shape[0:-2].size
B_BATCH = b_shape[0:-2].size
if BATCH != B_BATCH:
raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}')
if ndim == 2:
self.o_shape = AShape( (N, M) )
else:
self.o_shape = AShape( a_shape[:-2]+(N, M) )
self.o_dtype = np.float32
self.M = M
self.N = N
self.K = K
# Determining optimal tile widths
for MW in [8,4,2,1]:
if M % MW == 0:
break
for KW in [8,4,2,1]:
if N % KW == 0 and K % KW == 0:
break
NW = KW
self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f"""
#define K {K}
#define N {N}
#define MW {MW} // M tile Width
#define NW {NW} // N tile Width -- NW & KW should be the same !
#define KW {KW} // K tile Width
#define MT {M//MW} // MT is max for 'mt' (M tile count)
#define KT {K//KW} // KT is max for 'kt' (K tile count)
#define floatMW { f'float{MW}' if MW != 1 else 'float'}
#define floatKW { f'float{KW}' if KW != 1 else 'float'}
__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
{{
size_t mt = get_global_id(0); //global M-tile id
size_t nc = get_global_id(1); //global N-tile id
size_t batch = get_global_id(2);
float AT[KW][MW]; // sub tiles
float BT[NW][KW];
float CT[NW][MW];
#pragma unroll
for (uint i=0; i<NW*MW; ++i) // zero CT tile
((float*) CT)[i] = 0.0;
for (uint kt=0; kt<KT; ++kt) // iterate over K-dim tiles
{{
#pragma unroll
for (uint k=0; k<KW; ++k) // every k-element inside K-dim tile
*( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats
#pragma unroll
for (uint n=0; n<NW; ++n) // every n-element inside N-dim tile
*( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats
#pragma unroll
for (uint k=0; k<KW; ++k)
#pragma unroll
for (uint n=0; n<NW; ++n) // sub tiles multiplication
#pragma unroll
for (uint m=0; m<MW; ++m)
CT[n][m] += AT[k][m] * BT[n][k];
}}
#pragma unroll
for (uint n=0; n<NW; ++n)
O[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='}
*( (floatMW*) CT[n]);
}}""")

View file

@ -0,0 +1,71 @@
from typing import List
import numpy as np
from ..HType import HType
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import PadInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def pad(input_t : Tensor, axes_paddings : List, mode : str = 'constant', constant_value=0, dtype : np.dtype = None, output_t : Tensor=None) -> Tensor:
"""
arguments:
axes_paddings list of (l_pad, r_pad),
if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes
if [-1] == ... , same for ride-side
dtype cast to dtype
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size
"""
op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _PadOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
_allow_modes = ['constant']
if mode not in _allow_modes:
raise ValueError(f'Allowed pads modes: {_allow_modes}')
if mode == 'constant':
if not HType.is_scalar_type(constant_value):
raise ValueError('constan_value must be scalar')
info = PadInfo(i_shape, axes_paddings)
self.o_shape = o_shape = info.o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})' for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
else
O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
//O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX_MOD({ ','.join(f' I{i} + ( (o{i}-{l_pad})*( ((o{i}-{l_pad})/I{i}) % 2 == 0 ? 1: -1) ) % I{i} ' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
}}""")
#print(self.forward_krn)

View file

@ -0,0 +1,214 @@
import math
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import ReductionInfo, TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .slice_ import slice_
from .transpose import transpose
from .any_wise import square, sqrt
def reduce_mean (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce mean operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_std(input_t, axes=None, keepdims=False):
"""
Reduce std operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return sqrt(reduce_variance(input_t, axes, keepdims))
def reduce_variance(input_t, axes=None, keepdims=False):
"""
Reduce variance operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims=True)
return reduce_mean(square(input_t - mean), axes, keepdims)
def moments(input_t, axes=None, keepdims=False):
"""
Returns (mean, variance) of input_t
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims)
mean_shape_keepdims = mean._op.info.o_shape_kd
var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
return mean, var
def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce min operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_max (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce max operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_sum (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce sum operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_op (op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False):
"""
arguments
op_type 'sum' 'mean' 'min' 'max'
output_t compute result to this Tensor.
Tensor may be with different shape,
but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims)
if output_t is None:
output_t = Tensor ( op.info.o_shape, input_t.dtype, device=input_t.get_device() )
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
# Make an intermediate tensor
input_t_inter = transpose(input_t, op.intermediate_transpose_axes)
# Perform multistage inplace operation in intermediate tensor
for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)):
input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS),
global_shape=shape)
if op_type == 'mean':
# divide values in ROWS by number of COLS
input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) )
# Fetch final tensor from zero indexes using slices argument
slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output)
return output_t
class _ReduceOp:
def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False):
self.op_type = op_type
self.info = info = ReductionInfo(i_shape, axes, keepdims)
# Determine transpose order for intermediate tensor, where reduction axes will be at the end
self.intermediate_transpose_axes = info.o_axes + info.reduction_axes
self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape
# slices argument to fetch processed tensor from zero indexes
self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim
# COLS are reduction axes, ROWS are remaining axes
rows_ndim = info.o_axes.ndim
self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size
self.COLS = COLS = self.intermediate_shape[rows_ndim:].size
# Number of stages to operate COLS
n_stages = (COLS-1).bit_length()
self.forward_krn_shapes = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ]
self.forward_krn_stage_cols = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage ) ) for stage in range(n_stages) ]
self.forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS)
{{
size_t gid = get_global_id(0);
size_t col = gid % STAGE_COLS;
size_t row = gid / STAGE_COLS;
size_t i_idx = row*COLS + col;
size_t other_col = col + STAGE_COLS;
if (other_col < STAGE_VALID_COLS)
{{
I_TYPE val_a = I_GLOBAL_LOAD(i_idx);
I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col);
{'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else
'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else
'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else ''
}
I_GLOBAL_STORE(i_idx, val_x);
}}
}}
""")
self.mean_div_forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS)
{{
size_t row = get_global_id(0);
I_GLOBAL_STORE(row*COLS, I_GLOBAL_LOAD(row*COLS) / COLS );
}}
""")

View file

@ -0,0 +1,103 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def remap (input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor:
"""
remap input_t in spatial axes using coords_t
arguments
input_t Tensor( ...,IH,IW )
coords_t Tensor( ...,OH,OW,D )
OH - output height
OW - output width
D is (2)[x,y] coords
dtype
...-head part of shapes will be broadcasted to each other
"""
op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer())
return output_t
class _RemapOp():
def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if np.dtype(c_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of c_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
if c_shape.ndim < 3:
raise ValueError(f'Coords shape ndim must be >= 3(...,H,W,D)')
if c_shape[-1] != 2:
raise ValueError('Last coords dim must be == 2 (x,y)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim == 2 and c_shape.ndim == 3:
# nothing to broadcast
i_br_shape = i_shape
c_br_shape = c_shape
o_shape = c_shape[-3:-1]
else:
op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ])
i_br_shape = op.br_shapes[0] + i_shape[-2:]
c_br_shape = op.br_shapes[1] + c_shape[-3:]
o_shape = op.o_shape + c_shape[-3:-1]
self.o_shape = o_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)}));
float cx01 = (float) c_value.x;
float cy01 = (float) c_value.y;
float cx0f = floor(cx01); int cx0 = (int)cx0f;
float cy0f = floor(cy01); int cy0 = (int)cy0f;
float cx1f = cx0f+1; int cx1 = (int)cx1f;
float cy1f = cy0f+1; int cy1 = (int)cy1f;
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")

View file

@ -0,0 +1,96 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor:
"""
remap affine operator for all channels using single numpy affine mat
arguments
input_t Tensor (...,H,W)
affine_n np.array (2,3)
dtype
"""
if affine_n.shape != (2,3):
raise ValueError('affine_n.shape must be (2,3)')
op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
((a, b, c),
(d, e, f)) = affine_n
if not inverse:
# do inverse by default, match cv2.warpAffine behaviour
D = a*e - b*d
D = 1.0 / D if D != 0.0 else 0.0
a, b, c, d, e, f = ( e*D, -b*D, (b*f-e*c)*D ,
-d*D, a*D, (d*c-a*f)*D )
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )
return output_t
class _RemapAffineOp():
def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
IH,IW = i_shape[-2:]
if o_size is not None:
OH,OW = o_size
else:
OH,OW = IH,IW
o_shape = AShape( (OH,OW) )
if i_shape.ndim > 2:
o_shape = i_shape[:-2] + o_shape
self.o_shape = o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
float a, float b, float c,
float d, float e, float f)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
float cx01 = om1*a + om2*b + c;
float cy01 = om1*d + om2*e + f;
float cx0f = floor(cx01); int cx0 = (int)cx0f;
float cy0f = floor(cy01); int cy0 = (int)cy0f;
float cx1f = cx0f+1; int cx1 = (int)cx1f;
float cy1f = cy0f+1; int cy1 = (int)cy1f;
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")

View file

@ -0,0 +1,26 @@
from typing import Iterable
from ..Tensor import Tensor
from ..SCacheton import SCacheton
from ..info import ReshapeInfo
def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
"""
reshape operator
arguments
new_shape Iterable of ints
copy(True) if True, produces new Tensor
otherwise result tensor points to the same memory
Produces reference Tensor with new shape.
"""
info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )
if copy:
return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
return input_t.as_shape( info.o_shape )

View file

@ -0,0 +1,67 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments:
input_t input tensor
slices argument received from class.__getitem__(slices)
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
Remark.
Slicing logic is not the same as numpy:
For example np[2:0:1] slice will produce invalid array with zero index,
but nn.slice() will select 2 index, same as val_t[2].
"""
op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output )
o_shape = op.slice_info.o_shape
if output_t is None:
if op.slice_info.just_reshaped:
return input_t.reshape(o_shape)
else:
output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != o_shape.size:
raise ValueError(f'output_t must have size {o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _SliceOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output):
self.slice_info = slice_info = SliceInfo(i_shape, slices)
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f"""
{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)}
{chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes) ) }
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) );
}}
""")

View file

@ -0,0 +1,73 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo, SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def slice_set(input_t : Tensor, slices, value) -> Tensor:
"""
arguments:
input_t input tensor
slices argument received from class.__getitem__(slices)
value
Remark.
"""
if HType.is_scalar_type(value):
v_shape = None
v_dtype = None
v_scalar = value
elif not isinstance(value, Tensor):
value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device())
v_shape = value.shape
v_dtype = value.dtype
v_scalar = None
op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) )
if v_scalar is not None:
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() )
else:
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() )
return input_t
class _SliceSetOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices):
slice_info = SliceInfo(i_shape, slices)
if v_scalar is None:
if v_shape.ndim > i_shape.ndim:
raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}')
# Check that v_shape can broadcast with slice_info.shape
br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape])
v_br_shape = br_info.br_shapes[1]
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', i_shape, i_dtype )}
{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME
{', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)}
if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] +
[f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } )
O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} );
}}
""")

View file

@ -0,0 +1,74 @@
import numpy as np
from typing import List
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import StackInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False):
"""
Stack operator.
arguments:
tensor_list List of Tensors
axis Int
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
HArgs.check_zero_get_length(tensor_list)
HArgs.check_all_tensors(tensor_list)
device = HArgs.check_get_same_device(tensor_list)
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
for i, krn in enumerate(op.forward_krns):
device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) )
return output_t
class _StackOp:
def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output):
self.stack_count = stack_count = len(shape_list)
i_shape = shape_list[0]
if not all (s == i_shape for s in shape_list):
raise ValueError('All shapes must be the same')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = StackInfo(i_shape, axis, stack_count)
self.forward_krns = forward_krns = []
for i_dtype in dtype_list:
forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)}
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) );
}}
"""))

View file

@ -0,0 +1,58 @@
import numpy as np
from typing import List
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TileInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False):
"""
Tile operator
arguments
tiles Iterable of ints
dtype
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer())
return output_t
class _TileOp:
def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.info = info = TileInfo(i_shape, tiles)
self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('I', i_shape, i_dtype)}
{HKernel.define_tensor('O', info.o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)}
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) );
}}
""")

View file

@ -0,0 +1,68 @@
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor:
"""
arguments:
axes_order Int
Iterable of ints
None
dtype cast to dtype
op_text(None) optional op with value during transpose.
'O = I'
output_t compute result to this Tensor.
Tensor may be with different shape, but should match total size
"""
op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output )
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _TransposeOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ):
self.axes_order = axes_order
self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if op_text is None:
op_text = 'O = I'
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)}
I_TYPE I = I_GLOBAL_LOAD(gid);
O_TYPE O;
{op_text};
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O );
}}""")

Binary file not shown.

View file

@ -0,0 +1,72 @@
import numpy as np
from ..AShape import AShape
from ..initializer import InitCoords2DArange
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .matmul import matmul
from .remap import remap
def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor:
"""
arguments
input_t Tensor(...,H,W)
affine_t Tensor(...,2,3)
affine matrix
example of identity affine matrix
[1,0,0],
[0,1,0]
...-head part of shapes will be broadcasted to each other
output_size(None)
tuple of 2 ints (HW)
of output size
if None , size will not be changed
"""
op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype, affine_t.shape, affine_t.dtype, output_size)
affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) )
coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init )
coords_t = coords_t.reshape(op.coords_reshape)
coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape)
output_t = remap(input_t, coords_t, dtype=dtype)
return output_t
class _WarpAffineOp():
def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if np.dtype(a_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of a_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
if a_shape.ndim < 2:
raise ValueError(f'a_shape.ndim must be >= 2 (...,2,3)')
if a_shape[-2] != 2 or a_shape[-1] != 3:
raise ValueError('Last a_shape dims must be == (...,2,3)')
IH,IW = i_shape[-2:]
if o_size is not None:
OH,OW = o_size
else:
OH,OW = IH,IW
self.coords_shape = AShape( (OH,OW,3) )
self.coords_affined_shape = AShape( (OH,OW,2) )
if a_shape.ndim > 2:
self.coords_shape = a_shape[:-2] + self.coords_shape
self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape
self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1)
self.coords_reshape = (-1,OH*OW,3)
self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)