diff --git a/xlib/avecl/__init__.py b/xlib/avecl/__init__.py
new file mode 100644
index 0000000..25455c1
--- /dev/null
+++ b/xlib/avecl/__init__.py
@@ -0,0 +1,31 @@
+"""
+AveCL ! Make OpenCL great again.
+
+Lightweight ndarray library using OpenCL 1.2, written in pure Python.
+Suitable for high-performance, general-purpose n-dim array computations on every device that supports OpenCL 1.2.
+
+Works in Python 3.5+. Dependencies: numpy.
+
+This lib uses relative imports, so you can place it in any subfolder.
+
+made by @iperov from scratch.
+"""
+
+from ._internal.AAxes import AAxes
+from ._internal.AShape import AShape
+from ._internal.backend import (Device, DeviceInfo, Kernel,
+                                get_available_devices_info, get_best_device,
+                                get_default_device, get_device,
+                                set_default_device)
+from ._internal.HArgs import HArgs
+from ._internal.HKernel import HKernel
+from ._internal.HTensor import HTensor
+from ._internal.HType import HType
+from ._internal.initializer import (InitCoords2DArange, Initializer,
+                                    InitRandomUniform)
+from ._internal.NCore import NCore
+from ._internal.NTest import NTest
+from ._internal.op import *
+from ._internal.SCacheton import SCacheton
+from ._internal.Tensor import Tensor
+from ._internal.TensorImpl import *
\ No newline at end of file
diff --git a/xlib/avecl/_internal/AAxes.py b/xlib/avecl/_internal/AAxes.py
new file mode 100644
index 0000000..6c4d673
--- /dev/null
+++ b/xlib/avecl/_internal/AAxes.py
@@ -0,0 +1,169 @@
+from collections.abc import Iterable
+
+class AAxes(Iterable):
+    __slots__ = ['axes','ndim','_inversed']
+
+    def __init__(self, axes, shape_ndim=None):
+        """
+        Constructs AAxes from a user argument.
+
+        arguments
+
+         axes               AAxes
+                            Int
+                            Iterable of ints
+                            None
+
+         shape_ndim(None)   provide shape_ndim if axes contain negative values
+
+        can raise errors during construction
+
+        AAxes supports:
+
+        A+B : concat A_axes with B_axes
+
+        A-B : removes B_axes from A_axes
+        """
+
+        if isinstance(axes, AAxes):
+            self.axes = axes.axes
+            self.ndim = axes.ndim
+            self._inversed = axes._inversed
+        elif axes is None:
+            self.axes = None
+            self.ndim = None
+            self._inversed = None
+        else:
+            if not isinstance(axes, Iterable):
+                axes = (axes,)
+
+            valid_axes = []
+            for x in axes:
+                if x is None:
+                    raise ValueError(f'Incorrect value {x} in axes {axes}')
+                x = int(x)
+                if x < 0:
+                    if shape_ndim is None:
+                        raise ValueError(f'Incorrect value {x} in axes {axes}: provide shape_ndim to use negative axes')
+                    x = shape_ndim + x
+
+                if x in valid_axes:
+                    raise ValueError('Axes must contain unique values.')
+                valid_axes.append(x)
+
+            self.axes = tuple(valid_axes)
+            self.ndim = len(self.axes)
+            self._inversed = None
+
+    def is_none_axes(self):
+        """
+        returns True if AAxes was constructed with (None) argument, i.e. all-axes
+        """
+        return self.axes is None
+
+    def sorted(self) -> 'AAxes':
+        """
+        returns sorted AAxes
+        """
+        return AAxes(sorted(self.axes))
+
+    def swapped_axes(self, axis_a, axis_b) -> 'AAxes':
+        x = list(self.axes)
+        if axis_a < 0:
+            axis_a = len(x) + axis_a
+        if axis_b < 0:
+            axis_b = len(x) + axis_b
+
+        x[axis_b], x[axis_a] = x[axis_a], x[axis_b]
+
+        return AAxes( tuple(x) )
+
+    def inversed(self) -> 'AAxes':
+        """
+        Returns inversed axes order
+
+        Example:
+
+         for (0,2,3,1) returns (0,3,1,2)
+        """
+        if self.is_none_axes():
+            raise Exception('none-axes does not support inversed(). Handle none-axes by calling .is_none_axes()')
+
+        if self._inversed is None:
+            x = { axis:i for i,axis in enumerate(self.axes) }
+            t = []
+            for i in range(self.ndim):
+                axis = x.get(i, None)
+                if axis is None:
+                    raise Exception(f'axes {self.axes} are inconsistent to do inverse order.')
+                t.append(axis)
+            self._inversed = AAxes(t)
+
+        return self._inversed
+
+
+    def __hash__(self): return self.axes.__hash__()
+    def __eq__(self, other):
+        if isinstance(other, AAxes):
+            return self.axes == other.axes
+        elif isinstance(other, Iterable):
+            return self.axes == tuple(other)
+        return False
+    def __iter__(self):
+        if self.is_none_axes():
+            raise Exception('none-axes does not support iteration. Handle none-axes by calling .is_none_axes()')
+        return self.axes.__iter__()
+
+    def __len__(self): return self.ndim
+    def __getitem__(self,key):
+        if self.is_none_axes():
+            raise Exception('none-axes does not support indexing. Handle none-axes by calling .is_none_axes()')
+
+        elif isinstance(key, slice):
+            return AAxes(self.axes[key])
+
+        return self.axes[key]
+
+    def __radd__(self, o):
+        if isinstance(o, Iterable):
+            return AAxes( tuple(o) + self.axes )
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes append')
+    def __add__(self, o):
+        if isinstance(o, Iterable):
+            return AAxes( self.axes + tuple(o) )
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes append')
+
+    def __rsub__(self, o):
+        if isinstance(o, Iterable):
+            new_axes = []
+            for axis in o:
+                if axis not in self.axes:
+                    new_axes.append(axis)
+
+            return AAxes(new_axes)
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes subtraction')
+
+    def __sub__(self, o):
+        if isinstance(o, Iterable):
+            new_axes = []
+            o_axes = tuple(o)
+            for axis in self.axes:
+                if axis not in o_axes:
+                    new_axes.append(axis)
+
+            return AAxes(new_axes)
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes subtraction')
+
+    def __str__(self):
+        if self.is_none_axes():
+            return '(None)'
+        return str(self.axes)
+
+    def __repr__(self): return 'AAxes' + self.__str__()
+
+__all__ = ['AAxes']
\ No newline at end of file
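Note (not part of the patch): a minimal sketch of the AAxes axis arithmetic described in the constructor docstring above, assuming the package is importable as `xlib.avecl`:

    from xlib.avecl import AAxes

    a = AAxes((0, 2, 3, 1))
    print(a.sorted())               # (0, 1, 2, 3)
    print(a.inversed())             # (0, 3, 1, 2) -- the docstring example
    print(a + (4,))                 # concat:  (0, 2, 3, 1, 4)
    print(a - (2, 3))               # removal: (0, 1)
    print(AAxes(-1, shape_ndim=4))  # negative axis resolved to (3,)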
diff --git a/xlib/avecl/_internal/AShape.py b/xlib/avecl/_internal/AShape.py
new file mode 100644
index 0000000..543b808
--- /dev/null
+++ b/xlib/avecl/_internal/AShape.py
@@ -0,0 +1,103 @@
+from collections.abc import Iterable
+
+from .AAxes import AAxes
+
+class AShape(Iterable):
+    __slots__ = ['shape','size','ndim']
+
+    def __init__(self, shape):
+        """
+        Constructs a valid shape from a user argument.
+
+        arguments
+
+         shape      AShape
+                    Iterable
+
+        can raise ValueError during construction
+        """
+
+        if isinstance(shape, AShape):
+            self.shape = shape.shape
+            self.size = shape.size
+            self.ndim = shape.ndim
+        else:
+            if isinstance(shape, (int,float) ):
+                shape = (int(shape),)
+
+            if isinstance(shape, Iterable):
+                size = 1
+                valid_shape = []
+                for x in shape:
+                    if x is None:
+                        raise ValueError(f'Incorrect value {x} in shape {shape}')
+                    x = int(x)
+                    if x < 1:
+                        raise ValueError(f'Incorrect value {x} in shape {shape}')
+                    valid_shape.append(x)
+                    size *= x # Faster than np.prod()
+
+                self.shape = tuple(valid_shape)
+                self.ndim = len(self.shape)
+                if self.ndim == 0:
+                    # Force (1,) shape for scalar shape
+                    self.ndim = 1
+                    self.shape = (1,)
+                self.size = size
+            else:
+                raise ValueError('Invalid type to create AShape')
+
+    def axes_arange(self) -> AAxes:
+        """
+        Returns an arange of this shape's axes.
+ + Example (0,1,2) for ndim 3 + """ + return AAxes(range(self.ndim)) + + def transpose_by_axes(self, axes) -> 'AShape': + """ + Same as AShape[axes] + + Returns AShape transposed by axes. + + axes AAxes + Iterable(list,tuple,set,generator) + """ + return AShape(self.shape[axis] for axis in AAxes(axes) ) + + def __hash__(self): return self.shape.__hash__() + def __eq__(self, other): + if isinstance(other, AShape): + return self.shape == other.shape + elif isinstance(other, Iterable): + return self.shape == tuple(other) + return False + def __iter__(self): return self.shape.__iter__() + def __len__(self): return len(self.shape) + def __getitem__(self,key): + if isinstance(key, Iterable): + if isinstance(key, AAxes): + if key.is_none_axes(): + return self + return self.transpose_by_axes(key) + elif isinstance(key, slice): + return AShape(self.shape[key]) + + return self.shape[key] + + def __radd__(self, o): + if isinstance(o, Iterable): + return AShape( tuple(o) + self.shape) + else: + raise ValueError(f'unable to use type {o.__class__} in AShape append') + def __add__(self, o): + if isinstance(o, Iterable): + return AShape( self.shape + tuple(o) ) + else: + raise ValueError(f'unable to use type {o.__class__} in AShape append') + + + def __str__(self): return str(self.shape) + def __repr__(self): return 'AShape' + self.__str__() + +__all__ = ['AShape'] \ No newline at end of file diff --git a/xlib/avecl/_internal/HArgs.py b/xlib/avecl/_internal/HArgs.py new file mode 100644 index 0000000..455ba00 --- /dev/null +++ b/xlib/avecl/_internal/HArgs.py @@ -0,0 +1,106 @@ +from typing import List + +import numpy as np + +from .backend import Device +from .HTensor import HTensor +from .HType import HType +from .Tensor import Tensor +from .AShape import AShape + +class HArgs: + """ + Helper functions for list of arguments + """ + + @staticmethod + def decompose(args): + """ + decompose list of args of Tensor and supported numeric values + + returns ( shape_list, # if scalar value -> shape is None + dtype_list, # + kernel_args_list # + ) + """ + shape_list = [] + dtype_list = [] + kernel_args_list = [] + for arg in args: + + if isinstance(arg, Tensor): + shape_list.append(arg.shape) + dtype_list.append(arg.dtype) + kernel_args_list.append(arg.get_buffer()) + else: + + if isinstance(arg, int): + dtype, arg = np.int32, np.int32(arg) + elif isinstance(arg, float): + dtype, arg = np.float32, np.float32(arg) + elif HType.is_obj_of_np_scalar_type(arg): + dtype = arg.__class__ + else: + raise ValueError(f'Unsupported type of arg: {arg.__class__} Use Tensor or number type.') + + shape_list.append(None) + dtype_list.append(dtype) + kernel_args_list.append(arg) + + return tuple(shape_list), tuple(dtype_list), tuple(kernel_args_list) + + @staticmethod + def get_shapes(args : List[Tensor]) -> List[AShape]: + """ + """ + return tuple(t.shape for t in args) + + @staticmethod + def check_zero_get_length(args) -> int: + """ + raises an error if len(args) == 0, otherwise returns len + """ + args_len = len(args) + if len(args) == 0: + raise ValueError('args must be specified') + return args_len + + @staticmethod + def check_get_same_device(args : List[Tensor]) -> Device: + """ + check all device of tensors are the same and return the device + """ + result = HTensor.all_same_device(args) + if not result: + raise ValueError('all Tensors must have the same device') + return args[0].get_device() + + @staticmethod + def check_all_tensors(args : List[Tensor]): + """ + """ + if not all (isinstance(tensor, Tensor) for tensor in 
args): + raise ValueError('All values must have type of Tensor') + + @staticmethod + def check_get_same_shape(args : List[Tensor]) -> AShape: + """ + check all shapes of tensors are the same and return the shape + """ + shape = args[0].shape + if not all (t.shape == shape for t in args): + raise ValueError('All tensors must have the same shape') + return shape + + + @staticmethod + def filter_tensor(args, raise_on_empty : bool): + """ + get only tensors from the list + """ + tensor_args = [arg for arg in args if isinstance(arg, Tensor) ] + if raise_on_empty and len(tensor_args) == 0: + raise ValueError('At least one arg must be a Tensor') + return tensor_args + +__all__ = ['HArgs'] diff --git a/xlib/avecl/_internal/HKernel.py b/xlib/avecl/_internal/HKernel.py new file mode 100644 index 0000000..c6bbe76 --- /dev/null +++ b/xlib/avecl/_internal/HKernel.py @@ -0,0 +1,496 @@ +import numpy as np + +class HKernel: + """ + Helper functions for Kernels + """ + + _np_dtype_to_cl = { np.bool_ : 'bool', + np.int8 : 'char', + np.uint8 : 'uchar', + np.int16 : 'short', + np.uint16 : 'ushort', + np.int32 : 'int', + np.uint32 : 'uint', + np.int64 : 'long', + np.uint64 : 'ulong', + np.float16 : 'half', + np.float32 : 'float', + np.float64 : 'double' + } + + + + @staticmethod + def np_dtype_to_cl(dtype : np.dtype): + """ + returns string opencl type from numpy dtype + + example np.float32 -> 'float' + np.uint8 -> 'unsigned char' + """ + return HKernel._np_dtype_to_cl[np.dtype(dtype).type] + + @staticmethod + def define_scalar_func_arg(name, dtype : np.dtype): + """ + """ + return f'{HKernel._np_dtype_to_cl[np.dtype(dtype).type]} {name}' + + @staticmethod + def define_tensor_type(name, dtype : np.dtype): + """ + Returns a definitions for operations with tensor + + example for 'O', np.float16: + + #define O_PTR_NAME p_O + #define O_PTR_TYPE half + #define O_PTR_TYPE2 half2 + #define O_PTR_TYPE3 half3 + #define O_PTR_TYPE4 half4 + #define O_PTR_TYPE8 half8 + #define O_PTR_TYPE16 half16 + #define O_TYPE float + #define O_TYPE2 float2 + #define O_TYPE3 float3 + #define O_TYPE4 float4 + #define O_TYPE8 float8 + #define O_TYPE16 float16 + #define O_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_TO_FLOATX(x) ((float)x) + """ + name_upper = name.upper() + + dtype = np.dtype(dtype).type + + out = [f'#define {name.upper()}_PTR_NAME 
p_{name.upper()}'] + + if dtype == np.float16: + out += [f'#define {name_upper}_PTR_TYPE half'] + out += [f'#define {name_upper}_PTR_TYPE2 half2'] + out += [f'#define {name_upper}_PTR_TYPE3 half3'] + out += [f'#define {name_upper}_PTR_TYPE4 half4'] + out += [f'#define {name_upper}_PTR_TYPE8 half8'] + out += [f'#define {name_upper}_PTR_TYPE16 half16'] + out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(np.float32)}'] + out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(np.float32)}2'] + out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(np.float32)}3'] + out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(np.float32)}4'] + out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(np.float32)}8'] + out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(np.float32)}16'] + + + out += [f'#define {name_upper}_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + + out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )'] + + else: + out += [f'#define {name_upper}_PTR_TYPE {HKernel.np_dtype_to_cl(dtype)}'] + out += [f'#define {name_upper}_PTR_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2'] + out += [f'#define {name_upper}_PTR_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3'] + out += [f'#define {name_upper}_PTR_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4'] + out += [f'#define {name_upper}_PTR_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8'] + out += [f'#define {name_upper}_PTR_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16'] + out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(dtype)}'] + out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2'] + out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3'] + out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4'] + out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8'] + out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16'] + + out += [f'#define {name_upper}_GLOBAL_LOAD(offset) {name_upper}_PTR_NAME[(offset)]'] + out += [f'#define {name_upper}_GLOBAL_LOAD2(offset) 
{name_upper}_PTR_NAME[(offset)]'] + out += [f'#define {name_upper}_GLOBAL_LOAD3(offset) {name_upper}_PTR_NAME[(offset)]'] + out += [f'#define {name_upper}_GLOBAL_LOAD4(offset) {name_upper}_PTR_NAME[(offset)]'] + out += [f'#define {name_upper}_GLOBAL_LOAD8(offset) {name_upper}_PTR_NAME[(offset)]'] + out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) {name_upper}_PTR_NAME[(offset)]'] + + out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)'] + out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)'] + out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)'] + out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)'] + out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)'] + out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)'] + + if dtype in [np.float32, np.float64]: + out += [f'#define {name_upper}_TO_FLOATX(x) x'] + elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]: + out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)'] + elif dtype in [np.int64,np.uint64]: + out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)'] + return '\n'.join(out) + + @staticmethod + def define_tensor_shape(name, shape, axes_symbols=None): + """ + Returns a definitions for operations with tensor + + example for 'O', (7,3), + + #define O0 7 + #define O1 3 + #define Om1 3 + #define Om2 7 + #define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 ) + #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 ) + """ + shape = tuple(shape) + ndim = len(shape) + name_upper = name.upper() + name_lower = name.lower() + + if axes_symbols is None: + axes_symbols = "".join([str(i) for i in range(ndim)]) + axes_symbols = axes_symbols.upper() + + out = [] + for i in range(ndim): + out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}'] + + for i in range(1,ndim+1): + out += [f'#define {name_upper}m{i} {shape[-i]}'] + + line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) ' + + for i in range(ndim): + if i == 0: + line += f'( (size_t)({name_lower}{i}) )' + else: + line += f'( {name_lower}{i} )' + + for j in range(i+1,ndim): + line += f'*{shape[j]} ' + if i != ndim-1: + line += '+' + + out += [line] + + line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) ' + + for i in range(ndim): + if i == 0: + line += f'( (size_t)({name_lower}{i}) % {shape[i]} )' + else: + line += f'( ({name_lower}{i}) % {shape[i]} )' + + for j in range(i+1,ndim): + line += f'*{shape[j]} ' + if i != ndim-1: + line += '+' + + out += [line,''] + + return '\n'.join(out) + + @staticmethod + def define_tensor(name, shape, dtype : np.dtype, axes_symbols=None): + """ + Returns a definitions for operations with tensor + + arguments + + name text + + shape Iterable + + dtype np.dtype + + axes_symbols(None) string of symbols. 
+ None -> numeric symbols will be used + + example for 'O', (2,4), np.float16 + + #define O_PTR_NAME p_O + #define O_PTR_TYPE half + #define O_PTR_TYPE2 half2 + #define O_PTR_TYPE3 half3 + #define O_PTR_TYPE4 half4 + #define O_PTR_TYPE8 half8 + #define O_PTR_TYPE16 half16 + #define O_TYPE float + #define O_TYPE2 float2 + #define O_TYPE3 float3 + #define O_TYPE4 float4 + #define O_TYPE8 float8 + #define O_TYPE16 float16 + #define O_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) ) + #define O_TO_FLOATX(x) ((float)x) + #define O0 2 + #define O1 4 + #define Om1 4 + #define Om2 2 + #define O_IDX(o0,o1) ( (size_t)(o0) )*4 +( o1 ) + #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 2 )*4 +( (o1) % 4 ) + """ + return'\n'.join([ HKernel.define_tensor_type(name, dtype), + HKernel.define_tensor_shape(name, shape, axes_symbols) + ]) + + @staticmethod + def define_axes_sizes(axis_letter, axes_sizes): + """ + Returns definitions of axes sizes + + example for 'O', (4,512,512) + #define O0 4 + #define O1 512 + #define O2 512 + """ + out = "" + axes_sizes = tuple(axes_sizes) + ndim = len(axes_sizes) + for i in range(ndim): + out += f'#define {axis_letter.upper()}{i} {axes_sizes[i]}\n' + + return out + + @staticmethod + def decompose_idx_to_axes_idxs(var_name, tensor_name, ndim): + """ + decompose a size_t variable to axes indexes. + Keeps original variable untouched. 
+
+        Example for 'gid','O',2
+
+         size_t gid_original = gid;
+         size_t o1 = gid % O1; gid /= O1;
+         #define om1 o1
+         size_t o0 = gid % O0;
+         #define om2 o0
+         gid = gid_original;
+        """
+        name_lower = tensor_name.lower()
+        name_upper = tensor_name.upper()
+
+        out = [f'size_t {var_name}_original = {var_name};']
+
+        for i in range(ndim-1,-1,-1):
+            line = f'size_t {name_lower}{i} = {var_name} % {name_upper}{i};'
+            if i > 0:
+                line += f' {var_name} /= {name_upper}{i};'
+            out += [line]
+            out += [f'#define {name_lower}m{ndim-i} {name_lower}{i}']
+
+        out += [f'{var_name} = {var_name}_original;']
+        return '\n'.join(out)
+
+    @staticmethod
+    def axes_order_enum(tensor_name, axes_order):
+        """
+        returns axis enumeration in the given order
+
+        Example
+         ('I', (1,2,0)) returns 'i1,i2,i0'
+         ('I', 'HW') returns 'ih,iw'
+        """
+        if isinstance(axes_order, str):
+            axes_order = axes_order.lower()
+        else:
+            axes_order = tuple(axes_order)
+
+        name_lower = tensor_name.lower()
+
+        return ','.join( [ f'{name_lower}{axes_order[axis]}' for axis in range(len(axes_order)) ] )
+
+    @staticmethod
+    def axes_seq_enum(tensor_name, ndim, new_axis=None, zero_axes=None, suffix=None):
+        """
+        returns sequential axis enumeration with the given ndim
+
+        Example
+
+         ('I', 4) returns 'i0,i1,i2,i3'
+
+         ('I', 4, new_axis=('name',1) ) returns 'i0,name,i1,i2,i3'
+
+         ('I', 3, zero_axes=(1,) ) returns 'i0,0,i2'
+
+         ('I', 2, suffix='ih,iw' ) returns 'i0,i1,ih,iw'
+        """
+        name_lower = tensor_name.lower()
+        if zero_axes is not None:
+            axes = [ '0' if axis in zero_axes else f'{name_lower}{axis}' for axis in range(ndim) ]
+        else:
+            axes = [ f'{name_lower}{axis}' for axis in range(ndim) ]
+
+        if suffix is None:
+            suffix = []
+        else:
+            suffix = [suffix]
+
+        if new_axis is not None:
+            name, axis = new_axis
+            return ','.join(axes[:axis] + [name] + axes[axis:] + suffix)
+        else:
+            return ','.join(axes + suffix)
+
+    @staticmethod
+    def include_constants_pi():
+        """
+        defines PI constants
+
+        PI_F
+        PI_2_F
+        PI_4_F
+        """
+        return f"""
+#define PI_F   3.14159274101257f
+#define PI_2_F 1.57079637050629f
+#define PI_4_F 0.78539818525314f
+"""
+
+    @staticmethod
+    def include_hash():
+        """
+        returns hash functions:
+
+        uint   hash_uint_from_uint(uint x)
+        ulong  hash_ulong_from_ulong(ulong x)
+        float  hash_float_from_uint(uint x)
+        double hash_double_from_ulong(ulong x)
+        """
+
+        return f"""
+
+#define UIF (1.0 / (float)(0xffffffffU))
+
+//from Chris Wellons https://nullprogram.com/blog/2018/07/31/ https://www.shadertoy.com/view/WttXWX
+uint hash_uint_from_uint(uint x)
+{{
+    x ^= x >> 17;
+    x *= 0xed5ad4bbU;
+    x ^= x >> 11;
+    x *= 0xac4c1b51U;
+    x ^= x >> 15;
+    x *= 0x31848babU;
+    x ^= x >> 14;
+    return x;
+}}
+
+ulong hash_ulong_from_ulong(ulong x)
+{{
+    x ^= x >> 32;
+    x *= 0xd6e8feb86659fd93U;
+    x ^= x >> 32;
+    x *= 0xd6e8feb86659fd93U;
+    x ^= x >> 32;
+    return x;
+}}
+
+float hash_float_from_uint(uint x)
+{{
+    return hash_uint_from_uint(x) / (float)(0xffffffffU);
+}}
+
+double hash_double_from_ulong(ulong x)
+{{
+    return (double)hash_ulong_from_ulong(x) / (double)(0xffffffffffffffffU);
+}}
+
+/*****************************
+UNUSED CODE
+
+//---------- PCG hashes from https://www.shadertoy.com/view/XlGcRh
+uint hash_uint_uint(uint v)
+{{
+    uint state = v * 747796405u + 2891336453u;
+    uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
+    return (word >> 22u) ^ word;
+}}
+
+uint2 hash_uint2_uint2 (uint2 v)
+{{
+    v = v * 1664525u + 1013904223u;
+    v.x += v.y * 1664525u;
+    v.y += v.x * 1664525u;
+    v ^= v>>16u;
+    v.x += v.y * 1664525u;
+    v.y += v.x * 1664525u;
+    v ^= v>>16u;
+    return v;
+}}
+
+uint3 hash_uint3_uint3(uint3 v)
+{{
+    v = v * 1664525u + 1013904223u;
+    v.x += v.y*v.z;
+    v.y += v.z*v.x;
+    v.z += v.x*v.y;
+    v ^= v >> 16u;
+    v.x += v.y*v.z;
+    v.y += v.z*v.x;
+    v.z += v.x*v.y;
+    return v;
+}}
+
+float hash_float_uint(uint v)
+{{
+    return (float)( hash_uint_uint(v) ) * UIF;
+}}
+
+float2 hash_float2_uint (uint v)
+{{
+    uint2 q = hash_uint2_uint2( (uint2)(v, 1) );
+    return (float2)(q.x, q.y) * UIF;
+}}
+
+float3 hash_float3_uint (uint v)
+{{
+    uint3 q = hash_uint3_uint3( (uint3)(v, 1, 1) );
+    return (float3)(q.x, q.y, q.z) * UIF;
+}}
+
+//---------- Classic hashes used in shaders
+
+float hash_float_float(float p)
+{{
+    float x = sin(p*12.9898)*43758.5453;
+    return x - floor(x);
+}}
+
+float hash_float_float2(float2 p)
+{{
+    float x = sin( dot(p, (float2)(12.9898, 78.233)) )*43758.5453;
+    return x - floor(x);
+}}
+
+****************************/
+
+
+"""
+
+__all__ = ['HKernel']
\ No newline at end of file
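Note (not part of the patch): a small sketch of the kernel-source helpers, assuming `xlib.avecl` is importable; it prints the '#define' block shown in the define_tensor docstring above:

    import numpy as np
    from xlib.avecl import HKernel

    print( HKernel.define_tensor('O', (2, 4), np.float16) )
    print( HKernel.axes_seq_enum('I', 4, new_axis=('name', 1)) )  # 'i0,name,i1,i2,i3'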
diff --git a/xlib/avecl/_internal/HTensor.py b/xlib/avecl/_internal/HTensor.py
new file mode 100644
index 0000000..4126c0c
--- /dev/null
+++ b/xlib/avecl/_internal/HTensor.py
@@ -0,0 +1,22 @@
+from typing import List, Union
+
+from .Tensor import *
+
+
+class HTensor:
+    """
+    Helper functions for Tensor
+    """
+
+    @staticmethod
+    def all_same_device( tensor_or_list : Union[Tensor, List[Tensor] ] ) -> bool:
+        """
+        check if all tensors in a list use the same device
+        """
+        if not isinstance(tensor_or_list, (list,tuple)):
+            tensor_or_list = (tensor_or_list,)
+
+        device = tensor_or_list[0].get_device()
+        return all( device == tensor.get_device() for tensor in tensor_or_list )
+
+__all__ = ['HTensor']
diff --git a/xlib/avecl/_internal/HType.py b/xlib/avecl/_internal/HType.py
new file mode 100644
index 0000000..355a706
--- /dev/null
+++ b/xlib/avecl/_internal/HType.py
@@ -0,0 +1,82 @@
+import operator
+from typing import Iterable, List
+
+import numpy as np
+
+scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
+                np.float16, np.float32, np.float64, np.bool_]
+
+np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
+                   np.float16, np.float32, np.float64, np.bool_]
+
+_np_dtype_to_cl = {
+    np.bool_   : 'bool',
+    np.uint8   : 'uchar',
+    np.int8    : 'char',
+    np.uint16  : 'ushort',
+    np.int16   : 'short',
+    np.uint32  : 'uint',
+    np.int32   : 'int',
+    np.uint64  : 'ulong',
+    np.int64   : 'long',
+    np.float16 : 'half',
+    np.float32 : 'float',
+    np.float64 : 'double',
+}
+
+_np_dtype_weight = {
+    np.bool_   : 1,
+    np.uint8   : 2,
+    np.int8    : 3,
+    np.uint16  : 4,
+    np.int16   : 5,
+    np.uint32  : 6,
+    np.int32   : 7,
+    np.uint64  : 8,
+    np.int64   : 9,
+    np.float16 : 10,
+    np.float32 : 11,
+    np.float64 : 12,
+}
+
+class HType:
+    """
+    Helper functions for types.
+    """
+
+    def is_scalar_type(value):
+        return value.__class__ in scalar_types
+
+    def get_np_scalar_types() -> List:
+        return np_scalar_types
+
+    def is_obj_of_np_scalar_type(obj) -> bool:
+        return obj.__class__ in np_scalar_types
+
+    def np_dtype_to_cl(dtype : np.dtype) -> str:
+        return _np_dtype_to_cl[ np.dtype(dtype).type ]
+
+    def get_most_weighted_dtype( dtype_list ):
+        dtype_list = [ np.dtype(dtype) for dtype in dtype_list ]
+        dtype_list = [ (_np_dtype_weight[dtype.type], dtype) for dtype in dtype_list ]
+        dtype_list = sorted(dtype_list, key=operator.itemgetter(0), reverse=True)
+        return dtype_list[0][1]
+
+    def hashable_slices(slices):
+        """
+        Convert a list of slices to a hashable arg.
+ """ + if not isinstance(slices, Iterable): + slices = (slices,) + normalized_slices = [] + for x in slices: + if isinstance(x, slice): + normalized_slices.append( (x.start, x.stop, x.step) ) + elif x is Ellipsis or x is None: + normalized_slices.append(x) + else: + normalized_slices.append(int(x)) + return tuple(normalized_slices) + + +__all__ = ['HType'] \ No newline at end of file diff --git a/xlib/avecl/_internal/NCore.py b/xlib/avecl/_internal/NCore.py new file mode 100644 index 0000000..cd2597f --- /dev/null +++ b/xlib/avecl/_internal/NCore.py @@ -0,0 +1,24 @@ +from .SCacheton import SCacheton +from .Tensor import Tensor +from .backend import * + +class NCore: + """ + core functions + """ + + @staticmethod + def cleanup(): + """ + try to cleanup all resources consumed by TensorCL. + + can raise Exception + """ + SCacheton.cleanup() + + if Tensor._object_count != 0: + raise Exception(f'Unable to cleanup while {Tensor._object_count} Tensor objects exist.') + + cleanup_devices() + +__all__ = ['NCore'] \ No newline at end of file diff --git a/xlib/avecl/_internal/NTest.py b/xlib/avecl/_internal/NTest.py new file mode 100644 index 0000000..ae5427c --- /dev/null +++ b/xlib/avecl/_internal/NTest.py @@ -0,0 +1,754 @@ +import traceback + +import numpy as np + +from .HType import HType +from .NCore import NCore +from .backend import get_device, get_default_device, set_default_device +from .Tensor import Tensor +from . import op +from .initializer import InitRandomUniform, InitCoords2DArange +from .info import Conv2DInfo + +class NTest(): + + def test_all(): + NCore.cleanup() + + prev_device = get_default_device() + + device = get_device(0) + print(f'Using {device.get_description()}') + + set_default_device(device) + + test_funcs = [ + InitRandomUniform_test, + InitCoords2DArange_test, + cast_test, + transpose_test, + pad_test, + concat_test, + tile_test, + stack_test, + slice_test, + slice_set_test, + reduce_test, + matmul_test, + any_wise_op_test, + depthwise_conv2d_test, + remap_np_affine_test, + remap_test, + warp_affine_test, + gaussian_blur_test, + binary_erode_circle_test, + binary_dilate_circle_test, + binary_morph_test, + ] + + for test_func in test_funcs: + print(f'{test_func.__name__}()') + test_func() + device.cleanup() + device.print_stat() + + NCore.cleanup() + + set_default_device(prev_device) + + print('Done.') + +def _all_close(x,y, atol=1, btol=1): + return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol ) + +def cast_test(): + for in_dtype in HType.get_np_scalar_types(): + for out_dtype in HType.get_np_scalar_types(): + shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) ) + + print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='') + + val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype) + cast_n = val_n.astype(out_dtype) + val_t = Tensor.from_value(val_n) + cast_t = op.cast(val_t, out_dtype) + + if not _all_close(cast_t.np(), cast_n): + raise Exception(f'data is not equal') + + print('pass') + +def binary_morph_test(): + for shape_len in range(2,4): + for dtype in HType.get_np_scalar_types(): + shape = np.random.randint( 1, 64, size=(shape_len,) ) + erode_dilate = np.random.randint( -16, 16 ) + blur = np.random.rand()*16 - 8 + + input_n = np.random.randint( 2, size=shape ).astype(dtype) + input_t = Tensor.from_value(input_n) + + print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... 
', end='') + + op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True) + + print('pass') + +def binary_erode_circle_test(): + for shape_len in range(2,4): + for dtype in HType.get_np_scalar_types(): + + shape = np.random.randint( 1, 64, size=(shape_len,) ) + radius = np.random.randint( 1, 16 ) + iterations = np.random.randint( 1, 4 ) + + input_n = np.random.randint( 2, size=shape ).astype(dtype) + input_t = Tensor.from_value(input_n) + + print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='') + + op.binary_erode_circle(input_t, radius=radius, iterations=iterations) + + print('pass') + +def binary_dilate_circle_test(): + for shape_len in range(2,4): + for dtype in HType.get_np_scalar_types(): + + shape = np.random.randint( 1, 64, size=(shape_len,) ) + radius = np.random.randint( 1, 16 ) + iterations = np.random.randint( 1, 4 ) + + input_n = np.random.randint( 2, size=shape ).astype(dtype) + input_t = Tensor.from_value(input_n) + + print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='') + + op.binary_dilate_circle(input_t, radius=radius, iterations=iterations) + + print('pass') + + +def gaussian_blur_test(): + for shape_len in range(2,5): + for dtype in [np.float16, np.float32, np.float64]: + + shape = np.random.randint( 1, 64, size=(shape_len,) ) + sigma = np.random.rand() * 10 + print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='') + + val_n = np.random.randint( 2**8, size=shape ).astype(dtype) + val_t = Tensor.from_value(val_n) + + op.gaussian_blur(val_t, sigma) + + print('pass') + +def pad_test(): + for iteration in range(1): + for shape_len in range(5,1,-1): + for mode in ['constant']: + for dtype in HType.get_np_scalar_types(): + while True: + shape = np.random.randint( 1, 8, size=(shape_len,) ) + + paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) ) + + print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='') + + val_n = np.random.randint( 2**8, size=shape ).astype(dtype) + pad_n = np.pad(val_n, paddings, mode=mode) + + val_t = Tensor.from_value(val_n) + pad_t = op.pad(val_t, paddings, mode=mode) + + print(f'{pad_n.shape} == {pad_t.shape} ... ', end='') + + if pad_n.shape != pad_t.shape: + raise Exception(f'shape is not equal') + + if not _all_close(pad_t.np(), pad_n): + raise Exception(f'data is not equal') + + print('pass') + break + +def slice_set_test(): + for iteration in [0,1]: + for shape_len in range(5,1,-1): + for dtype in HType.get_np_scalar_types(): + while True: + shape = np.random.randint( 1, 8, size=(shape_len,) ) + + if iteration == 0: + slices = [ slice(None,None,None), ] * shape_len + axis = np.random.randint(shape_len) + shape[axis] = 1 + slices[axis] = 0 + else: + slices = [] + for i in range (shape_len): + axis_size = shape[i] + + b = np.random.randint(axis_size) + e = np.random.randint(axis_size) + if b == e: + slices.append(b) + else: + if b < e: + s = 1 + if b == 0: + b = None + if e == axis_size-1: + e = None + else: + s = -1 + if b == axis_size-1: + b = None + if e == 0: + e = None + slices.append ( slice(b,e,s) ) + + if np.random.randint(2) == 0: + axis = np.random.randint(shape_len) + slices[axis] = Ellipsis + + shape = tuple(shape) + slices = tuple(slices) + + print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... 
', end='') + + val_n = np.random.randint( 2**8, size=shape ).astype(dtype) + val_t = Tensor.from_value(val_n) + + sliced_n = val_n[slices] + + v = [0] if sliced_n.ndim > 0 else 0 + + val_n[slices] = v + val_t[slices] = v + + if not _all_close(val_t.np(), val_n): + raise Exception(f'data is not equal') + + print('pass') + break + +def depthwise_conv2d_test(): + def _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=1, DILATION=1, padding='same', dtype=np.float32): + N, IC, IH, IW = input_n.shape + KI, KH, KW = kernel_n.shape + + ci = Conv2DInfo(IH, IW, KH, KW, STRIDE, DILATION, padding) + + PADL, PADT = ci.PADL, ci.PADT + + OC, OH, OW = IC, ci.OH, ci.OW + + O_IK_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(OH*OW) } + K_IO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(KH*KW) } + I_KO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(IH*IW) } + + for ow in range(OW): + for oh in range(OH): + O_idx = oh*OW + ow + for kh in range(KH): + for kw in range(KW): + iw = -PADL + kw*DILATION + ow*STRIDE + ih = -PADT + kh*DILATION + oh*STRIDE + if (iw >= 0) & (ih >= 0) & (iw < IW) & (ih < IH): + + O_IK_idxs[O_idx][0][0].append (ih) + O_IK_idxs[O_idx][0][1].append (iw) + O_IK_idxs[O_idx][1][0].append (kh) + O_IK_idxs[O_idx][1][1].append (kw) + + K_idx = kh*KW + kw + K_IO_idxs[K_idx][0][0].append (ih) + K_IO_idxs[K_idx][0][1].append (iw) + K_IO_idxs[K_idx][1][0].append (oh) + K_IO_idxs[K_idx][1][1].append (ow) + + I_idx = ih*IW + iw + I_KO_idxs[I_idx][0][0].append (kh) + I_KO_idxs[I_idx][0][1].append (kw) + I_KO_idxs[I_idx][1][0].append (oh) + I_KO_idxs[I_idx][1][1].append (ow) + + output_shape = (N, OC, OH, OW) + output = np.empty( output_shape, dtype) + + for n in range(N): + for oc in range(OC): + for oh in range(OH): + for ow in range(OW): + O_idx = oh*OW + ow + I_idxs = O_IK_idxs[O_idx][0] + K_idxs = O_IK_idxs[O_idx][1] + + v = ( input_n[ n,oc][..., I_idxs[0], I_idxs[1]] * + kernel_n [oc][..., K_idxs[0], K_idxs[1]] ).sum() + + output[n,oc,oh,ow] = v + return output + + for padding in ['same','valid',2]: + for dilation in [1,2]: + for stride in [1,2]: + for ks in [1,3]: + for n in [1,4]: + for ic in [1,4]: + for ih,iw in zip(*[[4,16]]*2): + if padding == 'valid' and iw < ks: + continue + for dtype in [np.int16, np.float16, np.float32]: + input_shape = (n, ic, ih, iw) + kernel_shape = (ic, ks, ks) + + print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='') + + input_n = np.random.randint( 64, size=input_shape ).astype(dtype) + kernel_n = np.ones(shape=kernel_shape ).astype(dtype) + + input_t = Tensor.from_value(input_n) + kernel_t = Tensor.from_value(kernel_n) + + conved_t = op.depthwise_conv2D(input_t, kernel_t, stride=stride, dilation=dilation, padding=padding) + conved_n = _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=stride, DILATION=dilation, padding=padding, dtype=dtype) + + if conved_n.shape != conved_t.shape: + raise Exception(f'shape is not equal') + + if not all ( np.ndarray.flatten( conved_t.np() == conved_n) ): + raise Exception(f'data is not equal') + + print('pass') + + + +def warp_affine_test(): + for dtype in HType.get_np_scalar_types(): + if dtype == np.bool_: + continue + H = np.random.randint(8, 64) + W = np.random.randint(8, 64) + + print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... 
', end='') + + input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) ) + + affine_t = Tensor.from_value ( [[1,0,0], + [0,1,0]], dtype) + + result_t = op.warp_affine(input_t, affine_t) + + if not _all_close(input_t.np(), result_t.np() ): + raise Exception(f'data is not equal') + + print('pass') + + +def remap_np_affine_test(): + for dtype in HType.get_np_scalar_types(): + if dtype == np.bool_: + continue + H = np.random.randint(8, 64) + W = np.random.randint(8, 64) + + print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='') + + input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) ) + + affine_n = np.array ( [[1,0,0], + [0,1,0]], dtype) + + result_t = op.remap_np_affine(input_t, affine_n) + + if not _all_close(input_t.np(), result_t.np() ): + raise Exception(f'data is not equal') + + print('pass') + + +def remap_test(): + for dtype in HType.get_np_scalar_types(): + if dtype == np.bool_: + continue + H = np.random.randint(8, 64) + W = np.random.randint(8, 64) + + print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='') + + input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) ) + + coords_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ) + + result_t = op.remap(input_t, coords_t) + + if not _all_close(input_t.np(), result_t.np() ): + raise Exception(f'data is not equal') + + print('pass') + +def tile_test(): + for _ in range(3): + for shape_len in range(3, 5): + for dtype in HType.get_np_scalar_types(): + shape = tuple(np.random.randint( 8, size=(shape_len,) )+1) + tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1) + + print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='') + + val_n = np.random.randint( 2**8, size=shape ).astype(dtype) + tiled_n = np.tile(val_n, tiles) + + val_t = Tensor.from_value(val_n) + tiled_t = op.tile(val_t, tiles) + + print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='') + + if tiled_n.shape != tiled_t.shape: + raise Exception(f'shape is not equal') + + if not _all_close(tiled_t.np(), tiled_n): + raise Exception(f'data is not equal') + + print('pass') + +def stack_test(): + for _ in range(3): + for shape_len in range(1, 4): + for dtype in HType.get_np_scalar_types(): + shape = tuple(np.random.randint( 8, size=(shape_len,) )+1) + axis = np.random.randint(shape_len+1) + stack_count = np.random.randint(4)+1 + + print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='') + + vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ] + stack_n = np.stack(vals_n, axis) + + vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ] + stack_t = op.stack(vals_t, axis) + + print(f'{stack_n.shape} == {stack_t.shape} ... 
', end='') + + if stack_n.shape != stack_t.shape: + raise Exception('shape is not equal') + + if not _all_close(stack_t.np(), stack_n): + raise Exception(f'data is not equal') + + print('pass') + +def reduce_test(): + for op_type in ['sum', 'mean', 'min', 'max']: + for dtype in HType.get_np_scalar_types(): + if dtype != np.bool_: + for shape_len in range(2, 5): + shape = np.random.randint( 8, size=(shape_len,) )+1 + + reduction_axes = np.array([*range(shape_len)]) + np.random.shuffle(reduction_axes) + + # Cut random amount of reduction_axes + reduction_axes = tuple(reduction_axes [:np.random.randint(shape_len+1)]) + if len(reduction_axes) == 0: + reduction_axes = None + + keepdims = np.random.randint(2) == 0 + + print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='') + + if dtype in [np.float16, np.float32, np.float64]: + value_n = np.random.uniform(size=shape).astype(dtype) + else: + value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype ) + + value_t = Tensor.from_value(value_n) + + if op_type == 'sum': + reducted_n = value_n.sum(reduction_axes, keepdims=keepdims) + reducted_t = value_t.sum(reduction_axes, keepdims=keepdims) + elif op_type == 'mean': + reducted_n = value_n.mean(reduction_axes, keepdims=keepdims) + reducted_t = value_t.mean(reduction_axes, keepdims=keepdims) + elif op_type == 'max': + reducted_n = value_n.max(reduction_axes, keepdims=keepdims) + reducted_t = value_t.max(reduction_axes, keepdims=keepdims) + elif op_type == 'min': + reducted_n = value_n.min(reduction_axes, keepdims=keepdims) + reducted_t = value_t.min(reduction_axes, keepdims=keepdims) + + print(f'{reducted_n.shape} == {reducted_t.shape} ... ') + + if not _all_close(reducted_t.np(), reducted_n): + raise Exception(f'data is not equal') + + print('pass') + + +def InitRandomUniform_test(): + for dtype in HType.get_np_scalar_types(): + for shape_len in range(1, 5): + shape = np.random.randint( 8, size=(shape_len,) )+1 + + print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='') + + Tensor(shape, dtype, initializer=InitRandomUniform()).np() + + print('pass') + +def InitCoords2DArange_test(): + for dtype in HType.get_np_scalar_types(): + for shape_len in range(2, 5): + shape = np.random.randint( 1, 60, size=(shape_len,) ).tolist() + shape = shape + ([2] if np.random.randint(2) == 0 else [3]) + h_start = np.random.randint(80) + h_stop = h_start + np.random.randint(80) + w_start = np.random.randint(80) + w_stop = w_start + np.random.randint(80) + + print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='') + + Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np() + + print('pass') + +def concat_test(): + for shape_len in range(2, 5): + for dtype in HType.get_np_scalar_types(): + shape = (np.random.randint( 8, size=(shape_len,) )+1).tolist() + axis = np.random.randint(shape_len) + count = np.random.randint(4)+1 + + shapes = tuple( tuple( dim if i != axis else np.random.randint(8)+1 + for i,dim in enumerate(shape) ) + for shape in ([shape] * count) ) + + print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='') + + V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ] + O_n = np.concatenate(V_n, axis) + + print(f'{O_n.shape} == ', end='') + + V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ] + O_t = op.concat(V_t, axis) + + print(f'{O_t.shape} ... 
', end='') + + if O_n.shape != O_t.shape: + raise Exception('shape is not equal') + + if not all ( np.ndarray.flatten( O_t.np() == O_n ) ): + raise Exception(f'data is not equal') + + print('pass') + +def matmul_test(): + for _ in range(100): + for dtype in [np.float32]: + BATCH = np.random.randint(8)+1 + M = np.random.randint(8)+1 + N = np.random.randint(32768)+1 + K = np.random.randint(32768)+1 + + while K*N > ( 8000000 // BATCH ): + K = max(1, K // 2) + N = max(1, N // 2) + + if np.random.randint(2) == 0: + size = [2,4,8,16][np.random.randint(4)] + M = max(1, M // size) * size + N = max(1, N // size) * size + K = max(1, K // size) * size + + if BATCH == 1: + A_shape = (M, K) + B_shape = (K, N) + else: + A_shape = (BATCH, M, K) + B_shape = (BATCH, K, N) + + print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='') + + A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype) + B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype) + + O_n = np.matmul(A_n, B_n) + + print(f'{O_n.shape} == ', end='') + + A_t = Tensor.from_value(A_n) + B_t = Tensor.from_value(B_n) + O_t = op.matmul(A_t, B_t) + print(f'{O_t.shape} ... ', end='') + + if O_n.shape != O_t.shape: + raise Exception('shape is not equal') + if not _all_close (O_t.np(), O_n): + raise Exception(f'data is not equal') + + print('pass') + +def slice_test(): + for iteration in [0,1]: + for shape_len in range(5,1,-1): + for dtype in HType.get_np_scalar_types(): + while True: + shape = np.random.randint( 1, 8, size=(shape_len,) ) + + if iteration == 0: + slices = [ slice(None,None,None), ] * shape_len + axis = np.random.randint(shape_len) + shape[axis] = 1 + slices[axis] = 0 + else: + slices = [] + for i in range (shape_len): + axis_size = shape[i] + b = np.random.randint(axis_size) + e = np.random.randint(axis_size) + if b == e: + slices.append(b) + else: + if b < e: + s = 1 + if b == 0: + b = None + if e == axis_size-1: + e = None + else: + s = -1 + if b == axis_size-1: + b = None + if e == 0: + e = None + slices.append ( slice(b,e,s) ) + + if np.random.randint(2) == 0: + axis = np.random.randint(shape_len) + slices[axis] = Ellipsis + + shape = tuple(shape) + slices = tuple(slices) + + print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='') + + val_n = np.random.randint( 2**8, size=shape ).astype(dtype) + + sliced_n = val_n[slices] + + print(f'{sliced_n.shape} ... ', end='') + + sliced_t = Tensor.from_value(val_n)[slices] + + print(f'{sliced_t.shape} ... ', end='') + + if 0 in sliced_n.shape: + # some cases like 0:1:-1 will produce zero shape and invalid array on numpy + # but nn.slice has no such behaviour, thus we have to generate new slice again + print('pass (bad case)') + continue + + if np.prod(sliced_n.shape) != sliced_t.shape.size: + raise Exception(f'shape is not equal') + + if not _all_close(sliced_t.np(), sliced_n): + raise Exception(f'data is not equal') + + print('pass') + break + + +def transpose_test(): + for dtype in HType.get_np_scalar_types(): + for shape_len in range(2, 5): + shape = np.random.randint( 8, size=(shape_len,) )+1 + axes_order = np.array([*range(shape_len)]) + np.random.shuffle(axes_order) + + print(f'transpose: {shape} {axes_order} ... ', end='') + + val_n = np.random.randint( 2**8, size=shape ).astype(dtype) + transposed_n = np.transpose(val_n, axes_order) + + print(f'{transposed_n.shape} ... ', end='') + + val_t = Tensor.from_value(val_n) + transposed_t = op.transpose (val_t, axes_order ) + + print(f'{transposed_t.shape} ... 
', end='')
+
+        if transposed_n.shape != transposed_t.shape:
+            raise Exception('shape is not equal')
+        if not all ( np.ndarray.flatten( transposed_t.np() == transposed_n ) ):
+            raise Exception(f'data is not equal {shape} {axes_order}')
+
+        print('pass')
+
+
+def any_wise_op_test():
+    for op_type in ['square', '+', '-', '*', '/', 'min', 'max']:
+        for dtype in HType.get_np_scalar_types():
+            if dtype != np.bool_:
+
+                for shape_len in range(1, 5):
+                    a_shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
+
+                    if np.random.randint(2) == 0:
+                        b_shape = tuple(a_shape[np.random.randint(len(a_shape)):])
+                        b_shape = (1,) if len(b_shape) == 0 else b_shape
+                    else:
+                        b_shape = list(a_shape)
+                        b_shape[ np.random.randint(len(b_shape)) ] = 1
+                        b_shape = tuple(b_shape)
+
+                    shapes = [a_shape, b_shape]
+                    if np.random.randint(2) == 0:
+                        shapes = shapes[::-1]
+                    a_shape, b_shape = shapes
+
+                    print(f'any_wise: {a_shape} {op_type} {b_shape}:{np.dtype(dtype).name} ...', end='')
+
+                    a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
+                    b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)
+                    a_t = Tensor.from_value(a_n)
+                    b_t = Tensor.from_value(b_n)
+
+                    if op_type == '+':
+                        r_t = a_t + b_t
+                    elif op_type == '-':
+                        r_t = a_t - b_t
+                    elif op_type == '*':
+                        r_t = a_t * b_t
+                    elif op_type == '/':
+                        r_t = a_t / b_t
+                    elif op_type == 'min':
+                        r_t = op.min_(a_t, b_t)
+                    elif op_type == 'max':
+                        r_t = op.max_(a_t, b_t)
+                    elif op_type == 'square':
+                        r_t = op.square(a_t)
+
+                    if op_type in ['+','-','*','/']:
+                        r_n = eval(f'a_n {op_type} b_n')
+                        r_n = r_n.astype(dtype)
+                    elif op_type == 'min':
+                        r_n = np.minimum(a_n, b_n)
+                    elif op_type == 'max':
+                        r_n = np.maximum(a_n, b_n)
+                    elif op_type == 'square':
+                        r_n = np.square(a_n)
+
+                    if r_n.shape != r_t.shape:
+                        raise Exception('shapes are not equal')
+
+                    if not _all_close(r_t.np(), r_n):
+                        raise Exception('data is not equal')
+
+                    print('pass')
+
+
diff --git a/xlib/avecl/_internal/SCacheton.py b/xlib/avecl/_internal/SCacheton.py
new file mode 100644
index 0000000..f14186a
--- /dev/null
+++ b/xlib/avecl/_internal/SCacheton.py
@@ -0,0 +1,55 @@
+class SCacheton:
+    """
+    Static class for caching class instances and vars by hashable arguments
+    """
+    cachetons = {}
+    cachevars = {}
+
+    @staticmethod
+    def get(cls, *args, **kwargs):
+        """
+        Get a class instance cached by args/kwargs.
+        If it does not exist, a new one is created with *args,**kwargs.
+        All cached data will be freed with cleanup()
+
+        You must not store Tensor in SCacheton, use per-device cache vars
+        """
+        cls_multitons = SCacheton.cachetons.get(cls, None)
+        if cls_multitons is None:
+            cls_multitons = SCacheton.cachetons[cls] = {}
+
+        key = (args, tuple(kwargs.items()) )
+
+        data = cls_multitons.get(key, None)
+        if data is None:
+            data = cls_multitons[key] = cls(*args, **kwargs)
+
+        return data
+
+    @staticmethod
+    def set_var(var_name, value):
+        """
+        Set data cached by var_name.
+        All cached data will be freed with cleanup()
+
+        You must not store Tensor in SCacheton, use per-device cache vars
+        """
+        SCacheton.cachevars[var_name] = value
+
+    @staticmethod
+    def get_var(var_name):
+        """
+        Get data cached by var_name.
+        All cached data will be freed with cleanup()
+
+        You must not store Tensor in SCacheton, use per-device cache vars
+        """
+        return SCacheton.cachevars.get(var_name, None)
+
+    @staticmethod
+    def cleanup():
+        """
+        Free all cached objects
+        """
+        SCacheton.cachetons = {}
+        SCacheton.cachevars = {}
\ No newline at end of file
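Note (not part of the patch): a minimal sketch of the SCacheton contract, assuming `xlib.avecl` is importable; `PlanExample` is a hypothetical class used only for illustration:

    import numpy as np
    from xlib.avecl import SCacheton

    class PlanExample:                      # hypothetical cacheable class
        def __init__(self, shape, dtype):
            self.shape, self.dtype = shape, dtype

    a = SCacheton.get(PlanExample, (2, 4), np.float32)
    b = SCacheton.get(PlanExample, (2, 4), np.float32)
    assert a is b        # same hashable args -> same cached instance
    SCacheton.cleanup()  # frees all cached instances and vars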
diff --git a/xlib/avecl/_internal/Tensor.py b/xlib/avecl/_internal/Tensor.py
new file mode 100644
index 0000000..bbab164
--- /dev/null
+++ b/xlib/avecl/_internal/Tensor.py
@@ -0,0 +1,124 @@
+import weakref
+from typing import Iterable, Union
+
+import numpy as np
+
+from .AShape import *
+from .backend import *
+from .HType import *
+
+
+class Tensor:
+    ### CONSTRUCTORS
+
+    def __init__(self, shape : Union[AShape, Iterable],
+                       dtype : np.dtype,
+                       device : Union[None, int, Device, DeviceInfo] = None,
+                       initializer : 'Initializer' = None,
+                 ):
+        Tensor._object_count += 1
+
+        self._shape = shape = AShape(shape)
+        self._dtype = dtype = np.dtype(dtype)
+        self._device = device = get_device(device)
+        if device is None:
+            raise Exception('No device.')
+
+        self._seq_id = Tensor._seq_id = Tensor._seq_id + 1
+        self._buffer = Buffer(device, size=shape.size*dtype.itemsize,
+                              on_initialize= lambda initializer=initializer, wself=weakref.ref(self): initializer.initialize_tensor( wself() ) if initializer is not None else None )
+
+    @staticmethod
+    def from_value(value, dtype=None, device:Union[None,int,Device,DeviceInfo]=None) -> 'Tensor':
+        if isinstance(value, Tensor):
+            device = value.get_device()
+        elif not isinstance(value, np.ndarray):
+            if HType.is_scalar_type(value):
+                if dtype is None:
+                    raise ValueError('dtype must be specified for single value')
+                value = np.array([value], dtype=dtype)
+            elif isinstance(value, Iterable):
+                if dtype is None:
+                    raise ValueError('dtype must be specified for Iterable of values')
+                value = np.array(value, dtype=dtype)
+            else:
+                raise ValueError(f'Unsupported value type {value.__class__}')
+
+        return Tensor(shape=value.shape, dtype=value.dtype, device=device).set(value)
+
+    ### PROPERTIES
+
+    @property
+    def shape(self): return self._shape
+    @property
+    def ndim(self): return len(self._shape)
+    @property
+    def dtype(self) -> np.dtype: return self._dtype
+    @property
+    def name(self) -> str: return f'#{self.get_seq_id()}'
+
+    ### GETTERS/SETTERS
+
+    def get_buffer(self) -> Buffer: return self._buffer
+    def get_device(self) -> Device: return self._device
+    def get_seq_id(self) -> int: return self._seq_id
+
+    def set(self, value : Union['Tensor',np.ndarray,int,float]) -> 'Tensor':
+        if isinstance(value, Tensor):
+            if self.shape.size != value.shape.size:
+                raise ValueError('Unable to set the data from other tensor: shape.size is not the same.')
+            self.get_buffer().set(value.get_buffer())
+        else:
+            if isinstance(value, np.ndarray):
+                value = value.astype(self.dtype)
+            else:
+                if HType.is_scalar_type(value):
+                    value = np.array([value], dtype=self.dtype)
+                elif isinstance(value, Iterable):
+                    value = np.array(value, dtype=self.dtype)
+                else:
+                    raise ValueError(f'Unsupported value type {value.__class__}')
+
+            self.get_buffer().set(value)
+        return self
+
+    def np(self):
+        """Returns numpy value of a Tensor"""
+        return self.get_buffer().np(self.shape, self.dtype)
+
+
+    ### OPERATORS
+
+    def __add__(self, value) -> 'Tensor': ...
+    def __radd__(self, value) -> 'Tensor': ...
+    def __sub__(self, value) -> 'Tensor': ...
+    def __rsub__(self, value) -> 'Tensor': ...
+    def __mul__(self, value) -> 'Tensor': ...
+    def __rmul__(self, value) -> 'Tensor': ...
+    def __truediv__(self, value) -> 'Tensor': ...
+    def __rtruediv__(self, value) -> 'Tensor': ...
+    def __neg__(self) -> 'Tensor': ...
+    def __getitem__(self, slices) -> 'Tensor': ...
+    def __setitem__(self, slices, value): ...
+
+    def as_shape(self, shape) -> 'Tensor': ...
+    def copy(self) -> 'Tensor': ...
+    def max(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def mean(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def min(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def reshape(self, new_shape) -> 'Tensor': ...
+    def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
+
+    @property
+    def T(self): return self.transpose( tuple(range(self.shape.ndim-1,-1,-1)) )
+
+    ### INTERNAL METHODS
+
+    def __del__(self):
+        Tensor._object_count -= 1
+
+    _object_count = 0
+    _seq_id = 0
+
+__all__ = ['Tensor']
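Note (not part of the patch): a round-trip sketch, assuming `xlib.avecl` is importable and an OpenCL 1.2 device is available:

    import numpy as np
    from xlib.avecl import Tensor

    n = np.arange(6, dtype=np.float32).reshape(2, 3)
    t = Tensor.from_value(n)            # allocated on the default device
    assert t.shape == (2, 3)            # AShape compares against plain tuples
    assert np.array_equal(t.np(), n)    # copy back to numpy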
+ def mean(self, axes=None, keepdims=False) -> 'Tensor': ... + def min(self, axes=None, keepdims=False) -> 'Tensor': ... + def reshape(self, new_shape) -> 'Tensor': ... + def sum(self, axes=None, keepdims=False) -> 'Tensor': ... + def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ... + + @property + def T(self): return self.transpose( tuple(range(self.shape.ndim-1,-1,-1)) ) + + ### INTERNAL METHODS + + def __del__(self): + Tensor._object_count -= 1 + + _object_count = 0 + _seq_id = 0 + +__all__ = ['Tensor'] diff --git a/xlib/avecl/_internal/TensorImpl.py b/xlib/avecl/_internal/TensorImpl.py new file mode 100644 index 0000000..41587d4 --- /dev/null +++ b/xlib/avecl/_internal/TensorImpl.py @@ -0,0 +1,92 @@ +from .op import * +from .AShape import * +from .backend import * +from .Tensor import Tensor + +def Tensor__str__(self : Tensor): return f"T {self.name} {self.shape} {self.dtype.name}" +Tensor.__str__ = Tensor__str__ + +def Tensor__repr__(self : Tensor): + s = self.__str__() + '\n' + s += str(self.np()) + '\n' + s += self.__str__() + return s +Tensor.__repr__ = Tensor__repr__ + +def Tensor__add__(self : Tensor, value) -> Tensor: + return add(self, value) +Tensor.__add__ = Tensor__add__ + +def Tensor__radd__(self : Tensor, value) -> Tensor: + return add(value, self) +Tensor.__radd__ = Tensor__radd__ + +def Tensor__sub__(self : Tensor, value) -> Tensor: + return sub(self, value) +Tensor.__sub__ = Tensor__sub__ + +def Tensor__rsub__(self : Tensor, value) -> Tensor: + return sub(value, self) +Tensor.__rsub__ = Tensor__rsub__ + +def Tensor__mul__(self : Tensor, value) -> Tensor: + if self is value: + return square(self) + return mul(self, value) +Tensor.__mul__ = Tensor__mul__ + +def Tensor__rmul__(self : Tensor, value) -> Tensor: + if self is value: + return square(self) + return mul(value, self) +Tensor.__rmul__ = Tensor__rmul__ + +def Tensor__truediv__(self : Tensor, value) -> Tensor: + return div(self, value) +Tensor.__truediv__ = Tensor__truediv__ + +def Tensor__rtruediv__(self : Tensor, value) -> Tensor: + return div(value, self) +Tensor.__rtruediv__ = Tensor__rtruediv__ + +def Tensor__neg__(self : Tensor): + raise NotImplementedError() +Tensor.__neg__ = Tensor__neg__ + +Tensor.__getitem__ = slice_ +Tensor.__setitem__ = slice_set + +def Tensor_as_shape(self : Tensor, shape) -> Tensor: + return TensorRef(self, shape) +Tensor.as_shape = Tensor_as_shape + +def Tensor_copy(self : Tensor) -> Tensor: + return Tensor.from_value(self) +Tensor.copy = Tensor_copy + +Tensor.max = reduce_max +Tensor.mean = reduce_mean +Tensor.min = reduce_min +Tensor.reshape = reshape +Tensor.sum = reduce_sum +Tensor.transpose = transpose
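With the overloads above wired up, Tensors compose with ordinary Python arithmetic. A short sketch, assuming the default device (shapes and values are arbitrary):

import numpy as np
from xlib.avecl import Tensor

a = Tensor.from_value(np.ones((2, 3), np.float32))
b = Tensor.from_value(np.full((2, 3), 2.0, dtype=np.float32))
c = (a + b) * b - a / b     # dispatches to the any_wise add/mul/sub/div ops
print(c.np())               # reads the result back into a np.ndarray

+ +class TensorRef(Tensor): + """ + TensorRef is used to interpret an existing Tensor with a different shape.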
+ Use the Tensor.as_shape() method to create one. + """ + + def __init__(self, t : Tensor, shape): + shape = AShape(shape) + if t.shape.size != shape.size: + raise ValueError(f'Cannot interpret shape {t.shape} as ref shape {shape}') + super().__init__(shape, t.dtype, device=t.get_device()) + self._t = t + + # Forward methods to original tensor + def get_seq_id(self) -> int: return self._t.get_seq_id() + def get_buffer(self) -> Buffer: return self._t.get_buffer() + def get_device(self) -> Device: return self._t.get_device() + +__all__ = [] diff --git a/xlib/avecl/_internal/backend/Buffer.py b/xlib/avecl/_internal/backend/Buffer.py new file mode 100644 index 0000000..94401d4 --- /dev/null +++ b/xlib/avecl/_internal/backend/Buffer.py @@ -0,0 +1,109 @@ +from typing import Iterable, Union + +import numpy as np + +from . import OpenCL as CL + +class Buffer: + __slots__ = ['_device','_cl_mem','_size','_on_initialize'] + + def __init__(self, device : 'Device', size : int, on_initialize = None): + """ + Represents a physical buffer associated with a physical device + + device Device + + size int + """ + + Buffer._object_count += 1 + self._device = device + self._size = size + self._cl_mem = None + self._on_initialize = on_initialize + + def __del__(self): + Buffer._object_count -= 1 + self.free_cl_mem() + + def get_device(self) -> 'Device': return self._device + def get_size(self) -> int: return self._size + + def has_cl_mem(self) -> bool: return self._cl_mem is not None + def get_cl_mem(self) -> CL.cl_mem: + if self._cl_mem is None: + self._cl_mem = self._device._cl_mem_pool_alloc(self._size) + if self._on_initialize is not None: + self._on_initialize() + + return self._cl_mem + + def free_cl_mem(self): + if self._cl_mem is not None: + self._device._cl_mem_pool_free(self._cl_mem) + self._cl_mem = None + + def set(self, value : Union['Buffer', np.ndarray]): + """ + Parameters + + value Buffer copy data from other Buffer. + + np.ndarray copies values from ndarray + to Buffer's memory + + """ + if isinstance(value, Buffer): + if self != value: + if self._size != value._size: + raise Exception(f'Unable to copy from Buffer with {value._size} size to buffer with {self._size} size.') + + if self._device == value._device: + CL.clEnqueueCopyBuffer(self._device._get_ctx_q(), value.get_cl_mem(), self.get_cl_mem(), 0, 0, self._size, 0, None, None) + else: + # Transfer between devices would be slow; not implemented + raise NotImplementedError() + else: + if not isinstance(value, np.ndarray): + raise ValueError (f'Invalid type {value.__class__}.
Must be np.ndarray.') + + if value.nbytes != self._size: + raise ValueError(f'Value size {value.nbytes} does not match Buffer size {self._size}.') + + if not value.flags.contiguous: + value = value.reshape(-1) + if not value.flags.contiguous: + raise ValueError ("Unable to write from non-contiguous np array.") + + ev = CL.cl_event() + + clr = CL.clEnqueueWriteBuffer(self._device._get_ctx_q(), self.get_cl_mem(), False, 0, value.nbytes, value.ctypes.data_as(CL.c_void_p), 0, None, ev) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clEnqueueWriteBuffer error: {clr}') + + CL.clWaitForEvents(1, ( CL.cl_event * 1 )(ev) ) + CL.clReleaseEvent(ev) + + def np(self, shape : Iterable, dtype : np.dtype): + """ + Returns data of buffer as np.ndarray with specified shape and dtype + """ + out_np_value = np.empty (shape, dtype) + + if out_np_value.nbytes != self._size: + raise ValueError(f'Unable to represent Buffer with size {self._size} as shape {shape} with dtype {dtype}') + + clr = CL.clEnqueueReadBuffer(self._device._get_ctx_q(), self.get_cl_mem(), True, 0, self._size, out_np_value.ctypes.data, 0, None, None) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clEnqueueReadBuffer error: {clr}') + + return out_np_value + + def __str__(self): + return f'Buffer [{self._size} bytes][{f"{self._cl_mem.value}" if self._cl_mem is not None else "unallocated"}] on {str(self._device)}' + + def __repr__(self): + return self.__str__() + + _object_count = 0 diff --git a/xlib/avecl/_internal/backend/Device.py b/xlib/avecl/_internal/backend/Device.py new file mode 100644 index 0000000..9d439e9 --- /dev/null +++ b/xlib/avecl/_internal/backend/Device.py @@ -0,0 +1,522 @@ +from typing import List, Union + +import numpy as np + +from . import OpenCL as CL +from .Buffer import Buffer +from .DeviceInfo import DeviceInfo +from .Kernel import Kernel + +_np_dtype_to_cl = { np.uint8: CL.cl_uchar, + np.int8: CL.cl_char, + np.uint16: CL.cl_ushort, + np.int16: CL.cl_short, + np.uint32: CL.cl_uint, + np.int32: CL.cl_int, + np.uint64: CL.cl_ulong, + np.int64: CL.cl_long, + np.float16: CL.cl_half, + np.float32: CL.cl_float, + np.float64: CL.cl_double } + +_opencl_device_ids = None +_default_device = None +_devices = {} + +class Device: + """ + Represents physical TensorCL device + """ + + def __init__(self, device_info : DeviceInfo, **kwargs): + if kwargs.get('_check', None) is None: + raise Exception('You should not to create Device from constructor. Use get_device()') + + self._cached_data = {} # cached data (per device) by key + self._pooled_buffers = {} # Pool of cached device buffers. 
+ self._compiled_kernels = {} # compiled kernels by key + self._ctx_q = None # CL command queue + self._ctx = None # CL context + + self._total_memory_allocated = 0 + self._total_buffers_allocated = 0 + self._total_memory_pooled = 0 + self._total_buffers_pooled = 0 + + self._device_info = device_info + self._device_id = _get_opencl_device_ids()[device_info.get_index()] + + def __del__(self): + self.cleanup() + + def __eq__(self, other): + if self is not None and other is not None and isinstance(self, Device) and isinstance(other, Device): + return self._device_id.value == other._device_id.value + return False + + def __hash__(self): + return self._device_id.value + + def _get_ctx(self) -> CL.cl_context: + # Create OpenCL context on demand + if self._ctx is None: + clr = CL.CLRESULT() + ctx = CL.clCreateContext( None, 1, (CL.cl_device_id * 1)( self._device_id ), None, None, clr) + if clr != CL.CLERROR.SUCCESS: + raise Exception('Unable to create OpenCL context.') + self._ctx = ctx + return self._ctx + + def _get_ctx_q(self) -> CL.cl_command_queue: + # Create CommandQueue on demand + if self._ctx_q is None: + clr = CL.CLRESULT() + ctx_q = CL.clCreateCommandQueue(self._get_ctx(), self._device_id, CL.cl_command_queue_properties(0), clr) + if clr != CL.CLERROR.SUCCESS: + raise Exception('Unable to create OpenCL CommandQueue.') + self._ctx_q = ctx_q + return self._ctx_q + + def get_description(self) -> str: + return f"{self._device_info.get_name()} [{(self._device_info.get_total_memory() / 1024**3) :.3}Gb]" + + def __str__(self): + return self.get_description() + + def __repr__(self): + return f'{self.__class__.__name__} object: ' + self.__str__() + + def set_cached_data(self, key, value): + """ + + All cached data will be freed with cleanup() + """ + self._cached_data[key] = value + + def get_cached_data(self, key): + return self._cached_data.get(key, None) + + def get_total_allocated_memory(self): + return self._total_memory_allocated + + def get_max_malloc_size(self) -> int: + size = CL.cl_ulong() + clr = CL.clGetDeviceInfo(self._device_id, CL.CL_DEVICE_MAX_MEM_ALLOC_SIZE, CL.sizeof(size), CL.byref(size), None) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clGetDeviceInfo error: {clr}') + return size.value + + def _compile_kernel(self, key, kernel_text) -> CL.cl_kernel: + """ + compile or get cached kernel + """ + + compiled_krn, prog = self._compiled_kernels.get(key, (None, None) ) + + if compiled_krn is None: + clr = CL.CLRESULT() + prog = CL.clCreateProgramWithSource(self._get_ctx(), 1, CL.c_char_p(kernel_text.encode()), None, clr ) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clCreateProgramWithSource error {clr}, with kernel_text:\n{kernel_text}') + + clr = CL.clBuildProgram(prog, 1, (CL.cl_device_id*1)(self._device_id), CL.c_char_p('-cl-std=CL1.2 -cl-single-precision-constant'.encode()), None, None ) + if clr != CL.CLERROR.SUCCESS: + build_log_size = CL.c_size_t() + clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, 0, None, CL.byref(build_log_size) ) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clGetProgramBuildInfo,error: {clr}') + + build_log = CL.create_string_buffer(build_log_size.value) + clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, build_log_size.value, build_log, None ) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clGetProgramBuildInfo error: {clr}') + + build_log = str(build_log.value, 'utf-8') + raise Exception(f'clBuildProgram error:\n\n{build_log}') + + num_kernels = CL.cl_uint() + clr = 
CL.clCreateKernelsInProgram(prog, 0, None, CL.byref(num_kernels)) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clCreateKernelsInProgram error: {clr}') + + if num_kernels.value != 1: + raise Exception(f'Kernel must contain only one __kernel:\n\n{kernel_text}') + + kernels = (CL.cl_kernel * num_kernels.value)() + clr = CL.clCreateKernelsInProgram(prog, num_kernels.value, kernels, None) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clCreateKernelsInProgram error: {clr}') + + compiled_krn = kernels[0] + self._compiled_kernels[key] = (compiled_krn, prog) + + return compiled_krn + + def _cl_mem_alloc(self, size) -> CL.cl_mem: + clr = CL.CLRESULT() + mem = CL.clCreateBuffer(self._get_ctx(), CL.CL_MEM_READ_WRITE, size, None, clr) + if clr == CL.CLERROR.SUCCESS: + # Fill one byte to check memory availability + ev = CL.cl_event() + clr = CL.clEnqueueFillBuffer (self._get_ctx_q(), mem, (CL.c_char * 1)(), 1, 0, 1, 0, None, ev ) + if clr == CL.CLERROR.SUCCESS: + CL.clReleaseEvent(ev) + self._total_memory_allocated += size + self._total_buffers_allocated += 1 + return mem + return None + + def _cl_mem_free(self, mem : CL.cl_mem): + size = CL.c_size_t() + clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None ) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clGetMemObjectInfo error: {clr}') + size = size.value + self._total_memory_allocated -= size + self._total_buffers_allocated -= 1 + clr = CL.clReleaseMemObject(mem) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clReleaseMemObject error: {clr}') + + def _cl_mem_pool_alloc(self, size): + """ + allocate or get cl_mem from pool + """ + pool = self._pooled_buffers + + # First try to get pooled buffer + ar = pool.get(size, None) + if ar is not None and len(ar) != 0: + mem = ar.pop(-1) + self._total_memory_pooled -= size + self._total_buffers_pooled -= 1 + else: + # No pooled buffer, try to allocate new + while True: + mem = self._cl_mem_alloc(size) + if mem is None: + # MemoryError. Finding largest pooled buffer to release + buf_to_release = None + for size_key in sorted(list(pool.keys()), reverse=True): + ar = pool[size_key] + if len(ar) != 0: + buf_to_release = ar.pop(-1) + break + + if buf_to_release is not None: + # Release pooled buffer and try to allocate again + self._cl_mem_free(buf_to_release) + continue + + raise Exception(f'Unable to allocate {size // 1024**2}Mb on {str(self)}') + + + break + + return mem + + def _cl_mem_pool_free(self, mem : CL.cl_mem): + """ + Put cl_mem to pool for reuse in future. 
+ """ + size = CL.c_size_t() + clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None ) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clGetMemObjectInfo error: {clr}') + size = size.value + + pool = self._pooled_buffers + ar = pool.get(size, None) + if ar is None: + ar = pool[size] = [] + ar.append(mem) + + self._total_memory_pooled += size + self._total_buffers_pooled += 1 + + def print_stat(self): + s = f''' +Total memory allocated: {self._total_memory_allocated} +Total buffers allocated: {self._total_buffers_allocated} +Total memory pooled: {self._total_memory_pooled} +Total buffers pooled: {self._total_buffers_pooled} +N of compiled kernels: {len(self._compiled_kernels)} +N of cacheddata: {len(self._cached_data)} +''' + print(s) + + def run_kernel(self, kernel : Kernel, *args, global_shape=None, local_shape=None, global_shape_offsets=None, wait=False): + """ + Run kernel on Device + + Arguments + + *args arguments will be passed to OpenCL kernel + allowed types: + + Buffer + np single value + + global_shape(None) tuple of ints, up to 3 dims + amount of parallel kernel executions. + in OpenCL kernel, + id can be obtained via get_global_id(dim) + + local_shape(None) tuple of ints, up to 3 dims + specifies local groups of every dim of global_shape. + in OpenCL kernel, + id can be obtained via get_local_id(dim) + + global_shape_offsets(None) tuple of ints + offsets for global_shape + + wait(False) wait execution to complete + """ + ckernel = self._compile_kernel(kernel, kernel.get_kernel_text()) + + if global_shape is None: + global_shape = kernel.get_global_shape() + if global_shape is None: + raise ValueError('global_shape must be defined.') + + work_dim = len(global_shape) + global_shape_ar = (CL.c_size_t*work_dim)() + for i,v in enumerate(global_shape): + global_shape_ar[i] = v + + local_shape_ar = None + if local_shape is None: + local_shape = kernel.get_local_shape() + if local_shape is not None: + if len(local_shape) != work_dim: + raise ValueError('len of local_shape must match global_shape') + + local_shape_ar = (CL.c_size_t*work_dim)() + for i,v in enumerate(local_shape): + local_shape_ar[i] = v + + + global_shape_offsets_ar = None + if global_shape_offsets is not None: + if len(global_shape_offsets) != work_dim: + raise ValueError('len of global_shape_offsets must match global_shape') + + global_shape_offsets_ar = (CL.c_size_t*work_dim)() + for i,v in enumerate(local_shape): + global_shape_offsets_ar[i] = v + + for i, arg in enumerate(args): + + if isinstance(arg, Buffer): + arg = arg.get_cl_mem() + else: + cl_type = _np_dtype_to_cl.get(arg.__class__, None) + if cl_type is None: + raise ValueError(f'Cannot convert type {arg.__class__} to OpenCL type.') + arg = cl_type(arg) + + clr = CL.clSetKernelArg(ckernel, i, CL.sizeof(arg), CL.byref(arg)) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clSetKernelArg error: {clr}') + + ev = CL.cl_event() if wait else None + + clr = CL.clEnqueueNDRangeKernel(self._get_ctx_q(), ckernel, work_dim, global_shape_offsets_ar, global_shape_ar, local_shape_ar, 0, None, ev) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clEnqueueNDRangeKernel error: {clr}') + + if wait: + CL.clWaitForEvents(1, (CL.cl_event*1)(ev) ) + CL.clReleaseEvent(ev) + + def wait(self): + """ + Wait to finish all queued operations on this Device + """ + clr = CL.clFinish(self._get_ctx_q()) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clFinish error: {clr}') + + def cleanup(self): + """ + Frees all resources from this Device. 
+ """ + self._cached_data = {} + + pool = self._pooled_buffers + for size_key in pool.keys(): + for mem in pool[size_key]: + self._cl_mem_free(mem) + self._pooled_buffers = {} + self._total_memory_pooled = 0 + self._total_buffers_pooled = 0 + + if self._total_memory_allocated != 0: + raise Exception('Unable to cleanup CLDevice, while not all Buffers are deallocated.') + + for kernel, prog in self._compiled_kernels.values(): + clr = CL.clReleaseKernel(kernel) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clReleaseKernel error: {clr}') + + clr = CL.clReleaseProgram(prog) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clReleaseProgram error: {clr}') + self._compiled_kernels = {} + + if self._ctx_q is not None: + clr = CL.clReleaseCommandQueue(self._ctx_q) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clReleaseCommandQueue error: {clr}') + self._ctx_q = None + + if self._ctx is not None: + clr = CL.clReleaseContext(self._ctx) + if clr != CL.CLERROR.SUCCESS: + raise Exception(f'clReleaseContext error: {clr}') + self._ctx = None + +def _get_opencl_device_ids() -> List[CL.cl_device_id]: + global _opencl_device_ids + if _opencl_device_ids is None: + _opencl_device_ids = [] + device_types = CL.CL_DEVICE_TYPE_CPU | CL.CL_DEVICE_TYPE_ACCELERATOR | CL.CL_DEVICE_TYPE_GPU + + while True: + num_platforms = CL.cl_uint() + if CL.clGetPlatformIDs(0, None, num_platforms) != CL.CLERROR.SUCCESS or \ + num_platforms.value == 0: + break + + platforms = (CL.cl_platform_id * num_platforms.value) () + if CL.clGetPlatformIDs(num_platforms.value, platforms, None) != CL.CLERROR.SUCCESS: + break + + for i_platform in range(num_platforms.value): + platform = platforms[i_platform] + num_devices = CL.cl_uint(0) + if CL.clGetDeviceIDs(platform, device_types, 0, None, num_devices) != CL.CLERROR.SUCCESS or \ + num_devices.value == 0: + continue + + device_ids = (CL.cl_device_id * num_devices.value)() + if CL.clGetDeviceIDs(platform, device_types, num_devices.value, device_ids, None) != CL.CLERROR.SUCCESS: + continue + + for i in range(num_devices.value): + device_id = device_ids[i] + # Check OpenCL version. 
+ if device_id is not None: + device_version_size = CL.c_size_t() + if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, 0, None, device_version_size) == CL.CLERROR.SUCCESS: + device_version = CL.create_string_buffer(device_version_size.value) + if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, device_version_size.value, device_version, None) == CL.CLERROR.SUCCESS: + device_version = str(device_version.value, 'ascii') + + major, minor = device_version.split(' ')[1].split('.') + opencl_version = int(major)*10+int(minor) + if opencl_version >= 12: + _opencl_device_ids.append(device_id) + break + return _opencl_device_ids + +def get_available_devices_info() -> List[DeviceInfo]: + """ + returns a list of available picklable DeviceInfo's + """ + devices = [] + for device_index, device_id in enumerate(_get_opencl_device_ids()): + device_name = 'undefined' + device_total_memory = 0 + + name_size = CL.c_size_t() + if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, 0, None, name_size) == CL.CLERROR.SUCCESS: + name_value = CL.create_string_buffer(name_size.value) + if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, name_size.value, name_value, None) == CL.CLERROR.SUCCESS: + device_name = str(name_value.value, 'ascii') + + global_mem_size = CL.cl_ulong() + if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_GLOBAL_MEM_SIZE, CL.sizeof(global_mem_size), CL.byref(global_mem_size), None) == CL.CLERROR.SUCCESS: + device_total_memory = global_mem_size.value + + vendor_id = CL.cl_uint() + CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VENDOR_ID, CL.sizeof(vendor_id), CL.byref(vendor_id), None) + vendor_id = vendor_id.value + + max_compute_units = CL.cl_uint() + CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_MAX_COMPUTE_UNITS, CL.sizeof(max_compute_units), CL.byref(max_compute_units), None) + max_compute_units = max_compute_units.value + + performance_level = max_compute_units + + if vendor_id == 0x8086: # Intel device + performance_level -= 1000 + + devices.append( DeviceInfo(index=device_index, name=device_name, total_memory=device_total_memory, performance_level=performance_level ) ) + + return devices + +def get_default_device() -> Union[Device, None]: + global _default_device + if _default_device is None: + _default_device = get_device(0) + return _default_device + +def set_default_device(device : Device): + if not isinstance(device, Device): + raise ValueError('device must be an instance of Device') + + global _default_device + _default_device = device + +def get_device(arg : Union[None, int, Device, DeviceInfo]) -> Union[Device, None]: + """ + get physical TensorCL device. + + arg None - get best device + int - by index + DeviceInfo - by device info + Device - returns the same + """ + global _devices + + if arg is None: + return get_best_device() + elif isinstance(arg, int): + devices_info = get_available_devices_info() + if arg < len(devices_info): + arg = devices_info[arg] + else: + return None + elif isinstance(arg, Device): + return arg + elif not isinstance(arg, DeviceInfo): + raise ValueError(f'Unknown type of arg {arg.__class__}') + + device = _devices.get(arg, None) + if device is None: + device = _devices[arg] = Device(arg, _check=1) + + return device + +def get_best_device() -> Union[Device, None]: + """ + returns best device from available. 
+ """ + perf_level = -999999 + result = None + for device_info in get_available_devices_info(): + dev_perf_level = device_info.get_performance_level() + if perf_level < dev_perf_level: + perf_level = dev_perf_level + result = device_info + if result is not None: + result = get_device(result) + return result + +def cleanup_devices(): + global _devices + + for device in list(_devices.values()): + device.cleanup() + _devices = {} diff --git a/xlib/avecl/_internal/backend/DeviceInfo.py b/xlib/avecl/_internal/backend/DeviceInfo.py new file mode 100644 index 0000000..358baaa --- /dev/null +++ b/xlib/avecl/_internal/backend/DeviceInfo.py @@ -0,0 +1,44 @@ +class DeviceInfo: + """ + Represents picklable OpenCL device info + """ + + def __init__(self, index : int = None, name : str = None, total_memory : int = None, performance_level : int = None): + self._index = index + self._name = name + self._total_memory = total_memory + self._performance_level = performance_level + + def __getstate__(self): + return self.__dict__.copy() + + def __setstate__(self, d): + self.__init__() + self.__dict__.update(d) + + def get_index(self) -> int: + return self._index + + def get_name(self) -> str: + return self._name + + def get_total_memory(self) -> int: + return self._total_memory + + def get_performance_level(self) -> int: + return self._performance_level + + def __eq__(self, other): + if self is not None and other is not None and isinstance(self, DeviceInfo) and isinstance(other, DeviceInfo): + return self._index == other._index + return False + + def __hash__(self): + return self._index + + def __str__(self): + return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]" + + def __repr__(self): + return f'{self.__class__.__name__} object: ' + self.__str__() + diff --git a/xlib/avecl/_internal/backend/Kernel.py b/xlib/avecl/_internal/backend/Kernel.py new file mode 100644 index 0000000..0ce31ae --- /dev/null +++ b/xlib/avecl/_internal/backend/Kernel.py @@ -0,0 +1,26 @@ +class Kernel: + """ + TensorCL kernel. + + It does not allocate any resources, thus can be used as static variable within class. + + arguments + + kernel_text OpenCL text of kernel. Must contain only one __kernel + + global_shape default global_shape for .run() + + local_shape default local_shape for .run() + """ + def __init__(self, kernel_text, global_shape=None, local_shape=None): + self._kernel_text = kernel_text + self._global_shape = global_shape + self._local_shape = local_shape + + def get_kernel_text(self) -> str: return self._kernel_text + def get_global_shape(self): return self._global_shape + def get_local_shape(self): return self._local_shape + + def __str__(self): return f'Kernel: \n{self._kernel_text}' + def __repr__(self): return self.__str__() + diff --git a/xlib/avecl/_internal/backend/OpenCL/OpenCL.py b/xlib/avecl/_internal/backend/OpenCL/OpenCL.py new file mode 100644 index 0000000..536591e --- /dev/null +++ b/xlib/avecl/_internal/backend/OpenCL/OpenCL.py @@ -0,0 +1,278 @@ +""" +Minimal OpenCL 1.2 low level ctypes API. 
+""" +import ctypes +from ctypes import POINTER, create_string_buffer, sizeof, c_char_p, c_char, c_size_t, c_void_p, byref +from ctypes.util import find_library +from enum import IntEnum + +dlls_by_name = {} +def dll_import(dll_name): + dll = dlls_by_name.get(dll_name, None) + if dll is None: + try: + dll = ctypes.cdll.LoadLibrary(find_library(dll_name)) + except: + pass + if dll is None: + raise RuntimeError(f'Unable to load {dll_name} library.') + dlls_by_name[dll_name] = dll + + def decorator(func): + dll_func = getattr(dll, func.__name__) + anno = list(func.__annotations__.values()) + dll_func.argtypes = anno[:-1] + dll_func.restype = anno[-1] + def wrapper(*args): + return dll_func(*args) + return wrapper + return decorator + +class cl_char(ctypes.c_int8): pass +class cl_uchar(ctypes.c_uint8): pass +class cl_short(ctypes.c_int16): pass +class cl_ushort(ctypes.c_uint16): pass +class cl_int(ctypes.c_int32): pass +class cl_uint(ctypes.c_uint32): pass +class cl_long(ctypes.c_int64): pass +class cl_ulong(ctypes.c_uint64): pass +class cl_half(ctypes.c_uint16): pass +class cl_float(ctypes.c_float): pass +class cl_double(ctypes.c_double): pass +class cl_bool(cl_uint): pass +class cl_bitfield(cl_ulong): + def __or__(self, other): + assert isinstance(other, self.__class__) + return self.__class__(self.value | other.value) + def __and__(self, other): + assert isinstance(other, self.__class__) + return self.__class__(self.value & other.value) + def __xor__(self, other): + assert isinstance(other, self.__class__) + return self.__class__(self.value ^ other.value) + def __not__(self): + return self.__class__(~self.value) + def __contains__(self, other): + assert isinstance(other, self.__class__) + return (self.value & other.value) == other.value + def __hash__(self): + return self.value.__hash__() + def __eq__(self, other): + if not isinstance(other, self.__class__): + return False + else: + return self.value == other.value + def __ne__(self, other): + return not(self == other) + def __repr__(self): + return f'cl_bitfield: {self.value}' + +class CLERROR(IntEnum): + SUCCESS = 0 + DEVICE_NOT_FOUND = -1 + DEVICE_NOT_AVAILABLE = -2 + COMPILER_NOT_AVAILABLE = -3 + MEM_OBJECT_ALLOCATION_FAILURE = -4 + OUT_OF_RESOURCES = -5 + OUT_OF_HOST_MEMORY = -6 + PROFILING_INFO_NOT_AVAILABLE = -7 + MEM_COPY_OVERLAP = -8 + IMAGE_FORMAT_MISMATCH = -9 + IMAGE_FORMAT_NOT_SUPPORTED = -10 + BUILD_PROGRAM_FAILURE = -11 + MAP_FAILURE = -12 + MISALIGNED_SUB_BUFFER_OFFSET = -13 + EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST = -14 + INVALID_VALUE = -30 + INVALID_DEVICE_TYPE = -31 + INVALID_PLATFORM = -32 + INVALID_DEVICE = -33 + INVALID_CONTEXT = -34 + INVALID_QUEUE_PROPERTIES = -35 + INVALID_COMMAND_QUEUE = -36 + INVALID_HOST_PTR = -37 + INVALID_MEM_OBJECT = -38 + INVALID_IMAGE_FORMAT_DESCRIPTOR = -39 + INVALID_IMAGE_SIZE = -40 + INVALID_SAMPLER = -41 + INVALID_BINARY = -42 + INVALID_BUILD_OPTIONS = -43 + INVALID_PROGRAM = -44 + INVALID_PROGRAM_EXECUTABLE = -45 + INVALID_KERNEL_NAME = -46 + INVALID_KERNEL_DEFINITION = -47 + INVALID_KERNEL = -48 + INVALID_ARG_INDEX = -49 + INVALID_ARG_VALUE = -50 + INVALID_ARG_SIZE = -51 + INVALID_KERNEL_ARGS = -52 + INVALID_WORK_DIMENSION = -53 + INVALID_WORK_GROUP_SIZE = -54 + INVALID_WORK_ITEM_SIZE = -55 + INVALID_GLOBAL_OFFSET = -56 + INVALID_EVENT_WAIT_LIST = -57 + INVALID_EVENT = -58 + INVALID_OPERATION = -59 + INVALID_GL_OBJECT = -60 + INVALID_BUFFER_SIZE = -61 + INVALID_MIP_LEVEL = -62 + INVALID_GLOBAL_WORK_SIZE = -63 + INVALID_PROPERTY = -64 + INVALID_GL_SHAREGROUP_REFERENCE_KHR = -1000 + 
PLATFORM_NOT_FOUND_KHR = -1001 + +class CLRESULT(cl_int): + def __eq__(self, other): + if isinstance(other, int): + return self.value == other + elif isinstance(other, self.__class__): + return self.value == other.value + else: + return False + def __ne__(self, other): + return not(self == other) + def __hash__(self): + return self.value.__hash__() + def __str__(self): + try: + return f'CLRESULT ({str(CLERROR(self.value))})' + except: + return f'CLRESULT ({self.value})' + def __repr__(self): + return self.__str__() + +class cl_platform_id(c_void_p): ... +class cl_platform_info(cl_uint): ... +class cl_device_id(c_void_p): ... +class cl_device_type(cl_bitfield): ... +class cl_device_info(cl_uint): ... +class cl_context(c_void_p): ... +class cl_context_properties(c_void_p): ... +class cl_command_queue(c_void_p): ... +class cl_command_queue_properties(cl_bitfield): ... +class cl_event(c_void_p): ... +class cl_mem(c_void_p): ... +class cl_mem_info(cl_uint): ... +class cl_mem_flags(cl_bitfield): ... +class cl_program(c_void_p): ... +class cl_program_build_info(cl_uint): ... +class cl_kernel(c_void_p): ... + +# https://github.com/KhronosGroup/OpenCL-Headers/blob/master/CL/cl.h +CL_PLATFORM_PROFILE = cl_platform_info(0x0900) +CL_PLATFORM_VERSION = cl_platform_info(0x0901) +CL_PLATFORM_NAME = cl_platform_info(0x0902) +CL_PLATFORM_VENDOR = cl_platform_info(0x0903) +CL_PLATFORM_EXTENSIONS = cl_platform_info(0x0904) + +CL_DEVICE_TYPE_DEFAULT = cl_device_type( (1 << 0) ) +CL_DEVICE_TYPE_CPU = cl_device_type( (1 << 1) ) +CL_DEVICE_TYPE_GPU = cl_device_type( (1 << 2) ) +CL_DEVICE_TYPE_ACCELERATOR = cl_device_type( (1 << 3) ) +CL_DEVICE_TYPE_ALL = cl_device_type( 0xFFFFFFFF ) + +CL_DEVICE_TYPE = cl_device_info (0x1000) +CL_DEVICE_VENDOR_ID = cl_device_info (0x1001) +CL_DEVICE_MAX_COMPUTE_UNITS = cl_device_info (0x1002) +CL_DEVICE_GLOBAL_MEM_SIZE = cl_device_info (0x101F) +CL_DEVICE_NAME = cl_device_info (0x102B) +CL_DEVICE_VERSION = cl_device_info (0x102F) +CL_DEVICE_MAX_MEM_ALLOC_SIZE = cl_device_info (0x1010) +CL_DEVICE_MAX_WORK_GROUP_SIZE = cl_device_info (0x1004) +CL_DRIVER_VERSION = cl_device_info (0x102D) +CL_DEVICE_EXTENSIONS = cl_device_info (0x1030) + +# cl_mem_flags +CL_MEM_READ_WRITE = cl_mem_flags( (1 << 0) ) +CL_MEM_WRITE_ONLY = cl_mem_flags( (1 << 1) ) +CL_MEM_READ_ONLY = cl_mem_flags( (1 << 2) ) +CL_MEM_USE_HOST_PTR = cl_mem_flags( (1 << 3) ) +CL_MEM_ALLOC_HOST_PTR = cl_mem_flags( (1 << 4) ) +CL_MEM_COPY_HOST_PTR = cl_mem_flags( (1 << 5) ) + +# cl_mem_info +CL_MEM_SIZE = cl_mem_info(0x1102) + +# cl_program_build_info +CL_PROGRAM_BUILD_STATUS = cl_program_build_info(0x1181) +CL_PROGRAM_BUILD_OPTIONS = cl_program_build_info(0x1182) +CL_PROGRAM_BUILD_LOG = cl_program_build_info(0x1183) + + +@dll_import('OpenCL') +def clGetPlatformIDs (num_entries : cl_uint, platforms : POINTER(cl_platform_id), num_platforms : POINTER(cl_uint) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clGetPlatformInfo (platform : cl_platform_id, param_name : cl_platform_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ... + +@dll_import('OpenCL') +def clGetDeviceIDs (platform : cl_platform_id, device_type : cl_device_type, num_entries : cl_uint, devices : POINTER(cl_device_id), num_devices : POINTER(cl_uint)) -> CLRESULT: ... + +@dll_import('OpenCL') +def clGetDeviceInfo(device : cl_device_id, param_name : cl_device_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ... 
+ +@dll_import('OpenCL') +def clCreateContext(properties : cl_context_properties, num_devices : cl_uint, devices : POINTER(cl_device_id), pfn_notify : c_void_p, user_data : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_context: ... + +@dll_import('OpenCL') +def clReleaseContext(context : cl_context) -> CLRESULT: ... + +@dll_import('OpenCL') +def clCreateCommandQueue(context : cl_context, device : cl_device_id, properties : cl_command_queue_properties, errcode_ret : POINTER(CLRESULT) ) -> cl_command_queue: ... + +@dll_import('OpenCL') +def clReleaseCommandQueue(command_queue : cl_command_queue) -> CLRESULT: ... + +@dll_import('OpenCL') +def clFinish(command_queue : cl_command_queue) -> CLRESULT: ... + +@dll_import('OpenCL') +def clWaitForEvents(num_events : cl_uint, event_list : POINTER(cl_event) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clReleaseEvent(event : cl_event) -> CLRESULT: ... + +@dll_import('OpenCL') +def clCreateBuffer(context : cl_context, flags : cl_mem_flags, size : c_size_t, host_ptr : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_mem: ... + +@dll_import('OpenCL') +def clGetMemObjectInfo(memobj : cl_mem, param_name : cl_mem_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clReleaseMemObject(memobj : cl_mem) -> CLRESULT: ... + +@dll_import('OpenCL') +def clEnqueueReadBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_read : cl_bool, offset : c_size_t, cb : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clEnqueueWriteBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_write : cl_bool, offset : c_size_t, size : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clEnqueueCopyBuffer (command_queue : cl_command_queue, src_buffer : cl_mem, dst_buffer : cl_mem, src_offset : c_size_t, dst_offset : c_size_t, cb : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : cl_event) -> CLRESULT: ... + +@dll_import('OpenCL') +def clEnqueueFillBuffer (command_queue : cl_command_queue, buffer : cl_mem, pattern : c_void_p, pattern_size : c_size_t, offset : c_size_t, size : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clCreateProgramWithSource (context : cl_context, count : cl_uint, strings : POINTER(c_char_p), lengths : POINTER(c_size_t), errcode_ret : POINTER(CLRESULT) ) -> cl_program: ... + +@dll_import('OpenCL') +def clReleaseProgram (program : cl_program) -> CLRESULT: ... + +@dll_import('OpenCL') +def clBuildProgram (program : cl_program, num_devices : cl_uint, device_list : POINTER(cl_device_id), options : c_char_p, pfn_notify : c_void_p, user_data : c_void_p) -> CLRESULT: ... + +@dll_import('OpenCL') +def clGetProgramBuildInfo (program : cl_program, device : cl_device_id, param_name : cl_program_build_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ... + +@dll_import('OpenCL') +def clCreateKernelsInProgram (program : cl_program, num_kernels : cl_uint, kernels : POINTER(cl_kernel), num_kernels_ret : POINTER(cl_uint) ) -> CLRESULT: ... 
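# Illustrative only (not part of the bindings): querying platforms with the raw API above.
# ctypes auto-byrefs an instance passed where a POINTER(...) argtype is declared.
#   num_platforms = cl_uint()
#   if clGetPlatformIDs(0, None, num_platforms) == CLERROR.SUCCESS and num_platforms.value > 0:
#       platforms = (cl_platform_id * num_platforms.value)()
#       clGetPlatformIDs(num_platforms.value, platforms, None)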
+ +@dll_import('OpenCL') +def clReleaseKernel (program : cl_kernel) -> CLRESULT: ... + +@dll_import('OpenCL') +def clSetKernelArg (kernel : cl_kernel, arg_index : cl_uint, arg_size : c_size_t, arg_value : c_void_p) -> CLRESULT: ... + +@dll_import('OpenCL') +def clEnqueueNDRangeKernel (command_queue : cl_command_queue, kernel : cl_kernel, work_dim : cl_uint, global_work_offset : POINTER(c_size_t), global_work_size : POINTER(c_size_t), local_work_size : POINTER(c_size_t), num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ... diff --git a/xlib/avecl/_internal/backend/OpenCL/__init__.py b/xlib/avecl/_internal/backend/OpenCL/__init__.py new file mode 100644 index 0000000..a29b4e7 --- /dev/null +++ b/xlib/avecl/_internal/backend/OpenCL/__init__.py @@ -0,0 +1,37 @@ +""" +Minimal OpenCL 1.2 low level ctypes API. +""" +from .OpenCL import (CL_DEVICE_EXTENSIONS, CL_DEVICE_GLOBAL_MEM_SIZE, + CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + CL_DEVICE_MAX_WORK_GROUP_SIZE, CL_DEVICE_NAME, + CL_DEVICE_TYPE, CL_DEVICE_TYPE_ACCELERATOR, + CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU, + CL_DEVICE_TYPE_DEFAULT, CL_DEVICE_TYPE_GPU, + CL_DEVICE_VENDOR_ID, CL_DEVICE_VERSION, CL_DRIVER_VERSION, + CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR, + CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_SIZE, + CL_MEM_USE_HOST_PTR, CL_MEM_WRITE_ONLY, + CL_PLATFORM_EXTENSIONS, CL_PLATFORM_NAME, + CL_PLATFORM_PROFILE, CL_PLATFORM_VENDOR, + CL_PLATFORM_VERSION, CL_PROGRAM_BUILD_LOG, + CL_PROGRAM_BUILD_OPTIONS, CL_PROGRAM_BUILD_STATUS, + CLERROR, CLRESULT, byref, c_char, c_char_p, c_size_t, + c_void_p, cl_bitfield, cl_bool, cl_char, cl_command_queue, + cl_command_queue_properties, cl_context, + cl_context_properties, cl_device_id, cl_device_info, + cl_device_type, cl_double, cl_event, cl_float, cl_half, + cl_int, cl_kernel, cl_long, cl_mem, cl_mem_info, + cl_platform_id, cl_platform_info, cl_program, + cl_program_build_info, cl_short, cl_uchar, cl_uint, + cl_ulong, cl_ushort, clBuildProgram, clCreateBuffer, + clCreateCommandQueue, clCreateContext, + clCreateKernelsInProgram, clCreateProgramWithSource, + clEnqueueCopyBuffer, clEnqueueFillBuffer, + clEnqueueNDRangeKernel, clEnqueueReadBuffer, + clEnqueueWriteBuffer, clFinish, clGetDeviceIDs, + clGetDeviceInfo, clGetMemObjectInfo, clGetPlatformIDs, + clGetPlatformInfo, clGetProgramBuildInfo, + clReleaseCommandQueue, clReleaseContext, clReleaseEvent, + clReleaseKernel, clReleaseMemObject, clReleaseProgram, + clSetKernelArg, clWaitForEvents, create_string_buffer, + ctypes, sizeof) diff --git a/xlib/avecl/_internal/backend/__init__.py b/xlib/avecl/_internal/backend/__init__.py new file mode 100644 index 0000000..c7eb8a1 --- /dev/null +++ b/xlib/avecl/_internal/backend/__init__.py @@ -0,0 +1,6 @@ +from .Buffer import Buffer +from .Device import (Device, cleanup_devices, get_available_devices_info, + get_best_device, get_default_device, get_device, + set_default_device) +from .DeviceInfo import DeviceInfo +from .Kernel import Kernel diff --git a/xlib/avecl/_internal/info/BroadcastInfo.py b/xlib/avecl/_internal/info/BroadcastInfo.py new file mode 100644 index 0000000..d7e5487 --- /dev/null +++ b/xlib/avecl/_internal/info/BroadcastInfo.py @@ -0,0 +1,74 @@ +from typing import List + +import numpy as np + +from ..AShape import AShape +from ..AAxes import AAxes + +class BroadcastInfo: + """ + Broadcast info for multi shapes. 
+ + arguments + + shapes List[AShape] + + errors during the construction: + + ValueError + + Example: + shapes = ( 3,), + (3, 1,) + #Example | comment + + o_shape #(3, 3) broadcasted together o_shape + + br_shapes #(1, 3) shape[0] broadcasted to o_shape ndim + #(3, 1) shape[1] broadcasted to o_shape ndim + + shapes_tiles #(3, 1) tiles to make o_shape from br_shapes[0] + #(1, 3) tiles to make o_shape from br_shapes[1] + + shapes_reduction_axes #(0,) reduction_axes to make shapes[0] from o_shape + #(1,) reduction_axes to make shapes[1] from o_shape + """ + + __slots__ = ['o_shape', 'br_shapes', 'shapes_tiles', 'shapes_reduction_axes'] + + def __init__(self, shapes : List[AShape]): + + highest_rank = sorted([shape.ndim for shape in shapes])[-1] + + # Broadcast all shapes to the highest ndim by prepending 1s on the left side + br_shapes = [ (1,)*(highest_rank-shape.ndim) + shape for shape in shapes ] + + # Determine o_shape from all shapes + o_shape = AShape( np.max( [ [ br_shape.shape[axis] for br_shape in br_shapes] for axis in range(highest_rank) ], axis=1 ) ) + + shapes_tiles = [] + shapes_reduction_axes = [] + for br_shape in br_shapes: + shape_tile = [] + shape_reduction_axes = [] + for axis, (x,y) in enumerate(zip(br_shape.shape, o_shape.shape)): + if x != y: + if x == 1 and y != 1: + shape_tile.append(y) + shape_reduction_axes.append(axis) + elif x != 1 and y == 1: + shape_tile.append(1) + else: + raise ValueError(f'operands could not be broadcast together with shapes {br_shape} {o_shape}') + else: + shape_tile.append(1) + + shapes_tiles.append(shape_tile) + shapes_reduction_axes.append( AAxes(shape_reduction_axes) ) + + + self.o_shape : AShape = AShape(o_shape) + self.br_shapes : List[AShape] = br_shapes + self.shapes_tiles : List[List] = shapes_tiles + self.shapes_reduction_axes : List [AAxes] = shapes_reduction_axes + diff --git a/xlib/avecl/_internal/info/ConcatInfo.py b/xlib/avecl/_internal/info/ConcatInfo.py new file mode 100644 index 0000000..29eeee5 --- /dev/null +++ b/xlib/avecl/_internal/info/ConcatInfo.py @@ -0,0 +1,62 @@ +from ..AShape import AShape + +class ConcatInfo: + __slots__ = ['o_shape', 'axis', 'axis_sizes', 'axis_offsets'] + + def __init__(self, shapes, axis): + """ + Concat info + + arguments + + shapes Iterable of shapes + + errors during the construction: + + ValueError + + result + + .o_shape AShape + + .axis Int fixed axis argument + + .axis_sizes List[Int] axis sizes for every shape in shapes + + .axis_offsets List[Int] axis offset in o_shape + for every shape in shapes + """ + + shapes = tuple(shapes) + + if len(shapes) == 0: + raise ValueError('shapes is empty') + + + shape = shapes[0] + + if axis < 0: + axis = shape.ndim + axis + if axis < 0 or axis >= shape.ndim: + raise ValueError(f'Wrong axis {axis}') + + fixed_shapes = [ tuple(a for i,a in enumerate(shape) if i != axis) for shape in shapes ] + req_shape = fixed_shapes[0] + if not all(shape == req_shape for shape in fixed_shapes[1:]): + raise ValueError(f'All shapes must match shape {tuple(a if i != axis else "*" for i,a in enumerate(shape))}') + + axis_sizes = [ shape[axis] for shape in shapes ] + + axis_offset = 0 + axis_offsets = [] + for axis_size in axis_sizes: + axis_offsets.append(axis_offset) + axis_offset += axis_size + + self.o_shape = AShape( tuple(shape)[0:axis] + (sum(axis_sizes),) + tuple(shape)[axis+1:] ) + self.axis = axis + self.axis_sizes = axis_sizes + self.axis_offsets = tuple(axis_offsets) + + +
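A quick sketch of BroadcastInfo that mirrors the worked example in its docstring (the info subpackage is internal, so this import path is an assumption):

from xlib.avecl import AShape
from xlib.avecl._internal.info import BroadcastInfo

bi = BroadcastInfo([AShape((3,)), AShape((3, 1))])
print(bi.o_shape.shape)             # (3, 3)
print(bi.shapes_reduction_axes)     # [AAxes(0,), AAxes(1,)] -- axes reducing o_shape back to each input

diff --git a/xlib/avecl/_internal/info/Conv2DInfo.py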
b/xlib/avecl/_internal/info/Conv2DInfo.py new file mode 100644 index 0000000..4afb126 --- /dev/null +++ b/xlib/avecl/_internal/info/Conv2DInfo.py @@ -0,0 +1,78 @@ +from collections import Iterable +import math + +class Conv2DInfo: + """ + Conv2DInfo + + arguments + + H, W int axes sizes + KH, KW int kernel sizes + stride int + dilation int + + padding 'valid' no padding + 'same' output size will be the same + or divided by stride + int padding value for all sides + + Iterable of 4 ints + paddings for left,top,right,bottom sides + + errors during the construction: + + ValueError + + result: + + .PADL .PADR paddings for W axis + .PADT .PADB paddings for H axis + + .OH .OW result axes + + .OH_T .OW_T result transposed axes. + it is None if padding != 'valid','same' + """ + + __slots__ = ['PADL', 'PADR', 'PADT', 'PADB', 'OH', 'OW', 'OH_T', 'OW_T'] + + def __init__(self, H, W, KH, KW, stride, dilation, padding): + # Effective kernel sizes with dilation + EKH = (KH-1)*dilation + 1 + EKW = (KW-1)*dilation + 1 + + # Determine pad size of sides + OH_T = OW_T = None + if padding == 'valid': + PADL = PADT = PADR = PADB = 0 + OH_T = H * stride + max(EKH - stride, 0) + OW_T = W * stride + max(EKW - stride, 0) + elif padding == 'same': + PADL = int(math.floor((EKW - 1)/2)) + PADT = int(math.floor((EKH - 1)/2)) + PADR = int(math.ceil((EKW - 1)/2)) + PADB = int(math.ceil((EKH - 1)/2)) + OH_T = H * stride + OW_T = W * stride + elif isinstance(padding, int): + PADL = PADT = PADR = PADB = padding + elif isinstance(padding, Iterable): + padding = tuple(int(x) for x in padding) + if len(padding) != 4: + raise ValueError("Invalid paddings list length.") + PADL, PADT, PADR, PADB = padding + else: + raise ValueError("Invalid padding value.") + + self.PADL = PADL + self.PADT = PADT + self.PADR = PADR + self.PADB = PADB + + self.OH = max(1, int((H + PADT + PADB - EKH) / stride + 1) ) + self.OW = max(1, int((W + PADL + PADR - EKW) / stride + 1) ) + self.OH_T = OH_T + self.OW_T = OW_T + + diff --git a/xlib/avecl/_internal/info/PadInfo.py b/xlib/avecl/_internal/info/PadInfo.py new file mode 100644 index 0000000..45c3b09 --- /dev/null +++ b/xlib/avecl/_internal/info/PadInfo.py @@ -0,0 +1,56 @@ +from typing import List + +import numpy as np + +from ..AShape import AShape + + +class PadInfo: + """ + Pad info. + + arguments + + shape AShape + + axes_paddings list of (l_pad, r_pad) + + if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes + if [-1] == ... 
, the same for the right side + + errors during the construction: + + ValueError + + result: + + .o_shape AShape + + """ + + __slots__ = ['o_shape','axes_paddings'] + + def __init__(self, shape, axes_paddings : List): + + if Ellipsis in axes_paddings: + if sum(1 if x == Ellipsis else 0 for x in axes_paddings) > 1: + raise ValueError('only 1 ...(ellipsis) allowed in axes_paddings') + if axes_paddings[0] == Ellipsis: + axes_paddings = ((0,0),)*(shape.ndim-(len(axes_paddings)-1))+ axes_paddings[1:] + elif axes_paddings[-1] == Ellipsis: + axes_paddings = axes_paddings[:-1] + ((0,0),)*(shape.ndim-(len(axes_paddings)-1)) + else: + raise ValueError('...(ellipsis) must be at the beginning or the end of axes_paddings') + + if len(axes_paddings) != shape.ndim: + raise ValueError(f'len of axes_paddings must match shape.ndim {shape.ndim}') + + + self.axes_paddings = axes_paddings + o_shape = [] + + for axis, (axis_size, (l_pad, r_pad)) in enumerate(zip(shape, axes_paddings)): + new_axis_size = l_pad + axis_size + r_pad + o_shape.append(new_axis_size) + + self.o_shape = AShape(o_shape) diff --git a/xlib/avecl/_internal/info/ReductionInfo.py b/xlib/avecl/_internal/info/ReductionInfo.py new file mode 100644 index 0000000..8c28c9e --- /dev/null +++ b/xlib/avecl/_internal/info/ReductionInfo.py @@ -0,0 +1,50 @@ +from ..AShape import AShape + +class ReductionInfo: + """ + Reduction info + + arguments + + shape AShape + + axes AAxes + + keepdims bool + + can raise ValueError, TypeError during the construction + """ + + __slots__ = [ + 'reduction_axes', # sorted reduction AAxes + 'o_axes', # remain AAxes after reduction + 'o_shape', # result AShape of reduction + 'o_shape_kd', # result AShape of reduction with keepdims + ] + + def __init__(self, shape, axes, keepdims): + shape_axes = shape.axes_arange() + + if axes.is_none_axes(): + axes = shape_axes + + # Check correctness of axes + for axis in axes: + if axis not in shape_axes: + raise ValueError(f'Wrong axis {axis} not in {shape_axes}') + + self.reduction_axes = reduction_axes = axes.sorted() + + # Output axes. Remove axes from shape_axes + self.o_axes = o_axes = shape_axes - axes + + if o_axes.is_none_axes(): + o_shape = AShape( (1,) ) + else: + o_shape = shape[o_axes] + + self.o_shape = o_shape + self.o_shape_kd = AShape( 1 if axis in reduction_axes else shape[axis] for axis in range(shape.ndim)) + + if keepdims: + self.o_shape = self.o_shape_kd \ No newline at end of file
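PadInfo's Ellipsis handling in one sketch (internal import path; the shape and paddings are arbitrary):

from xlib.avecl import AShape
from xlib.avecl._internal.info import PadInfo

info = PadInfo(AShape((2, 8, 8)), (Ellipsis, (1, 1), (2, 2)))
print(info.o_shape.shape)   # (2, 10, 12) -- the leading axis got the implicit (0,0)

diff --git a/xlib/avecl/_internal/info/ReshapeInfo.py b/xlib/avecl/_internal/info/ReshapeInfo.py new file mode 100644 index 0000000..2bd1799 --- /dev/null +++ b/xlib/avecl/_internal/info/ReshapeInfo.py @@ -0,0 +1,44 @@ +from ..AShape import AShape + +class ReshapeInfo: + """ + Reshape info.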
+ can raise ValueError,TypeError during the construction + + arguments + + shape AShape + + target_shape Iterable of ints + can be any len and contains only one '-1' + Example + + shape (2, 512, 8, 8, 64) + target_shape (2, 512, -1) + o_shape (2, 512, 4096) + """ + + __slots__ = ['o_shape'] + + def __init__(self, shape, target_shape): + o_shape = [] + + remain_size = shape.size + + unk_axis = None + for t_size in target_shape: + t_size = int(t_size) + if t_size != -1: + mod = remain_size % t_size + if mod != 0: + raise ValueError(f'Cannot reshape {shape} to {target_shape}.') + remain_size /= t_size + else: + if unk_axis is not None: + raise ValueError('Can specify only one unknown dimension.') + unk_axis = len(o_shape) + o_shape.append( t_size ) + + if unk_axis is not None: + o_shape[unk_axis] = int(remain_size) + self.o_shape = AShape(o_shape) \ No newline at end of file diff --git a/xlib/avecl/_internal/info/SliceInfo.py b/xlib/avecl/_internal/info/SliceInfo.py new file mode 100644 index 0000000..f2918d8 --- /dev/null +++ b/xlib/avecl/_internal/info/SliceInfo.py @@ -0,0 +1,148 @@ +import math + +import numpy as np + +from ..AShape import AShape + + +class SliceInfo: + __slots__ = ['o_shape', 'o_shape_kd', 'just_reshaped','axes_bes','axes_abs_bes'] + + def __init__(self, shape : AShape, slices): + """ + Slice info. + can raise ValueError,TypeError during the construction + + arguments + + slices + + Example + + + o_shape result shape after slice + + axes_bes list of (begin,step,end) per axis + if s==0, then single axis element is fetched from position b + s can be negative. + """ + # Validate slices argument for given shape. + new_slices = [] + before_ellipsis = None + + for s in slices: + if s is Ellipsis: + before_ellipsis = new_slices + new_slices = [] + continue + elif s is not None and not isinstance(s, (int,tuple) ): + raise ValueError(f'unknown slice argument {s} of type {s.__class__}') + + new_slices.append(s) + + if before_ellipsis is not None: + # Process Ellipsis separator + new_slices_n_axes = sum([ 1 for x in new_slices if x != None]) + before_ellipsis_n_axes = sum([ 1 for x in before_ellipsis if x != None]) + + # Expand slices by filling intermediate (None,None,None) for each remaining axis + new_slices = before_ellipsis + \ + [(None,None,None)]*max(0, shape.ndim-before_ellipsis_n_axes-new_slices_n_axes) + \ + new_slices + + new_slices_n_axes = sum([ 1 for x in new_slices if x != None]) + if new_slices_n_axes > shape.ndim: + raise ValueError('slices arguments more than shape axes') + elif new_slices_n_axes < shape.ndim: + # Fill remaining axes + new_slices += [(None,None,None)]*( shape.ndim - new_slices_n_axes ) + + slices = tuple(new_slices) + + # Compute shapes + output_is_reshaped = True # Flag determines that output_tensor + # can be just reshaped without any computation + o_shape = [] # output tensor shape + o_shape_kd = [] # output shape used in kernel, must match input shape + axes_bes = [] + axes_abs_bes = [] + + i_axis = 0 + + # Process slices arguments + for v in slices: + if v is None: + # None is new axis + # We can add unlimited number of (1,) axes at any place of shape + o_shape.append(1) + continue + + i_axis_size = shape[i_axis] + i_axis += 1 + + if isinstance(v, int): + if v < 0: + v += i_axis_size + if v < 0 or v >= i_axis_size: + raise ValueError(f'index {v} is out of bounds for axis {i_axis} with size {i_axis_size}') + b,e,s = v,v,0 + else: + b,e,s = v + if s == 0: + raise ValueError(f'slice step cannot be zero') + + # Fix begin, end, step values + if s 
is None: + s = 1 + if b is None: + b = 0 if s >= 0 else i_axis_size-1 + if e is None: + e = i_axis_size if s >= 0 else -1 + elif e < 0: + e += i_axis_size + if b < 0: + b += i_axis_size + + if s >= 0: + b = np.clip(b, 0, i_axis_size) + e = np.clip(e, 0, i_axis_size) + if b > e: + raise ValueError('for positive step, begin cannot be > end.') + + abs_b, abs_e, abs_s = b,e,s + else: + b = np.clip(b, 0, i_axis_size-1) + e = np.clip(e, -1, i_axis_size) + + if b <= e: + raise ValueError('for negative step, begin cannot be <= end.') + + abs_s = -s + abs_e = b + 1 + abs_b = b - (math.ceil( (b-e) / abs_s ) -1) * abs_s + + # for every o_shape_kd axis + # we have exact begin,step values to fetch value from input + axes_bes.append( (b,e,s)) + axes_abs_bes.append( (abs_b, abs_e, abs_s)) + + if i_axis_size != 1 and not (b == 0 and e == i_axis_size and s == 1): + # Such slice params change the data layout, thus the output cannot be a simple reshape of the input + output_is_reshaped = False + + # Compute output_axis_size based on begin,end,step + o_axis_size = max(0, math.ceil ( (e-b) / (s if s != 0 else 1) ) ) + + if o_axis_size >= 1: + # >= 1 : select range of indexes, axis will remain + o_shape.append(o_axis_size) + # ^ otherwise the axis will be suppressed + + # o_shape with keepdims, must match ndim of input shape + o_shape_kd.append( max(1,o_axis_size) ) + + self.just_reshaped = output_is_reshaped + self.o_shape = AShape(o_shape) + self.o_shape_kd = AShape(o_shape_kd) + self.axes_bes = axes_bes + self.axes_abs_bes = axes_abs_bes \ No newline at end of file diff --git a/xlib/avecl/_internal/info/StackInfo.py b/xlib/avecl/_internal/info/StackInfo.py new file mode 100644 index 0000000..117e939 --- /dev/null +++ b/xlib/avecl/_internal/info/StackInfo.py @@ -0,0 +1,37 @@ +from ..AShape import AShape + +class StackInfo: + __slots__ = ['o_shape', 'axis'] + + def __init__(self, shape, axis, stack_count): + """ + Stack info + + arguments + + shape AShape + + axis Int + + stack_count Int + + errors during the construction: + + ValueError + + result: + + .o_shape AShape + + .axis Int positive axis argument + """ + if axis < 0: + axis = shape.ndim + 1 + axis + if axis < 0 or axis > shape.ndim: + raise ValueError(f'Wrong axis {axis}') + + if stack_count <= 0: + raise ValueError(f'Invalid stack_count {stack_count}') + + self.o_shape = AShape( tuple(shape)[0:axis] + (stack_count,) + tuple(shape)[axis:] ) + self.axis = axis \ No newline at end of file
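How SliceInfo resolves python-style slice arguments, as a small sketch (internal import path; the shape is arbitrary):

from xlib.avecl import AShape
from xlib.avecl._internal.info import SliceInfo

# the int index suppresses axis 0; (None, None, 2) strides axis 1
info = SliceInfo(AShape((4, 8)), (0, (None, None, 2)))
print(info.o_shape.shape)   # (4,)
print(info.axes_bes)        # [(0, 0, 0), (0, 8, 2)] -- (begin, end, step) per input axis

diff --git a/xlib/avecl/_internal/info/TileInfo.py b/xlib/avecl/_internal/info/TileInfo.py new file mode 100644 index 0000000..bc18e12 --- /dev/null +++ b/xlib/avecl/_internal/info/TileInfo.py @@ -0,0 +1,52 @@ +import numpy as np +from ..AShape import AShape + +class TileInfo: + """ + Tile info.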
+ + arguments + + shape AShape + + tiles Iterable of ints + + errors during the construction: + + ValueError + + result: + + .o_shape AShape + + .axes_slices list of slice() to fetch original shape + from o_shape for each tile + """ + + __slots__ = ['o_shape', 'axes_slices'] + + def __init__(self, shape, tiles): + if len(tiles) != shape.ndim: + raise ValueError(f'tiles should match shape.ndim {shape.ndim}') + + self.o_shape = AShape(dim*tiles[i] for i,dim in enumerate(shape)) + + c = [0]*shape.ndim + + axes_offsets = [] + for n in range(np.prod(tiles)): + axes_offsets.append( c.copy() ) + for i in range(shape.ndim-1,-1,-1): + c[i] += 1 + if c[i] < tiles[i]: + break + c[i] = 0 + + axes_slices = [] + for axes_offset in axes_offsets: + sl = [] + for axis,tile in enumerate(axes_offset): + axis_size = shape[axis] + sl.append( slice(axis_size*tile, axis_size*(tile+1)) ) + axes_slices.append(tuple(sl)) + self.axes_slices = tuple(axes_slices) \ No newline at end of file diff --git a/xlib/avecl/_internal/info/TransposeInfo.py b/xlib/avecl/_internal/info/TransposeInfo.py new file mode 100644 index 0000000..8f25d22 --- /dev/null +++ b/xlib/avecl/_internal/info/TransposeInfo.py @@ -0,0 +1,37 @@ +from ..AShape import AShape +from ..AAxes import AAxes + +class TransposeInfo: + """ + TransposeInfo + + arguments + + shape AShape + + axes_order AAxes + + errors during the construction: + + ValueError + + result + + .o_shape AShape + + .no_changes bool transpose changes nothing + + """ + + __slots__ = ['no_changes', 'o_shape'] + + def __init__(self, shape : AShape, axes_order : AAxes): + if shape.ndim != axes_order.ndim: + raise ValueError('axes must match the shape') + + # Axes order changes nothing? + self.o_shape = shape[axes_order] + self.no_changes = axes_order == shape.axes_arange() + + + diff --git a/xlib/avecl/_internal/info/__init__.py b/xlib/avecl/_internal/info/__init__.py new file mode 100644 index 0000000..2efc6d9 --- /dev/null +++ b/xlib/avecl/_internal/info/__init__.py @@ -0,0 +1,10 @@ +from .BroadcastInfo import BroadcastInfo +from .ConcatInfo import ConcatInfo +from .PadInfo import PadInfo +from .ReductionInfo import ReductionInfo +from .ReshapeInfo import ReshapeInfo +from .SliceInfo import SliceInfo +from .StackInfo import StackInfo +from .TileInfo import TileInfo +from .TransposeInfo import TransposeInfo +from .Conv2DInfo import Conv2DInfo \ No newline at end of file diff --git a/xlib/avecl/_internal/initializer/InitCoords2DArange.py b/xlib/avecl/_internal/initializer/InitCoords2DArange.py new file mode 100644 index 0000000..75ba981 --- /dev/null +++ b/xlib/avecl/_internal/initializer/InitCoords2DArange.py @@ -0,0 +1,89 @@ +import numpy as np + +from ..backend import Kernel +from ..HKernel import HKernel +from ..SCacheton import SCacheton +from ..Tensor import Tensor +from .Initializer import Initializer + +class InitCoords2DArange(Initializer): + + def __init__(self, h_start, h_stop, w_start, w_stop): + """ + Initialize (...,H,W,D) tensor with coords arange + D == 2(x,y) or 3 (x,y,1) + + arguments + + h_start float height start value (inclusive) + + h_stop float height stop value (inclusive) + + w_start float width start value (inclusive) + + w_stop float width stop value (inclusive) + + """ + super().__init__() + self._h_start = h_start + self._h_stop = h_stop + self._w_start = w_start + self._w_stop = w_stop + + def initialize_tensor(self, tensor : Tensor): + shape = tensor.shape + dtype = tensor.dtype + + if shape.ndim < 3: + raise ValueError(f'tensor.shape.ndim must be >= 3 
(...,H,W,D)') + + OH,OW,OD = shape[-3:] + if OD not in [2,3]: + raise ValueError(f'last dim D {OD} must == 2 or 3') + + if OH > 1: + h_step = (self._h_stop-self._h_start) / (OH-1) + else: + h_step = 0 + + if OW > 1: + w_step = (self._w_stop-self._w_start) / (OW-1) + else: + w_step = 0 + + key = (InitCoords2DArange, dtype) + kernel = SCacheton.get_var(key) + if kernel is None: + kernel = Kernel(kernel_text=f""" + +{HKernel.define_tensor_type('O', tensor.dtype)} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME , float h_start, float h_step + , float w_start, float w_step, + uint O0, uint O1, uint O2) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'O', 3)} + + O_TYPE v; + if (o2 == 0) + v = w_start+o1*w_step; + else + if (o2 == 1) + v = h_start+o0*h_step; + else + v = 1; + + O_GLOBAL_STORE(gid, v); +}} +""") + SCacheton.set_var(key, kernel) + + tensor.get_device().run_kernel( kernel, tensor.get_buffer(), + np.float32(self._h_start), np.float32(h_step), + np.float32(self._w_start), np.float32(w_step), + np.uint32(OH), np.uint32(OW), np.uint32(OD), + global_shape=(shape.size,) ) + + def __str__(self): return f'CoordsArange' diff --git a/xlib/avecl/_internal/initializer/InitRandomUniform.py b/xlib/avecl/_internal/initializer/InitRandomUniform.py new file mode 100644 index 0000000..a657de0 --- /dev/null +++ b/xlib/avecl/_internal/initializer/InitRandomUniform.py @@ -0,0 +1,62 @@ +import numpy as np + +from ..backend import Kernel +from ..HKernel import HKernel +from ..SCacheton import SCacheton +from ..Tensor import Tensor +from .Initializer import Initializer + + +class InitRandomUniform(Initializer): + + def __init__(self, low=0, high=1): + """ + arguments + + low(0) low value + + high(1) high value (exclusive) + """ + super().__init__() + self._low = low + self._high = high + + def initialize_tensor(self, tensor : Tensor): + + key = (InitRandomUniform, self._low, self._high, tensor.dtype) + kernel = SCacheton.get_var(key) + if kernel is None: + + hl = self._high-self._low + l = self._low + + if tensor.dtype in [np.bool_,np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32]: + gen_expression = f'hash_uint_from_uint(gid+seed32) % {int(hl)} + {int(l)}' + elif tensor.dtype in [np.int64]: + gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}' + elif tensor.dtype in [np.uint64]: + gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}' + elif tensor.dtype in [np.float16, np.float32]: + gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}' + elif tensor.dtype in [np.float64]: + gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}' + + kernel = Kernel(kernel_text=f""" +{HKernel.include_hash()} +{HKernel.define_tensor('O', (tensor.shape.size,), tensor.dtype )} +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, uint seed32, ulong seed64) +{{ + size_t gid = get_global_id(0); + O_GLOBAL_STORE(gid, {gen_expression} ); +}} +""") + SCacheton.set_var(key, kernel) + + + tensor.get_device().run_kernel( kernel, tensor.get_buffer(), + np.uint32(np.random.randint(np.iinfo(np.uint32).max, dtype=np.uint32)), + np.uint64(np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64)), + global_shape=(tensor.shape.size,) ) + + def __str__(self): return f'InitRandomUniform low={self._low}, high={self._high}' + diff --git a/xlib/avecl/_internal/initializer/Initializer.py b/xlib/avecl/_internal/initializer/Initializer.py new file mode 100644 index 0000000..e1e5273 --- /dev/null +++ 
b/xlib/avecl/_internal/initializer/Initializer.py
@@ -0,0 +1,17 @@
+from ..Tensor import Tensor
+
+class Initializer:
+    """
+    Base class for tensor initializers
+    """
+    def initialize_tensor(self, tensor : Tensor):
+        """
+        Implement initialization of tensor
+
+        You can compute the data using python and then call buffer.set(numpy_value),
+        or run a kernel via tensor.get_device().run_kernel(kernel, tensor.get_buffer(), ...) to initialize the data using OpenCL
+        """
+        raise NotImplementedError()
+
+    def __str__(self):  return 'Initializer'
+    def __repr__(self): return self.__str__()
diff --git a/xlib/avecl/_internal/initializer/__init__.py b/xlib/avecl/_internal/initializer/__init__.py
new file mode 100644
index 0000000..932ea21
--- /dev/null
+++ b/xlib/avecl/_internal/initializer/__init__.py
@@ -0,0 +1,3 @@
+from .InitCoords2DArange import InitCoords2DArange
+from .Initializer import Initializer
+from .InitRandomUniform import InitRandomUniform
diff --git a/xlib/avecl/_internal/op/__init__.py b/xlib/avecl/_internal/op/__init__.py
new file mode 100644
index 0000000..4a1a4b6
--- /dev/null
+++ b/xlib/avecl/_internal/op/__init__.py
@@ -0,0 +1,21 @@
+from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
+from .binary_dilate_circle import binary_dilate_circle
+from .binary_erode_circle import binary_erode_circle
+from .binary_morph import binary_morph
+from .cast import cast
+from .concat import concat
+from .depthwise_conv2D import depthwise_conv2D
+from .gaussian_blur import gaussian_blur
+from .matmul import matmul, matmulc
+from .pad import pad
+from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
+                     reduce_sum, reduce_variance)
+from .remap import remap
+from .remap_np_affine import remap_np_affine
+from .reshape import reshape
+from .slice_ import slice_
+from .slice_set import slice_set
+from .stack import stack
+from .tile import tile
+from .transpose import transpose
+from .warp_affine import warp_affine
diff --git a/xlib/avecl/_internal/op/any_wise.py b/xlib/avecl/_internal/op/any_wise.py
new file mode 100644
index 0000000..71f5fe1
--- /dev/null
+++ b/xlib/avecl/_internal/op/any_wise.py
@@ -0,0 +1,111 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HArgs import HArgs
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import BroadcastInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def any_wise(op_text : str,
+             *args,
+             dtype : np.dtype = None,
+             output_t:Tensor=None) -> Tensor:
+    """
+    operator for N-wise ops with N inputs
+
+    arguments
+        op_text     example: O=(2*I0*I1)+I2
+
+        *args       List[ Tensor | number ]
+
+        dtype
+
+        output_t    compute result to this Tensor.
+                    Tensor may be with different shape, but should match total size.
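For intuition about op_text: inputs named I0..In are broadcast to a common shape and the expression is evaluated per element into O. A rough numpy analogue of any_wise('O=(2*I0*I1)+I2', a, b, c) (illustrative only; the real op compiles an OpenCL kernel below):

    import numpy as np

    I0 = np.random.rand(2, 1, 4).astype(np.float32)
    I1 = np.random.rand(1, 3, 4).astype(np.float32)
    I2 = np.float32(0.5)            # scalar args are passed as kernel scalars
    O = (2 * I0 * I1) + I2          # inputs broadcast to (2, 3, 4)
    print(O.shape)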
+ """ + HArgs.check_zero_get_length(args) + tensor_args = HArgs.filter_tensor(args, raise_on_empty=True) + device = HArgs.check_get_same_device(tensor_args) + + shape_list, dtype_list, krn_args = HArgs.decompose(args) + + op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text) + + if output_t is None: + output_t = Tensor ( op.o_shape, op.o_dtype, device=device ) + elif output_t.shape.size != op.o_shape.size: + raise ValueError(f'output_t must have size {op.o_shape.size}') + + device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args) + + return output_t + +class _AnyWiseOp: + def __init__(self, shape_list, dtype_list, o_dtype, op_text : str): + if len(shape_list) != len(dtype_list): + raise ValueError('len(shape_list) != len(dtype_list)') + + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list) + + if len(shape_list) == 1: + # element-wise. + i_shape, i_dtype = shape_list[0], dtype_list[0] + self.o_shape = o_shape = i_shape + + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('IN', i_shape, i_dtype)} +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME) +{{ +size_t gid = get_global_id(0); + +O_TYPE O = O_GLOBAL_LOAD(gid); +IN_TYPE I0 = IN_GLOBAL_LOAD(gid); +{op_text}; +O_GLOBAL_STORE(gid, O); +}} +""") + else: + # Multi arg. + self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ]) + + self.o_shape = o_shape = info.o_shape + + defs, arg_defs, impls = [], [], [] + for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)): + t_name = f'I{i}' + if t_shape is not None: + defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) ) + arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" ) + impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));") + else: + arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" ) + + defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls) + + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" +{defs} +{HKernel.define_tensor('O', o_shape, o_dtype)} +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs}) +{{ +size_t gid = get_global_id(0); +{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)} +{impls} +O_TYPE O; +{op_text}; +O_GLOBAL_STORE(gid, O); +}} +""") + +def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t) +def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t) +def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t) +def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t) +def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t) +def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t) +def square(a_t : Tensor) -> Tensor: return any_wise('O=I0*I0', a_t) +def sqrt(a_t : Tensor) -> Tensor: return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t) diff --git a/xlib/avecl/_internal/op/binary_dilate_circle.py b/xlib/avecl/_internal/op/binary_dilate_circle.py new file mode 100644 index 0000000..e2229fb --- /dev/null +++ b/xlib/avecl/_internal/op/binary_dilate_circle.py @@ -0,0 
+1,91 @@
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import Conv2DInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def binary_dilate_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
+    """
+    Binary dilate operator using circle kernel with radius.
+
+        input_t     Tensor (...,H,W)
+
+    per element of H,W: set 1 if any neighbor element inside the circle of the given radius is != 0,
+    otherwise set 0.
+    """
+    op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)
+
+    device = input_t.get_device()
+
+    if radius <= 0 or iterations <= 0:
+        return input_t.copy()
+    else:
+        for i in range(iterations):
+            if i == 0:
+                buf_in = input_t
+            else:
+                buf_in, buf_out = buf_out, buf_in
+            if i <= 1:
+                buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
+            device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
+
+        return buf_out
+
+class _BinaryDilateOp():
+    def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if i_shape.ndim < 2:
+            raise ValueError(f'i_shape.ndim must be >= 2')
+
+        KS = radius*2+1
+        IH,IW = i_shape[-2:]
+
+        ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
+
+        self.o_shape = o_shape = i_shape
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+
+#define PADL {ci.PADL}
+#define PADT {ci.PADT}
+
+#define RADIUS {radius}
+#define KS {KS}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    {'#pragma unroll' if KS <= 16 else ''}
+    for (int kh=0; kh<KS; ++kh)
+    for (int kw=0; kw<KS; ++kw)
+    {{
+        // consider only neighbors inside the circle of RADIUS around the window center
+        if ( (kh-RADIUS)*(kh-RADIUS) + (kw-RADIUS)*(kw-RADIUS) <= RADIUS*RADIUS )
+        {{
+            int im2 = -PADT + om2 + kh;
+            int im1 = -PADL + om1 + kw;
+
+            I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
+                           I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
+                           : 0;
+
+            if (i_val != (I_TYPE)0)
+            {{
+                O_GLOBAL_STORE(gid, (O_TYPE) 1);
+                return;
+            }}
+        }}
+    }}
+
+    O_GLOBAL_STORE(gid, (O_TYPE) 0 );
+}}
+""")
+
+
diff --git a/xlib/avecl/_internal/op/binary_erode_circle.py b/xlib/avecl/_internal/op/binary_erode_circle.py
new file mode 100644
index 0000000..3d5c4c3
--- /dev/null
+++ b/xlib/avecl/_internal/op/binary_erode_circle.py
@@ -0,0 +1,91 @@
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import Conv2DInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def binary_erode_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
+    """
+    Binary erode operator using circle kernel with radius.
+
+        input_t     Tensor (...,H,W)
+
+    per element of H,W: set 1 if all neighbor elements inside the circle of the given radius are != 0,
+    otherwise set 0.
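A plain-numpy reference of the erode semantics just described (a sketch for a single (H,W) plane; out-of-bounds neighbors count as 0, matching the kernel's border handling):

    import numpy as np

    def erode_circle_ref(img, radius):
        H, W = img.shape
        d = np.arange(-radius, radius + 1)
        dy, dx = np.meshgrid(d, d, indexing='ij')
        m = dy**2 + dx**2 <= radius**2           # circular structuring element
        dy, dx = dy[m], dx[m]
        out = np.zeros_like(img, np.uint8)
        for y in range(H):
            for x in range(W):
                yy, xx = y + dy, x + dx
                ok = (yy >= 0) & (yy < H) & (xx >= 0) & (xx < W)
                vals = np.where(ok, img[yy % H, xx % W], 0)   # OOB -> 0
                out[y, x] = np.all(vals != 0)
        return out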
+ """ + op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype) + + device = input_t.get_device() + + if radius <= 0 or iterations <= 0: + return input_t.copy() + else: + for i in range(iterations): + if i == 0: + buf_in = input_t + else: + buf_in, buf_out = buf_out, buf_in + if i <= 1: + buf_out = Tensor( op.o_shape, op.o_dtype, device=device ) + device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() ) + + return buf_out + +class _BinaryErodeOp(): + def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype): + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype + + if i_shape.ndim < 2: + raise ValueError(f'i_shape.ndim must be >= 2') + + KS = radius*2+1 + IH,IW = i_shape[-2:] + + ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same') + + self.o_shape = o_shape = i_shape + + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('I', i_shape, i_dtype)} + +#define PADL {ci.PADL} +#define PADT {ci.PADT} + +#define RADIUS {radius} +#define KS {KS} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME) +{{ + size_t gid = get_global_id(0); + {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)} + + {'#pragma unroll' if KS <= 16 else ''} + for (int kh=0; kh= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ? + I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )})) + : 0; + + if (i_val == (I_TYPE)0) + {{ + O_GLOBAL_STORE(gid, (O_TYPE) 0 ); + return; + }} + }} + }} + + O_GLOBAL_STORE(gid, (O_TYPE) 1 ); +}} +""") + + diff --git a/xlib/avecl/_internal/op/binary_morph.py b/xlib/avecl/_internal/op/binary_morph.py new file mode 100644 index 0000000..f0dc5b2 --- /dev/null +++ b/xlib/avecl/_internal/op/binary_morph.py @@ -0,0 +1,43 @@ +from ..Tensor import Tensor +from .binary_dilate_circle import binary_dilate_circle +from .binary_erode_circle import binary_erode_circle +from .gaussian_blur import gaussian_blur +from .pad import pad + + +def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor: + """ + Apply optional binary erode/dilate and optional blur. + + input_t (...,H,W) tensor. Non zero values will be treated as 1. 
+
+        erode_dilate    int     amount of pixels: > 0 erodes, < 0 dilates
+
+        blur            float >= 0  amount of pixels to blur
+
+        fade_to_border(False)   clip the image in order
+                                to fade smoothly to the border with specified blur amount
+    """
+    x = input_t
+
+    H,W = input_t.shape[-2:]
+
+    x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)
+
+    if erode_dilate > 0:
+        x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
+    elif erode_dilate < 0:
+        x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )
+
+    if fade_to_border:
+        h_clip_size = int(H + blur // 2)   # blur is float, slice bounds must be int
+        w_clip_size = int(W + blur // 2)
+        x[...,:h_clip_size,:] = 0
+        x[...,-h_clip_size:,:] = 0
+        x[...,:,:w_clip_size] = 0
+        x[...,:,-w_clip_size:] = 0
+
+    if blur > 0:
+        x = gaussian_blur(x, blur * 0.250, dtype=dtype)
+
+    return x[...,H:-H,W:-W]
diff --git a/xlib/avecl/_internal/op/cast.py b/xlib/avecl/_internal/op/cast.py
new file mode 100644
index 0000000..c0e6af7
--- /dev/null
+++ b/xlib/avecl/_internal/op/cast.py
@@ -0,0 +1,17 @@
+from ..Tensor import Tensor
+
+from .any_wise import any_wise
+
+def cast(input_t : Tensor, dtype, output_t:Tensor=None) -> Tensor:
+    """
+    cast operator
+
+    arguments
+        input_t
+
+        dtype
+
+        output_t    compute result to this Tensor.
+                    Tensor may be with different shape, but should match total size.
+    """
+    return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)
diff --git a/xlib/avecl/_internal/op/concat.py b/xlib/avecl/_internal/op/concat.py
new file mode 100644
index 0000000..d9cfd50
--- /dev/null
+++ b/xlib/avecl/_internal/op/concat.py
@@ -0,0 +1,70 @@
+from ..backend import Kernel
+from ..HArgs import HArgs
+from ..HType import HType
+from ..HKernel import HKernel
+from ..info import ConcatInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    arguments
+
+        tensor_list     Iterable
+
+        axis            Int
+
+        dtype           np.dtype
+
+        output_t        compute result to this Tensor.
+                        Tensor may be with different shape,
+                        but should match total size.
+                        gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
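Each per-input kernel in concat (below) copies one input into the output at a fixed offset along the concat axis; in numpy terms (illustrative only):

    import numpy as np

    a = np.zeros((2, 3), np.float32)
    b = np.ones((2, 5), np.float32)
    o = np.empty((2, 8), np.float32)
    o[:, 0:3] = a      # axis_offset 0 for input 0
    o[:, 3:8] = b      # axis_offset 3 for input 1
    assert (o == np.concatenate([a, b], axis=1)).all()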
+ """ + tensor_list = tuple(tensor_list) + HArgs.check_zero_get_length(tensor_list) + HArgs.check_all_tensors(tensor_list) + + device = HArgs.check_get_same_device(tensor_list) + shape_list, dtype_list, _ = HArgs.decompose(tensor_list) + + op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output) + + if output_t is None: + output_t = Tensor (op.info.o_shape, op.o_dtype, device=device) + elif output_t.shape.size != op.info.o_shape.size: + raise ValueError(f'output_t must have size {op.info.o_shape.size}') + + for forward_krn,t in zip(op.forward_krns,tensor_list): + device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) ) + + return output_t + +class _ConcatOp: + def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output): + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list) + + self.info = info = ConcatInfo(shape_list, axis) + + self.forward_krns = forward_krns = [] + + for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)): + forward_krn = Kernel(f""" + +{HKernel.define_tensor('O', info.o_shape, o_dtype )} +{HKernel.define_tensor('I', shape, dtype)} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)} + + i{info.axis} += {info.axis_offsets[i]}; + + {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) ); +}} +""") + forward_krns.append(forward_krn) \ No newline at end of file diff --git a/xlib/avecl/_internal/op/depthwise_conv2D.py b/xlib/avecl/_internal/op/depthwise_conv2D.py new file mode 100644 index 0000000..b4c08e1 --- /dev/null +++ b/xlib/avecl/_internal/op/depthwise_conv2D.py @@ -0,0 +1,107 @@ +import numpy as np + +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..info import BroadcastInfo, Conv2DInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor + + +def depthwise_conv2D (input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None): + """ + Depthwise Conv2D operator. 
+
+        input_t     Tensor (...,H,W)
+
+        kernel_t    Tensor (...,H,W)
+
+        stride(1)       int
+
+        dilation(1)     int
+
+        padding(same)   'valid'     no padding
+                        'same'      output size will be the same
+                                    or divided by stride
+                        int         padding value for all sides
+                        Iterable of 4 ints
+                                    paddings for left,top,right,bottom sides
+
+    ...-head part of shapes will be broadcasted to each other
+    """
+
+    op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)
+
+    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
+    output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())
+
+    return output_t
+
+class _DepthwiseConv2DOp():
+    def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if i_shape.ndim < 2:
+            raise ValueError(f'i_shape.ndim must be >= 2')
+
+        if k_shape.ndim < 2:
+            raise ValueError(f'k_shape.ndim must be >= 2')
+
+        IH,IW = i_shape[-2:]
+        KH,KW = k_shape[-2:]
+
+        ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)
+
+        if i_shape.ndim == 2 and k_shape.ndim == 2:
+            # nothing to broadcast
+            i_br_shape = i_shape
+            k_br_shape = k_shape
+
+            o_shape = AShape([ci.OH, ci.OW])
+        else:
+            op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])
+
+            i_br_shape = op.br_shapes[0] + i_shape[-2:]
+            k_br_shape = op.br_shapes[1] + k_shape[-2:]
+
+            o_shape = op.o_shape + [ci.OH, ci.OW]
+
+        self.o_shape = o_shape
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_br_shape, i_dtype)}
+{HKernel.define_tensor('K', k_br_shape, k_dtype)}
+
+#define PADL {ci.PADL}
+#define PADT {ci.PADT}
+
+#define STRIDE {stride}
+#define DILATION {dilation}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    float v = 0.0;
+    {'#pragma unroll' if KH <= 9 else ''}
+    for (int km2=0; km2<KH; ++km2)
+    {{
+        int im2 = -PADT + km2*DILATION + om2*STRIDE;
+        if (im2 >= 0 & im2 < Im2)
+        {'#pragma unroll' if KW <= 9 else ''}
+        for (int km1=0; km1<KW; ++km1)
+        {{
+            int im1 = -PADL + km1*DILATION + om1*STRIDE;
+            if (im1 >= 0 & im1 < Im1)
+                v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
+                     *K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
+        }}
+    }}
+
+    O_GLOBAL_STORE(gid, (O_TYPE) v);
+}}
+""")
+
+
diff --git a/xlib/avecl/_internal/op/gaussian_blur.py b/xlib/avecl/_internal/op/gaussian_blur.py
new file mode 100644
index 0000000..69c9fd0
--- /dev/null
+++ b/xlib/avecl/_internal/op/gaussian_blur.py
@@ -0,0 +1,39 @@
+import numpy as np
+
+from ..Tensor import Tensor
+from .depthwise_conv2D import depthwise_conv2D
+
+
+def gaussian_blur (input_t : Tensor, sigma, dtype=None) -> Tensor:
+    """
+    arguments
+
+        input_t     Tensor(...,H,W)
+
+        sigma       float
+    """
+    if sigma <= 0.0:
+        return input_t.copy() #TODO
+
+    device = input_t.get_device()
+
+    key = (gaussian_blur, sigma)
+    kernel_t = device.get_cached_data(key)
+    if kernel_t is None:
+        kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
+        device.set_cached_data(key, kernel_t)
+
+    output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
+    return output_t
+
+def _make_gaussian_kernel(sigma : float, dtype):
+    kernel_size = max(3, int(2 * 2 * sigma))
+    if kernel_size % 2
== 0: + kernel_size += 1 + mean = np.floor(0.5 * kernel_size) + kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)]) + np_kernel = np.outer(kernel_1d, kernel_1d) + kernel = np_kernel / np.sum(np_kernel) + return kernel.astype(dtype) diff --git a/xlib/avecl/_internal/op/matmul.py b/xlib/avecl/_internal/op/matmul.py new file mode 100644 index 0000000..b1ee13e --- /dev/null +++ b/xlib/avecl/_internal/op/matmul.py @@ -0,0 +1,158 @@ +import numpy as np + +from ..AShape import AShape +from ..backend import Kernel +from ..HArgs import HArgs +from ..SCacheton import SCacheton +from ..Tensor import Tensor + + +def matmul(a_t : Tensor, b_t : Tensor, output_t: Tensor=None, is_add_to_output=False) -> Tensor: + """ + matmul operator in row-major format + + A(...,M,K) x + B(...,K,N) = + (...,M,N) + + arguments + output_t compute result to this Tensor. + Tensor may be with different shape, + but should match total size. + gradfn will not be set. + + is_add_to_output add result to output_t if output_t is set. + """ + + return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output) + + + +def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor: + """ + matmul operator in col-major format + + A(...,K,M) x + B(...,N,K) = + (...,N,M) + + arguments + + output_t compute result to this Tensor. + Tensor may be with different shape, + but should match total size. + gradfn will not be set. + + is_add_to_output add result to output_t if output_t is set. + """ + device = HArgs.check_get_same_device([a_t, b_t]) + + op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output) + + if output_t is None: + output_t = Tensor (op.o_shape, op.o_dtype, device=device ) + elif output_t.shape.size != op.o_shape.size: + raise ValueError(f'output_t must have size {op.o_shape.size}') + + device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer(), ) + + return output_t + + +class _MatmulOp: + def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output): + a_dtype = np.dtype(a_dtype).type + b_dtype = np.dtype(b_dtype).type + + if a_dtype != np.float32 or b_dtype != np.float32: + raise ValueError('matmul works only with float32 tensors.') + + if a_shape.ndim != b_shape.ndim: + raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}') + + ndim = a_shape.ndim + if ndim < 2: + raise ValueError('Tensors ndim must be at least 2.') + + K, M = a_shape[-2], a_shape[-1] + N, B_COLS = b_shape[-2], b_shape[-1] + + if K != B_COLS: + raise ValueError('A_ROWS != B_COLS') + + BATCH = a_shape[0:-2].size + B_BATCH = b_shape[0:-2].size + + if BATCH != B_BATCH: + raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}') + + if ndim == 2: + self.o_shape = AShape( (N, M) ) + else: + self.o_shape = AShape( a_shape[:-2]+(N, M) ) + self.o_dtype = np.float32 + + self.M = M + self.N = N + self.K = K + + # Determining optimal tile widths + for MW in [8,4,2,1]: + if M % MW == 0: + break + for KW in [8,4,2,1]: + if N % KW == 0 and K % KW == 0: + break + NW = KW + + self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f""" +#define K {K} +#define N {N} +#define MW {MW} // M tile Width +#define NW {NW} // N tile Width -- NW & KW should be the same ! 
+#define KW {KW}    // K tile Width
+#define MT {M//MW} // MT is max for 'mt' (M tile count)
+#define KT {K//KW} // KT is max for 'kt' (K tile count)
+
+#define floatMW { f'float{MW}' if MW != 1 else 'float'}
+#define floatKW { f'float{KW}' if KW != 1 else 'float'}
+
+__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
+{{
+    size_t mt = get_global_id(0);    //global M-tile id
+    size_t nc = get_global_id(1);    //global N-tile id
+    size_t batch = get_global_id(2);
+
+    float AT[KW][MW]; // sub tiles
+    float BT[NW][KW];
+    float CT[NW][MW];
+
+    #pragma unroll
+    for (uint i=0; i<NW*MW; ++i)   // zero the CT accumulator tile
+        ((float*) CT)[i] = 0.0;
+
+    for (uint kt=0; kt<KT; ++kt)   // iterate over K-dim tiles
+    {{
+        #pragma unroll
+        for (uint k=0; k<KW; ++k)  // load M-Width floats of A for every k inside the K-tile
+            *( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt];
+
+        #pragma unroll
+        for (uint n=0; n<NW; ++n)  // load K-Width floats of B for every n inside the N-tile
+            *( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt];
+
+        #pragma unroll
+        for (uint k=0; k<KW; ++k)  // sub tiles multiplication
+        #pragma unroll
+        for (uint n=0; n<NW; ++n)
+        #pragma unroll
+        for (uint m=0; m<MW; ++m)
+            CT[n][m] += AT[k][m] * BT[n][k];
+    }}
+
+    #pragma unroll
+    for (uint n=0; n<NW; ++n)      // store the accumulated CT tile
+        O[batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='} *( (floatMW*) CT[n] );
+}}
+""")
diff --git a/xlib/avecl/_internal/op/pad.py b/xlib/avecl/_internal/op/pad.py
new file mode 100644
--- /dev/null
+++ b/xlib/avecl/_internal/op/pad.py
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import PadInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def pad(input_t : Tensor, axes_paddings, mode : str = 'constant', constant_value = 0, dtype : np.dtype = None, output_t : Tensor = None) -> Tensor:
+    """
+    arguments:
+
+        axes_paddings   list of (l_pad, r_pad)
+
+                        if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for the remaining axes
+                        if [-1] == ..., same for the right side
+
+        dtype           cast to dtype
+
+        output_t        compute result to this Tensor.
+                        Tensor may be with different shape, but should match total size
+    """
+    op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )
+
+    if output_t is None:
+        output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
+    elif output_t.shape.size != op.o_shape.size:
+        raise ValueError(f'output_t must have size {op.o_shape.size}')
+
+    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
+
+    return output_t
+
+
+class _PadOp:
+    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
+        _allow_modes = ['constant']
+        if mode not in _allow_modes:
+            raise ValueError(f'Allowed pads modes: {_allow_modes}')
+
+        if mode == 'constant':
+            if not HType.is_scalar_type(constant_value):
+                raise ValueError('constant_value must be scalar')
+
+        info = PadInfo(i_shape, axes_paddings)
+
+        self.o_shape = o_shape = info.o_shape
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+size_t gid = get_global_id(0);
+
+{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})' for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
+    O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
+else
+    O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
+    //O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX_MOD({ ','.join(f' I{i} + ( (o{i}-{l_pad})*( ((o{i}-{l_pad})/I{i}) % 2 == 0 ?
1: -1) ) % I{i} ' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) ); +}}""") + #print(self.forward_krn) diff --git a/xlib/avecl/_internal/op/reduce.py b/xlib/avecl/_internal/op/reduce.py new file mode 100644 index 0000000..6356eee --- /dev/null +++ b/xlib/avecl/_internal/op/reduce.py @@ -0,0 +1,214 @@ +import math + +import numpy as np + +from ..AAxes import AAxes +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..info import ReductionInfo, TransposeInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor +from .slice_ import slice_ +from .transpose import transpose +from .any_wise import square, sqrt + + +def reduce_mean (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor: + """ + Reduce mean operator. + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + return reduce_op ('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output) + +def reduce_std(input_t, axes=None, keepdims=False): + """ + Reduce std operator. + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + return sqrt(reduce_variance(input_t, axes, keepdims)) + + +def reduce_variance(input_t, axes=None, keepdims=False): + """ + Reduce variance operator. + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + mean = reduce_mean(input_t, axes, keepdims=True) + return reduce_mean(square(input_t - mean), axes, keepdims) + +def moments(input_t, axes=None, keepdims=False): + """ + Returns (mean, variance) of input_t + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + mean = reduce_mean(input_t, axes, keepdims) + mean_shape_keepdims = mean._op.info.o_shape_kd + var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims) + return mean, var + +def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor: + """ + Reduce min operator. + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + return reduce_op ('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output) + +def reduce_max (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor: + """ + Reduce max operator. + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + return reduce_op ('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output) + +def reduce_sum (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor: + """ + Reduce sum operator. + + input_t Tensor + + axes(None) int + Iterable of ints. + None - all axes + + keepdims(False) keep reduced axes + """ + return reduce_op ('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output) + +def reduce_op (op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False): + """ + arguments + + op_type 'sum' 'mean' 'min' 'max' + + output_t compute result to this Tensor. + Tensor may be with different shape, + but should match total size. + gradfn will not be set. 
+ + is_add_to_output add result to output_t if output_t is set. + """ + + op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims) + + if output_t is None: + output_t = Tensor ( op.info.o_shape, input_t.dtype, device=input_t.get_device() ) + elif output_t.shape.size != op.info.o_shape.size: + raise ValueError(f'output_t must have size {op.info.o_shape.size}') + + # Make an intermediate tensor + input_t_inter = transpose(input_t, op.intermediate_transpose_axes) + + # Perform multistage inplace operation in intermediate tensor + for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)): + input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS), + global_shape=shape) + + if op_type == 'mean': + # divide values in ROWS by number of COLS + input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) ) + + # Fetch final tensor from zero indexes using slices argument + slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output) + + return output_t + + +class _ReduceOp: + def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False): + self.op_type = op_type + self.info = info = ReductionInfo(i_shape, axes, keepdims) + + # Determine transpose order for intermediate tensor, where reduction axes will be at the end + self.intermediate_transpose_axes = info.o_axes + info.reduction_axes + self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape + + # slices argument to fetch processed tensor from zero indexes + self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim + + # COLS are reduction axes, ROWS are remaining axes + rows_ndim = info.o_axes.ndim + self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size + self.COLS = COLS = self.intermediate_shape[rows_ndim:].size + + # Number of stages to operate COLS + n_stages = (COLS-1).bit_length() + self.forward_krn_shapes = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ] + self.forward_krn_stage_cols = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ] + self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage ) ) for stage in range(n_stages) ] + + self.forward_krn = Kernel(f""" +{HKernel.define_tensor('I', (1,), i_dtype)} + +__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS) +{{ + size_t gid = get_global_id(0); + + size_t col = gid % STAGE_COLS; + size_t row = gid / STAGE_COLS; + size_t i_idx = row*COLS + col; + + size_t other_col = col + STAGE_COLS; + if (other_col < STAGE_VALID_COLS) + {{ + I_TYPE val_a = I_GLOBAL_LOAD(i_idx); + I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col); + + {'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else + 'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else + 'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else '' + } + I_GLOBAL_STORE(i_idx, val_x); + }} +}} +""") + self.mean_div_forward_krn = Kernel(f""" +{HKernel.define_tensor('I', (1,), i_dtype)} +__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS) +{{ + size_t row = get_global_id(0); + I_GLOBAL_STORE(row*COLS, 
I_GLOBAL_LOAD(row*COLS) / COLS ); +}} +""") diff --git a/xlib/avecl/_internal/op/remap.py b/xlib/avecl/_internal/op/remap.py new file mode 100644 index 0000000..ee4843c --- /dev/null +++ b/xlib/avecl/_internal/op/remap.py @@ -0,0 +1,103 @@ +import numpy as np + +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..info import BroadcastInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor + +def remap (input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor: + """ + remap input_t in spatial axes using coords_t + + arguments + + input_t Tensor( ...,IH,IW ) + + coords_t Tensor( ...,OH,OW,D ) + OH - output height + OW - output width + D is (2)[x,y] coords + + dtype + + ...-head part of shapes will be broadcasted to each other + """ + + op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype) + + output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() ) + + input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer()) + + return output_t + + +class _RemapOp(): + def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype): + if np.dtype(i_dtype).type == np.bool_: + raise ValueError('np.bool_ dtype of i_dtype is not supported.') + if np.dtype(c_dtype).type == np.bool_: + raise ValueError('np.bool_ dtype of c_dtype is not supported.') + if i_shape.ndim < 2: + raise ValueError('i_shape.ndim must be >= 2 (...,H,W)') + if c_shape.ndim < 3: + raise ValueError(f'Coords shape ndim must be >= 3(...,H,W,D)') + if c_shape[-1] != 2: + raise ValueError('Last coords dim must be == 2 (x,y)') + + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype + + if i_shape.ndim == 2 and c_shape.ndim == 3: + # nothing to broadcast + + i_br_shape = i_shape + c_br_shape = c_shape + + o_shape = c_shape[-3:-1] + else: + op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ]) + + i_br_shape = op.br_shapes[0] + i_shape[-2:] + c_br_shape = op.br_shapes[1] + c_shape[-3:] + + o_shape = op.o_shape + c_shape[-3:-1] + + self.o_shape = o_shape + + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" + +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('I', i_br_shape, i_dtype)} +{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)} + + C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)})); + + float cx01 = (float) c_value.x; + float cy01 = (float) c_value.y; + + float cx0f = floor(cx01); int cx0 = (int)cx0f; + float cy0f = floor(cy01); int cy0 = (int)cy0f; + float cx1f = cx0f+1; int cx1 = (int)cx1f; + float cy1f = cy0f+1; int cy1 = (int)cy1f; + + float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')})); + float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')})); + float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')})); + float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')})); + + p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1); + p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1); + p10 
*= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1); + p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1); + + O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11); +}} +""") diff --git a/xlib/avecl/_internal/op/remap_np_affine.py b/xlib/avecl/_internal/op/remap_np_affine.py new file mode 100644 index 0000000..1d97f87 --- /dev/null +++ b/xlib/avecl/_internal/op/remap_np_affine.py @@ -0,0 +1,96 @@ +import numpy as np + +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..SCacheton import SCacheton +from ..Tensor import Tensor + +def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor: + """ + remap affine operator for all channels using single numpy affine mat + + arguments + + input_t Tensor (...,H,W) + + affine_n np.array (2,3) + + dtype + """ + if affine_n.shape != (2,3): + raise ValueError('affine_n.shape must be (2,3)') + + op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype) + + output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() ) + + ((a, b, c), + (d, e, f)) = affine_n + if not inverse: + # do inverse by default, match cv2.warpAffine behaviour + D = a*e - b*d + D = 1.0 / D if D != 0.0 else 0.0 + a, b, c, d, e, f = ( e*D, -b*D, (b*f-e*c)*D , + -d*D, a*D, (d*c-a*f)*D ) + + input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), + np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) ) + + return output_t + + +class _RemapAffineOp(): + def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype): + if np.dtype(i_dtype).type == np.bool_: + raise ValueError('np.bool_ dtype of i_dtype is not supported.') + if i_shape.ndim < 2: + raise ValueError('i_shape.ndim must be >= 2 (...,H,W)') + + IH,IW = i_shape[-2:] + if o_size is not None: + OH,OW = o_size + else: + OH,OW = IH,IW + + o_shape = AShape( (OH,OW) ) + if i_shape.ndim > 2: + o_shape = i_shape[:-2] + o_shape + + self.o_shape = o_shape + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype + + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" + +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('I', i_shape, i_dtype)} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, + float a, float b, float c, + float d, float e, float f) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)} + + float cx01 = om1*a + om2*b + c; + float cy01 = om1*d + om2*e + f; + + float cx0f = floor(cx01); int cx0 = (int)cx0f; + float cy0f = floor(cy01); int cy0 = (int)cy0f; + float cx1f = cx0f+1; int cx1 = (int)cx1f; + float cy1f = cy0f+1; int cy1 = (int)cy1f; + + float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')})); + float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')})); + float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')})); + float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')})); + + p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1); + p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1); + p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1); + p11 *= (cx01 - cx0f)*(cy01 
- cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
+
+    O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
+}}
+""")
\ No newline at end of file
diff --git a/xlib/avecl/_internal/op/reshape.py b/xlib/avecl/_internal/op/reshape.py
new file mode 100644
index 0000000..25d04fa
--- /dev/null
+++ b/xlib/avecl/_internal/op/reshape.py
@@ -0,0 +1,26 @@
+from typing import Iterable
+
+from ..Tensor import Tensor
+from ..SCacheton import SCacheton
+from ..info import ReshapeInfo
+
+
+def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
+    """
+    reshape operator
+
+    arguments
+
+        new_shape   Iterable of ints
+
+        copy(True)  if True, produces a new Tensor,
+                    otherwise the result tensor points to the same memory
+
+    Produces a Tensor with the new shape.
+    """
+    info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )
+
+    if copy:
+        return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
+    return input_t.as_shape( info.o_shape )
+
diff --git a/xlib/avecl/_internal/op/slice_.py b/xlib/avecl/_internal/op/slice_.py
new file mode 100644
index 0000000..2503d42
--- /dev/null
+++ b/xlib/avecl/_internal/op/slice_.py
@@ -0,0 +1,67 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import SliceInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    arguments:
+
+        input_t     input tensor
+        slices      argument received from class.__getitem__(slices)
+
+        output_t    compute result to this Tensor.
+                    Tensor may be with different shape, but should match total size.
+                    gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+
+    Remark.
+
+    Slicing logic is not the same as numpy's:
+    for example, an np_arr[2:0:1] slice produces an empty (zero-size) array,
+    but slice_() selects index 2 for the same spec, same as val_t[2].
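To see the numpy side of the contrast described in the remark (illustrative only; the slice_() behavior is as documented above):

    import numpy as np

    a = np.arange(5)
    print(a[2:0:1])   # [] - numpy yields an empty array when begin > end with a positive step
    print(a[2])       # 2  - the value slice_() is documented to return for the same spec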
+ """ + op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output ) + o_shape = op.slice_info.o_shape + + if output_t is None: + if op.slice_info.just_reshaped: + return input_t.reshape(o_shape) + else: + output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device()) + + elif output_t.shape.size != o_shape.size: + raise ValueError(f'output_t must have size {o_shape.size}') + + input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() ) + + return output_t + + +class _SliceOp: + def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output): + self.slice_info = slice_info = SliceInfo(i_shape, slices) + + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype + + self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f""" +{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )} +{HKernel.define_tensor('I', i_shape, i_dtype )} +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME) +{{ +size_t gid = get_global_id(0); + +{HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)} + +{chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes) ) } + +{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) ); +}} +""") diff --git a/xlib/avecl/_internal/op/slice_set.py b/xlib/avecl/_internal/op/slice_set.py new file mode 100644 index 0000000..5b0ec61 --- /dev/null +++ b/xlib/avecl/_internal/op/slice_set.py @@ -0,0 +1,73 @@ +import numpy as np + +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..HType import HType +from ..info import BroadcastInfo, SliceInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor + + +def slice_set(input_t : Tensor, slices, value) -> Tensor: + """ + arguments: + + input_t input tensor + slices argument received from class.__getitem__(slices) + value + + + Remark. 
+ + """ + if HType.is_scalar_type(value): + v_shape = None + v_dtype = None + v_scalar = value + elif not isinstance(value, Tensor): + value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device()) + v_shape = value.shape + v_dtype = value.dtype + v_scalar = None + + op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) ) + + if v_scalar is not None: + input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() ) + else: + input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() ) + + return input_t + +class _SliceSetOp: + def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices): + slice_info = SliceInfo(i_shape, slices) + + if v_scalar is None: + if v_shape.ndim > i_shape.ndim: + raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}') + + # Check that v_shape can broadcast with slice_info.shape + br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape]) + + v_br_shape = br_info.br_shapes[1] + + self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f""" +{HKernel.define_tensor('O', i_shape, i_dtype )} + +{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME + {', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''}) +{{ +size_t gid = get_global_id(0); + +{HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)} + +if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] + + [f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } ) + + O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} ); +}} +""") diff --git a/xlib/avecl/_internal/op/stack.py b/xlib/avecl/_internal/op/stack.py new file mode 100644 index 0000000..3fe0245 --- /dev/null +++ b/xlib/avecl/_internal/op/stack.py @@ -0,0 +1,74 @@ +import numpy as np +from typing import List + +from ..AShape import AShape +from ..backend import Kernel +from ..HArgs import HArgs +from ..HKernel import HKernel +from ..HType import HType +from ..info import StackInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor + +def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False): + """ + Stack operator. + + arguments: + + tensor_list List of Tensors + + axis Int + + output_t compute result to this Tensor. + Tensor may be with different shape, but should match total size. + gradfn will not be set. + + is_add_to_output add result to output_t if output_t is set. 
+ """ + HArgs.check_zero_get_length(tensor_list) + HArgs.check_all_tensors(tensor_list) + + device = HArgs.check_get_same_device(tensor_list) + + shape_list, dtype_list, _ = HArgs.decompose(tensor_list) + + op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output) + + if output_t is None: + output_t = Tensor (op.info.o_shape, op.o_dtype, device=device) + elif output_t.shape.size != op.info.o_shape.size: + raise ValueError(f'output_t must have size {op.info.o_shape.size}') + + for i, krn in enumerate(op.forward_krns): + device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) ) + + return output_t + + +class _StackOp: + def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output): + self.stack_count = stack_count = len(shape_list) + + i_shape = shape_list[0] + if not all (s == i_shape for s in shape_list): + raise ValueError('All shapes must be the same') + + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list) + self.info = info = StackInfo(i_shape, axis, stack_count) + + self.forward_krns = forward_krns = [] + for i_dtype in dtype_list: + forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f""" +{HKernel.define_tensor('O', info.o_shape, o_dtype )} +{HKernel.define_tensor('I', i_shape, i_dtype )} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)} + + {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) ); +}} +""")) diff --git a/xlib/avecl/_internal/op/tile.py b/xlib/avecl/_internal/op/tile.py new file mode 100644 index 0000000..472bb94 --- /dev/null +++ b/xlib/avecl/_internal/op/tile.py @@ -0,0 +1,58 @@ +import numpy as np +from typing import List + +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..info import TileInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor + +def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False): + """ + Tile operator + + arguments + + tiles Iterable of ints + + dtype + + output_t compute result to this Tensor. + Tensor may be with different shape, but should match total size. + gradfn will not be set. + + is_add_to_output add result to output_t if output_t is set. 
+ """ + + op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output) + + if output_t is None: + output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device()) + elif output_t.shape.size != op.info.o_shape.size: + raise ValueError(f'output_t must have size {op.info.o_shape.size}') + + input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer()) + + return output_t + +class _TileOp: + def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output): + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype + + self.info = info = TileInfo(i_shape, tiles) + + self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f""" + +{HKernel.define_tensor('I', i_shape, i_dtype)} +{HKernel.define_tensor('O', info.o_shape, o_dtype)} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME) +{{ + size_t gid = get_global_id(0); + {HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)} + + {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) ); +}} +""") + diff --git a/xlib/avecl/_internal/op/transpose.py b/xlib/avecl/_internal/op/transpose.py new file mode 100644 index 0000000..43501bc --- /dev/null +++ b/xlib/avecl/_internal/op/transpose.py @@ -0,0 +1,68 @@ +import numpy as np + +from ..AAxes import AAxes +from ..AShape import AShape +from ..backend import Kernel +from ..HKernel import HKernel +from ..info import TransposeInfo +from ..SCacheton import SCacheton +from ..Tensor import Tensor + +def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor: + """ + arguments: + + axes_order Int + Iterable of ints + None + + dtype cast to dtype + + op_text(None) optional op with value during transpose. + 'O = I' + + output_t compute result to this Tensor. 
+ Tensor may be with different shape, but should match total size + + """ + op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output ) + + if output_t is None: + output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device()) + elif output_t.shape.size != op.o_shape.size: + raise ValueError(f'output_t must have size {op.o_shape.size}') + + input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() ) + + return output_t + + +class _TransposeOp: + def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ): + self.axes_order = axes_order + self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape + self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype + + if op_text is None: + op_text = 'O = I' + + self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f""" +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('I', i_shape, i_dtype)} +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME) +{{ + +size_t gid = get_global_id(0); + +{HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)} + +I_TYPE I = I_GLOBAL_LOAD(gid); +O_TYPE O; + +{op_text}; + +{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O ); + +}}""") + + diff --git a/xlib/avecl/_internal/op/unused.zip b/xlib/avecl/_internal/op/unused.zip new file mode 100644 index 0000000..76ba9ef Binary files /dev/null and b/xlib/avecl/_internal/op/unused.zip differ diff --git a/xlib/avecl/_internal/op/warp_affine.py b/xlib/avecl/_internal/op/warp_affine.py new file mode 100644 index 0000000..0f28725 --- /dev/null +++ b/xlib/avecl/_internal/op/warp_affine.py @@ -0,0 +1,72 @@ +import numpy as np + +from ..AShape import AShape +from ..initializer import InitCoords2DArange +from ..SCacheton import SCacheton +from ..Tensor import Tensor +from .matmul import matmul +from .remap import remap + +def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor: + """ + arguments + + input_t Tensor(...,H,W) + + affine_t Tensor(...,2,3) + affine matrix + + example of identity affine matrix + [1,0,0], + [0,1,0] + + ...-head part of shapes will be broadcasted to each other + + output_size(None) + + tuple of 2 ints (HW) + of output size + if None , size will not be changed + + """ + op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype, affine_t.shape, affine_t.dtype, output_size) + + affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) ) + + coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init ) + coords_t = coords_t.reshape(op.coords_reshape) + coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape) + + output_t = remap(input_t, coords_t, dtype=dtype) + return output_t + + +class _WarpAffineOp(): + def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size): + if np.dtype(i_dtype).type == np.bool_: + raise ValueError('np.bool_ dtype of i_dtype is not supported.') + if np.dtype(a_dtype).type == np.bool_: + raise ValueError('np.bool_ dtype of a_dtype is not supported.') + if i_shape.ndim < 2: + raise ValueError('i_shape.ndim must be >= 2 (...,H,W)') + if a_shape.ndim < 2: + raise ValueError(f'a_shape.ndim must be >= 2 (...,2,3)') + if 
a_shape[-2] != 2 or a_shape[-1] != 3: + raise ValueError('Last a_shape dims must be == (...,2,3)') + + IH,IW = i_shape[-2:] + if o_size is not None: + OH,OW = o_size + else: + OH,OW = IH,IW + + self.coords_shape = AShape( (OH,OW,3) ) + self.coords_affined_shape = AShape( (OH,OW,2) ) + + if a_shape.ndim > 2: + self.coords_shape = a_shape[:-2] + self.coords_shape + self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape + + self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1) + self.coords_reshape = (-1,OH*OW,3) + self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)
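A shape-level sketch of the warp_affine composition above (coords grid -> matmul with the transposed affine -> remap), in plain numpy for illustration:

    import numpy as np

    OH, OW = 3, 4
    xs, ys = np.meshgrid(np.arange(OW, dtype=np.float32),
                         np.arange(OH, dtype=np.float32))
    coords = np.stack([xs, ys, np.ones_like(xs)], -1)      # (OH,OW,3): (x, y, 1)
    affine = np.array([[1, 0, 10],
                       [0, 1, 5]], np.float32)             # shift x by 10, y by 5
    out = coords.reshape(-1, 3) @ affine.T                 # (OH*OW, 2) sampling coords
    assert (out.reshape(OH, OW, 2)[0, 0] == [10, 5]).all()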