mirror of
https://github.com/iperov/DeepFaceLive
synced 2025-08-19 13:09:58 -07:00
add xlib.avecl
This commit is contained in:
parent
932edfe875
commit
0058474da7
56 changed files with 5569 additions and 0 deletions
31
xlib/avecl/__init__.py
Normal file
31
xlib/avecl/__init__.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
"""
|
||||
AveCL ! Make OpenCL great again.
|
||||
|
||||
Lightweight ndarray library using OpenCL 1.2 written in pure python.
|
||||
Applicable for high-performance general purpose n-dim array computations for every device that supports OpenCL 1.2.
|
||||
|
||||
Works in python 3.5+. Dependencies: numpy.
|
||||
|
||||
This lib uses relative import, thus you can place it in any subfolder.
|
||||
|
||||
made by @iperov from scratch.
|
||||
"""
|
||||
|
||||
from ._internal.AAxes import AAxes
|
||||
from ._internal.AShape import AShape
|
||||
from ._internal.backend import (Device, DeviceInfo, Kernel,
|
||||
get_available_devices_info, get_best_device,
|
||||
get_default_device, get_device,
|
||||
set_default_device)
|
||||
from ._internal.HArgs import HArgs
|
||||
from ._internal.HKernel import HKernel
|
||||
from ._internal.HTensor import HTensor
|
||||
from ._internal.HType import HType
|
||||
from ._internal.initializer import (InitCoords2DArange, Initializer,
|
||||
InitRandomUniform)
|
||||
from ._internal.NCore import NCore
|
||||
from ._internal.NTest import NTest
|
||||
from ._internal.op import *
|
||||
from ._internal.SCacheton import SCacheton
|
||||
from ._internal.Tensor import Tensor
|
||||
from ._internal.TensorImpl import *
|
169
xlib/avecl/_internal/AAxes.py
Normal file
169
xlib/avecl/_internal/AAxes.py
Normal file
|
@ -0,0 +1,169 @@
|
|||
from collections import Iterable
|
||||
|
||||
class AAxes(Iterable):
|
||||
__slots__ = ['axes','ndim','_inversed']
|
||||
|
||||
def __init__(self, axes, shape_ndim=None):
|
||||
"""
|
||||
Constructs AAxes from user argument
|
||||
|
||||
arguments
|
||||
|
||||
axes AAxes
|
||||
Int
|
||||
Iterable of ints
|
||||
None
|
||||
|
||||
shape_ndim(None) provide shape_ndim if axes contain negative values
|
||||
|
||||
can raise an errors during the construction
|
||||
|
||||
AAxes supports:
|
||||
|
||||
A+B : concat A_axes with B_axes
|
||||
|
||||
A-B : removes B_axes from A_axes
|
||||
"""
|
||||
|
||||
if isinstance(axes, AAxes):
|
||||
self.axes = axes.axes
|
||||
self.ndim = axes.ndim
|
||||
self._inversed = axes._inversed
|
||||
elif axes is None:
|
||||
self.axes = None
|
||||
self.ndim = None
|
||||
self._inversed = None
|
||||
else:
|
||||
if not isinstance(axes, Iterable):
|
||||
axes = (axes,)
|
||||
|
||||
if isinstance(axes, Iterable):
|
||||
valid_axes = []
|
||||
for x in axes:
|
||||
if x is None:
|
||||
raise ValueError(f'Incorrent value {x} in axes {axes}')
|
||||
x = int(x)
|
||||
if x < 0:
|
||||
if shape_ndim is None:
|
||||
raise ValueError(f'Incorrent value {x} in axes {axes}, or provide shape_ndim')
|
||||
x = shape_ndim + x
|
||||
|
||||
if x in valid_axes:
|
||||
raise ValueError(f'Axes must contain unique values.')
|
||||
valid_axes.append(x)
|
||||
|
||||
self.axes = tuple(valid_axes)
|
||||
self.ndim = len(self.axes)
|
||||
self._inversed = None
|
||||
|
||||
def is_none_axes(self):
|
||||
"""
|
||||
returns True if AAxes is constructed with (None) argument, i.e. all-axes
|
||||
"""
|
||||
return self.axes is None
|
||||
|
||||
def sorted(self) -> 'AAxes':
|
||||
"""
|
||||
returns sorted AAxes
|
||||
"""
|
||||
return AAxes(sorted(self.axes))
|
||||
|
||||
def swapped_axes(self, axis_a, axis_b) -> 'AAxes':
|
||||
x = list(self.axes)
|
||||
if axis_a < 0:
|
||||
axis_a = len(x) + axis_a
|
||||
if axis_b < 0:
|
||||
axis_b = len(x) + axis_b
|
||||
|
||||
x[axis_b], x[axis_a] = x[axis_a], x[axis_b]
|
||||
|
||||
return AAxes( tuple(x) )
|
||||
|
||||
def inversed(self) -> 'AAxes':
|
||||
"""
|
||||
Returns inversed axes order
|
||||
|
||||
Example:
|
||||
|
||||
for (0,2,3,1) returns (0,3,1,2)
|
||||
"""
|
||||
if self.is_none_axes():
|
||||
raise Exception(f'none-axes does not support inversed(). Handle none-axes by calling .is_none_axes()')
|
||||
|
||||
if self._inversed is None:
|
||||
x = { axis:i for i,axis in enumerate(self.axes) }
|
||||
t = []
|
||||
for i in range(self.ndim):
|
||||
axis = x.get(i, None)
|
||||
if axis is None:
|
||||
raise Exception(f'axes {self.axes} are inconsistent to do inverse order.')
|
||||
t.append(axis)
|
||||
self._inversed = AAxes(t)
|
||||
|
||||
return self._inversed
|
||||
|
||||
|
||||
def __hash__(self): return self.axes.__hash__()
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, AAxes):
|
||||
return self.axes == other.axes
|
||||
elif isinstance(other, Iterable):
|
||||
return self.axes == tuple(other)
|
||||
return False
|
||||
def __iter__(self):
|
||||
if self.is_none_axes():
|
||||
raise Exception(f'none-axes does not support iteration. Handle none-axes by calling .is_none_axes()')
|
||||
return self.axes.__iter__()
|
||||
|
||||
def __len__(self): return self.ndim
|
||||
def __getitem__(self,key):
|
||||
if self.is_none_axes():
|
||||
raise Exception(f'none-axes does not support indexing. Handle none-axes by calling .is_none_axes()')
|
||||
|
||||
elif isinstance(key, slice):
|
||||
return AAxes(self.axes[key])
|
||||
|
||||
return self.axes[key]
|
||||
|
||||
def __radd__(self, o):
|
||||
if isinstance(o, Iterable):
|
||||
return AAxes( tuple(o) + self.axes)
|
||||
else:
|
||||
raise ValueError(f'unable to use type {o.__class__} in AAxes append')
|
||||
def __add__(self, o):
|
||||
if isinstance(o, Iterable):
|
||||
return AAxes( self.axes + tuple(o) )
|
||||
else:
|
||||
raise ValueError(f'unable to use type {o.__class__} in AAxes append')
|
||||
|
||||
def __rsub__(self, o):
|
||||
if isinstance(o, Iterable):
|
||||
new_axes = []
|
||||
for axis in o:
|
||||
if axis not in self.axes:
|
||||
new_axes.append(axis)
|
||||
|
||||
return AAxes(new_axes)
|
||||
else:
|
||||
raise ValueError(f'unable to use type {o.__class__} in AAxes substraction')
|
||||
|
||||
def __sub__(self, o):
|
||||
if isinstance(o, Iterable):
|
||||
new_axes = []
|
||||
o_axes = tuple(o)
|
||||
for axis in self.axes:
|
||||
if axis not in o_axes:
|
||||
new_axes.append(axis)
|
||||
|
||||
return AAxes(new_axes)
|
||||
else:
|
||||
raise ValueError(f'unable to use type {o.__class__} in AAxes substraction')
|
||||
|
||||
def __str__(self):
|
||||
if self.is_none_axes():
|
||||
return '(None)'
|
||||
return str(self.axes)
|
||||
|
||||
def __repr__(self): return 'AAxes' + self.__str__()
|
||||
|
||||
__all__ = ['AAxes']
|
103
xlib/avecl/_internal/AShape.py
Normal file
103
xlib/avecl/_internal/AShape.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
from collections import Iterable
|
||||
from .AAxes import AAxes
|
||||
|
||||
class AShape(Iterable):
|
||||
__slots__ = ['shape','size','ndim']
|
||||
|
||||
def __init__(self, shape):
|
||||
"""
|
||||
Constructs valid shape from user argument
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
Iterable
|
||||
|
||||
can raise ValueError during the construction
|
||||
"""
|
||||
|
||||
if isinstance(shape, AShape):
|
||||
self.shape = shape.shape
|
||||
self.size = shape.size
|
||||
self.ndim = shape.ndim
|
||||
else:
|
||||
if isinstance(shape, (int,float) ):
|
||||
shape = (int(shape),)
|
||||
|
||||
if isinstance(shape, Iterable):
|
||||
size = 1
|
||||
valid_shape = []
|
||||
for x in shape:
|
||||
if x is None:
|
||||
raise ValueError(f'Incorrent value {x} in shape {shape}')
|
||||
x = int(x)
|
||||
if x < 1:
|
||||
raise ValueError(f'Incorrent value {x} in shape {shape}')
|
||||
valid_shape.append(x)
|
||||
size *= x # Faster than np.prod()
|
||||
|
||||
self.shape = tuple(valid_shape)
|
||||
self.ndim = len(self.shape)
|
||||
if self.ndim == 0:
|
||||
# Force (1,) shape for scalar shape
|
||||
self.ndim = 1
|
||||
self.shape = (1,)
|
||||
self.size = size
|
||||
else:
|
||||
raise ValueError('Invalid type to create AShape')
|
||||
|
||||
def axes_arange(self) -> AAxes:
|
||||
"""
|
||||
Returns tuple of axes arange.
|
||||
|
||||
Example (0,1,2) for ndim 3
|
||||
"""
|
||||
return AAxes(range(self.ndim))
|
||||
|
||||
def transpose_by_axes(self, axes) -> 'AShape':
|
||||
"""
|
||||
Same as AShape[axes]
|
||||
|
||||
Returns AShape transposed by axes.
|
||||
|
||||
axes AAxes
|
||||
Iterable(list,tuple,set,generator)
|
||||
"""
|
||||
return AShape(self.shape[axis] for axis in AAxes(axes) )
|
||||
|
||||
def __hash__(self): return self.shape.__hash__()
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, AShape):
|
||||
return self.shape == other.shape
|
||||
elif isinstance(other, Iterable):
|
||||
return self.shape == tuple(other)
|
||||
return False
|
||||
def __iter__(self): return self.shape.__iter__()
|
||||
def __len__(self): return len(self.shape)
|
||||
def __getitem__(self,key):
|
||||
if isinstance(key, Iterable):
|
||||
if isinstance(key, AAxes):
|
||||
if key.is_none_axes():
|
||||
return self
|
||||
return self.transpose_by_axes(key)
|
||||
elif isinstance(key, slice):
|
||||
return AShape(self.shape[key])
|
||||
|
||||
return self.shape[key]
|
||||
|
||||
def __radd__(self, o):
|
||||
if isinstance(o, Iterable):
|
||||
return AShape( tuple(o) + self.shape)
|
||||
else:
|
||||
raise ValueError(f'unable to use type {o.__class__} in AShape append')
|
||||
def __add__(self, o):
|
||||
if isinstance(o, Iterable):
|
||||
return AShape( self.shape + tuple(o) )
|
||||
else:
|
||||
raise ValueError(f'unable to use type {o.__class__} in AShape append')
|
||||
|
||||
|
||||
def __str__(self): return str(self.shape)
|
||||
def __repr__(self): return 'AShape' + self.__str__()
|
||||
|
||||
__all__ = ['AShape']
|
106
xlib/avecl/_internal/HArgs.py
Normal file
106
xlib/avecl/_internal/HArgs.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .backend import Device
|
||||
from .HTensor import HTensor
|
||||
from .HType import HType
|
||||
from .Tensor import Tensor
|
||||
from .AShape import AShape
|
||||
|
||||
class HArgs:
|
||||
"""
|
||||
Helper functions for list of arguments
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def decompose(args):
|
||||
"""
|
||||
decompose list of args of Tensor and supported numeric values
|
||||
|
||||
returns ( shape_list, # if scalar value -> shape is None
|
||||
dtype_list, #
|
||||
kernel_args_list #
|
||||
)
|
||||
"""
|
||||
shape_list = []
|
||||
dtype_list = []
|
||||
kernel_args_list = []
|
||||
for arg in args:
|
||||
|
||||
if isinstance(arg, Tensor):
|
||||
shape_list.append(arg.shape)
|
||||
dtype_list.append(arg.dtype)
|
||||
kernel_args_list.append(arg.get_buffer())
|
||||
else:
|
||||
|
||||
if isinstance(arg, int):
|
||||
dtype, arg = np.int32, np.int32(arg)
|
||||
elif isinstance(arg, float):
|
||||
dtype, arg = np.float32, np.float32(arg)
|
||||
elif HType.is_obj_of_np_scalar_type(arg):
|
||||
dtype = arg.__class__
|
||||
else:
|
||||
raise ValueError(f'Unsupported type of arg: {arg.__class__} Use Tensor or number type.')
|
||||
|
||||
shape_list.append(None)
|
||||
dtype_list.append(dtype)
|
||||
kernel_args_list.append(arg)
|
||||
|
||||
return tuple(shape_list), tuple(dtype_list), tuple(kernel_args_list)
|
||||
|
||||
@staticmethod
|
||||
def get_shapes(args : List[Tensor]) -> List[AShape]:
|
||||
"""
|
||||
"""
|
||||
return tuple(t.shape for t in args)
|
||||
|
||||
@staticmethod
|
||||
def check_zero_get_length(args) -> int:
|
||||
"""
|
||||
raises an error if len(args) == 0, otherwise returns len
|
||||
"""
|
||||
args_len = len(args)
|
||||
if len(args) == 0:
|
||||
raise ValueError('args must be specified')
|
||||
return args_len
|
||||
|
||||
@staticmethod
|
||||
def check_get_same_device(args : List[Tensor]) -> Device:
|
||||
"""
|
||||
check all device of tensors are the same and return the device
|
||||
"""
|
||||
result = HTensor.all_same_device(args)
|
||||
if not result:
|
||||
raise ValueError('all Tensors must have the same device')
|
||||
return args[0].get_device()
|
||||
|
||||
@staticmethod
|
||||
def check_all_tensors(args : List[Tensor]):
|
||||
"""
|
||||
"""
|
||||
if not all (isinstance(tensor, Tensor) for tensor in args):
|
||||
raise ValueError('All values must have type of Tensor')
|
||||
|
||||
@staticmethod
|
||||
def check_get_same_shape(args : List[Tensor]) -> AShape:
|
||||
"""
|
||||
check all shapes of tensors are the same and return the shape
|
||||
"""
|
||||
shape = args[0].shape
|
||||
if not all (t.shape == shape for t in args):
|
||||
raise ValueError('All tensors must have the same shape')
|
||||
return shape
|
||||
|
||||
|
||||
@staticmethod
|
||||
def filter_tensor(args, raise_on_empty : bool):
|
||||
"""
|
||||
get only tensors from the list
|
||||
"""
|
||||
tensor_args = [arg for arg in args if isinstance(arg, Tensor) ]
|
||||
if raise_on_empty and len(tensor_args) == 0:
|
||||
raise ValueError('At least one arg must be a Tensor')
|
||||
return tensor_args
|
||||
|
||||
__all__ = ['HArgs']
|
496
xlib/avecl/_internal/HKernel.py
Normal file
496
xlib/avecl/_internal/HKernel.py
Normal file
|
@ -0,0 +1,496 @@
|
|||
import numpy as np
|
||||
|
||||
class HKernel:
|
||||
"""
|
||||
Helper functions for Kernels
|
||||
"""
|
||||
|
||||
_np_dtype_to_cl = { np.bool_ : 'bool',
|
||||
np.int8 : 'char',
|
||||
np.uint8 : 'uchar',
|
||||
np.int16 : 'short',
|
||||
np.uint16 : 'ushort',
|
||||
np.int32 : 'int',
|
||||
np.uint32 : 'uint',
|
||||
np.int64 : 'long',
|
||||
np.uint64 : 'ulong',
|
||||
np.float16 : 'half',
|
||||
np.float32 : 'float',
|
||||
np.float64 : 'double'
|
||||
}
|
||||
|
||||
|
||||
|
||||
@staticmethod
|
||||
def np_dtype_to_cl(dtype : np.dtype):
|
||||
"""
|
||||
returns string opencl type from numpy dtype
|
||||
|
||||
example np.float32 -> 'float'
|
||||
np.uint8 -> 'unsigned char'
|
||||
"""
|
||||
return HKernel._np_dtype_to_cl[np.dtype(dtype).type]
|
||||
|
||||
@staticmethod
|
||||
def define_scalar_func_arg(name, dtype : np.dtype):
|
||||
"""
|
||||
"""
|
||||
return f'{HKernel._np_dtype_to_cl[np.dtype(dtype).type]} {name}'
|
||||
|
||||
@staticmethod
|
||||
def define_tensor_type(name, dtype : np.dtype):
|
||||
"""
|
||||
Returns a definitions for operations with tensor
|
||||
|
||||
example for 'O', np.float16:
|
||||
|
||||
#define O_PTR_NAME p_O
|
||||
#define O_PTR_TYPE half
|
||||
#define O_PTR_TYPE2 half2
|
||||
#define O_PTR_TYPE3 half3
|
||||
#define O_PTR_TYPE4 half4
|
||||
#define O_PTR_TYPE8 half8
|
||||
#define O_PTR_TYPE16 half16
|
||||
#define O_TYPE float
|
||||
#define O_TYPE2 float2
|
||||
#define O_TYPE3 float3
|
||||
#define O_TYPE4 float4
|
||||
#define O_TYPE8 float8
|
||||
#define O_TYPE16 float16
|
||||
#define O_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_TO_FLOATX(x) ((float)x)
|
||||
"""
|
||||
name_upper = name.upper()
|
||||
|
||||
dtype = np.dtype(dtype).type
|
||||
|
||||
out = [f'#define {name.upper()}_PTR_NAME p_{name.upper()}']
|
||||
|
||||
if dtype == np.float16:
|
||||
out += [f'#define {name_upper}_PTR_TYPE half']
|
||||
out += [f'#define {name_upper}_PTR_TYPE2 half2']
|
||||
out += [f'#define {name_upper}_PTR_TYPE3 half3']
|
||||
out += [f'#define {name_upper}_PTR_TYPE4 half4']
|
||||
out += [f'#define {name_upper}_PTR_TYPE8 half8']
|
||||
out += [f'#define {name_upper}_PTR_TYPE16 half16']
|
||||
out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(np.float32)}']
|
||||
out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(np.float32)}2']
|
||||
out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(np.float32)}3']
|
||||
out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(np.float32)}4']
|
||||
out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(np.float32)}8']
|
||||
out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(np.float32)}16']
|
||||
|
||||
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
|
||||
|
||||
else:
|
||||
out += [f'#define {name_upper}_PTR_TYPE {HKernel.np_dtype_to_cl(dtype)}']
|
||||
out += [f'#define {name_upper}_PTR_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2']
|
||||
out += [f'#define {name_upper}_PTR_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3']
|
||||
out += [f'#define {name_upper}_PTR_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4']
|
||||
out += [f'#define {name_upper}_PTR_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8']
|
||||
out += [f'#define {name_upper}_PTR_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
|
||||
out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(dtype)}']
|
||||
out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2']
|
||||
out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3']
|
||||
out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4']
|
||||
out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8']
|
||||
out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
|
||||
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD(offset) {name_upper}_PTR_NAME[(offset)]']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD2(offset) {name_upper}_PTR_NAME[(offset)]']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD3(offset) {name_upper}_PTR_NAME[(offset)]']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD4(offset) {name_upper}_PTR_NAME[(offset)]']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD8(offset) {name_upper}_PTR_NAME[(offset)]']
|
||||
out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) {name_upper}_PTR_NAME[(offset)]']
|
||||
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
|
||||
out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
|
||||
|
||||
if dtype in [np.float32, np.float64]:
|
||||
out += [f'#define {name_upper}_TO_FLOATX(x) x']
|
||||
elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
|
||||
out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
|
||||
elif dtype in [np.int64,np.uint64]:
|
||||
out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)']
|
||||
return '\n'.join(out)
|
||||
|
||||
@staticmethod
|
||||
def define_tensor_shape(name, shape, axes_symbols=None):
|
||||
"""
|
||||
Returns a definitions for operations with tensor
|
||||
|
||||
example for 'O', (7,3),
|
||||
|
||||
#define O0 7
|
||||
#define O1 3
|
||||
#define Om1 3
|
||||
#define Om2 7
|
||||
#define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
|
||||
#define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
|
||||
"""
|
||||
shape = tuple(shape)
|
||||
ndim = len(shape)
|
||||
name_upper = name.upper()
|
||||
name_lower = name.lower()
|
||||
|
||||
if axes_symbols is None:
|
||||
axes_symbols = "".join([str(i) for i in range(ndim)])
|
||||
axes_symbols = axes_symbols.upper()
|
||||
|
||||
out = []
|
||||
for i in range(ndim):
|
||||
out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}']
|
||||
|
||||
for i in range(1,ndim+1):
|
||||
out += [f'#define {name_upper}m{i} {shape[-i]}']
|
||||
|
||||
line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) '
|
||||
|
||||
for i in range(ndim):
|
||||
if i == 0:
|
||||
line += f'( (size_t)({name_lower}{i}) )'
|
||||
else:
|
||||
line += f'( {name_lower}{i} )'
|
||||
|
||||
for j in range(i+1,ndim):
|
||||
line += f'*{shape[j]} '
|
||||
if i != ndim-1:
|
||||
line += '+'
|
||||
|
||||
out += [line]
|
||||
|
||||
line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) '
|
||||
|
||||
for i in range(ndim):
|
||||
if i == 0:
|
||||
line += f'( (size_t)({name_lower}{i}) % {shape[i]} )'
|
||||
else:
|
||||
line += f'( ({name_lower}{i}) % {shape[i]} )'
|
||||
|
||||
for j in range(i+1,ndim):
|
||||
line += f'*{shape[j]} '
|
||||
if i != ndim-1:
|
||||
line += '+'
|
||||
|
||||
out += [line,'']
|
||||
|
||||
return '\n'.join(out)
|
||||
|
||||
@staticmethod
|
||||
def define_tensor(name, shape, dtype : np.dtype, axes_symbols=None):
|
||||
"""
|
||||
Returns a definitions for operations with tensor
|
||||
|
||||
arguments
|
||||
|
||||
name text
|
||||
|
||||
shape Iterable
|
||||
|
||||
dtype np.dtype
|
||||
|
||||
axes_symbols(None) string of symbols.
|
||||
None -> numeric symbols will be used
|
||||
|
||||
example for 'O', (2,4), np.float16
|
||||
|
||||
#define O_PTR_NAME p_O
|
||||
#define O_PTR_TYPE half
|
||||
#define O_PTR_TYPE2 half2
|
||||
#define O_PTR_TYPE3 half3
|
||||
#define O_PTR_TYPE4 half4
|
||||
#define O_PTR_TYPE8 half8
|
||||
#define O_PTR_TYPE16 half16
|
||||
#define O_TYPE float
|
||||
#define O_TYPE2 float2
|
||||
#define O_TYPE3 float3
|
||||
#define O_TYPE4 float4
|
||||
#define O_TYPE8 float8
|
||||
#define O_TYPE16 float16
|
||||
#define O_GLOBAL_LOAD(offset) vload_half (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD2(offset) vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD3(offset) vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD4(offset) vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD8(offset) vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE(offset,value) vstore_half ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE2(offset,value) vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE3(offset,value) vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE4(offset,value) vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE8(offset,value) vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
|
||||
#define O_TO_FLOATX(x) ((float)x)
|
||||
#define O0 2
|
||||
#define O1 4
|
||||
#define Om1 4
|
||||
#define Om2 2
|
||||
#define O_IDX(o0,o1) ( (size_t)(o0) )*4 +( o1 )
|
||||
#define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 2 )*4 +( (o1) % 4 )
|
||||
"""
|
||||
return'\n'.join([ HKernel.define_tensor_type(name, dtype),
|
||||
HKernel.define_tensor_shape(name, shape, axes_symbols)
|
||||
])
|
||||
|
||||
@staticmethod
|
||||
def define_axes_sizes(axis_letter, axes_sizes):
|
||||
"""
|
||||
Returns definitions of axes sizes
|
||||
|
||||
example for 'O', (4,512,512)
|
||||
#define O0 4
|
||||
#define O1 512
|
||||
#define O2 512
|
||||
"""
|
||||
out = ""
|
||||
axes_sizes = tuple(axes_sizes)
|
||||
ndim = len(axes_sizes)
|
||||
for i in range(ndim):
|
||||
out += f'#define {axis_letter.upper()}{i} {axes_sizes[i]}\n'
|
||||
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
def decompose_idx_to_axes_idxs(var_name, tensor_name, ndim):
|
||||
"""
|
||||
decompose a size_t variable to axes indexes.
|
||||
Keeps original variable untouched.
|
||||
|
||||
Example for 'gid','O',2
|
||||
|
||||
size_t gid_original = gid;
|
||||
size_t o1 = gid % O1; gid /= O1;
|
||||
#define om1 o1
|
||||
size_t o0 = gid % O0;
|
||||
#define om2 o0
|
||||
gid = gid_original;
|
||||
"""
|
||||
name_lower = tensor_name.lower()
|
||||
name_upper = tensor_name.upper()
|
||||
|
||||
out = [f'size_t {var_name}_original = {var_name};']
|
||||
|
||||
for i in range(ndim-1,-1,-1):
|
||||
line = f'size_t {name_lower}{i} = {var_name} % {name_upper}{i};'
|
||||
if i > 0:
|
||||
line += f' {var_name} /= {name_upper}{i};'
|
||||
out += [line]
|
||||
out += [f'#define {name_lower}m{ndim-i} {name_lower}{i}']
|
||||
|
||||
out += [f'{var_name} = {var_name}_original;']
|
||||
return '\n'.join(out)
|
||||
|
||||
@staticmethod
|
||||
def axes_order_enum(tensor_name, axes_order):
|
||||
"""
|
||||
returns axis enumeration with given order
|
||||
|
||||
Example
|
||||
('I', (1,2,0)) returns 'i1,i2,i0'
|
||||
('I', 'HW') return 'ih,iw'
|
||||
"""
|
||||
if isinstance(axes_order, str):
|
||||
axes_order = axes_order.lower()
|
||||
else:
|
||||
axes_order = tuple(axes_order)
|
||||
|
||||
name_lower = tensor_name.lower()
|
||||
|
||||
return ','.join( [ f'{name_lower}{axes_order[axis]}' for axis in range(len(axes_order)) ])
|
||||
|
||||
@staticmethod
|
||||
def axes_seq_enum(tensor_name, ndim, new_axis=None, zero_axes=None, suffix=None):
|
||||
"""
|
||||
returns axis sequental enumeration with given ndim
|
||||
|
||||
Example
|
||||
|
||||
('I', 4) returns 'i0,i1,i2,i3'
|
||||
|
||||
('I', 4, new_axis=('name',1) ) returns 'i0,name,i1,i2,i3'
|
||||
|
||||
('I', 3, zero_axes=(1,) ) returns 'i0,0,i2'
|
||||
|
||||
('I', 2, suffix='ih,iw' ) returns 'i0,i1,ih,iw'
|
||||
"""
|
||||
name_lower = tensor_name.lower()
|
||||
if zero_axes is not None:
|
||||
axes = [ '0' if axis in zero_axes else f'{name_lower}{axis}' for axis in range(ndim) ]
|
||||
else:
|
||||
axes = [ f'{name_lower}{axis}' for axis in range(ndim) ]
|
||||
|
||||
if suffix is None:
|
||||
suffix = []
|
||||
else:
|
||||
suffix = [suffix]
|
||||
|
||||
if new_axis is not None:
|
||||
name, axis = new_axis
|
||||
return','.join(axes[:axis] + [name] + axes[axis:] + suffix)
|
||||
else:
|
||||
return ','.join(axes+ suffix)
|
||||
|
||||
@staticmethod
|
||||
def include_constants_pi():
|
||||
"""
|
||||
defines PI constants
|
||||
|
||||
PI_F
|
||||
PI_2_F
|
||||
PI_4_F
|
||||
"""
|
||||
return f"""
|
||||
#define PI_F 3.14159274101257f
|
||||
#define PI_2_F 1.57079637050629f
|
||||
#define PI_4_F 0.78539818525314f
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def include_hash():
|
||||
"""
|
||||
returns hash functions:
|
||||
|
||||
uint hash_uint_uint(uint v)
|
||||
ulong hash_ulong_from_ulong(ulong x)
|
||||
float hash_float_from_uint(uint x)
|
||||
double hash_double_from_ulong(ulong x)
|
||||
"""
|
||||
|
||||
return f"""
|
||||
|
||||
#define UIF (1.0 / (float)(0xffffffffU))
|
||||
|
||||
//from Chris Wellons https://nullprogram.com/blog/2018/07/31/ https://www.shadertoy.com/view/WttXWX
|
||||
uint hash_uint_from_uint(uint x)
|
||||
{{
|
||||
x ^= x >> 17;
|
||||
x *= 0xed5ad4bbU;
|
||||
x ^= x >> 11;
|
||||
x *= 0xac4c1b51U;
|
||||
x ^= x >> 15;
|
||||
x *= 0x31848babU;
|
||||
x ^= x >> 14;
|
||||
return x;
|
||||
}}
|
||||
|
||||
ulong hash_ulong_from_ulong(ulong x)
|
||||
{{
|
||||
x ^= x >> 32;
|
||||
x *= 0xd6e8feb86659fd93U;
|
||||
x ^= x >> 32;
|
||||
x *= 0xd6e8feb86659fd93U;
|
||||
x ^= x >> 32;
|
||||
return x;
|
||||
}}
|
||||
|
||||
float hash_float_from_uint(uint x)
|
||||
{{
|
||||
return hash_uint_from_uint(x) / (float)(0xffffffffU);
|
||||
}}
|
||||
|
||||
double hash_double_from_ulong(ulong x)
|
||||
{{
|
||||
return (double)hash_ulong_from_ulong(x) / (double)(0xffffffffffffffffU);
|
||||
}}
|
||||
|
||||
/*****************************
|
||||
UNUSED CODE
|
||||
|
||||
//---------- PCG hashes from https://www.shadertoy.com/view/XlGcRh
|
||||
uint hash_uint_uint(uint v)
|
||||
{{
|
||||
uint state = v * 747796405u + 2891336453u;
|
||||
uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
|
||||
return (word >> 22u) ^ word;
|
||||
}}
|
||||
|
||||
uint2 hash_uint2_uint2 (uint2 v)
|
||||
{{
|
||||
v = v * 1664525u + 1013904223u;
|
||||
v.x += v.y * 1664525u;
|
||||
v.y += v.x * 1664525u;
|
||||
v ^= v>>16u;
|
||||
v.x += v.y * 1664525u;
|
||||
v.y += v.x * 1664525u;
|
||||
v ^= v>>16u;
|
||||
return v;
|
||||
}}
|
||||
|
||||
uint3 hash_uint3_uint3(uint3 v)
|
||||
{{
|
||||
v = v * 1664525u + 1013904223u;
|
||||
v.x += v.y*v.z;
|
||||
v.y += v.z*v.x;
|
||||
v.z += v.x*v.y;
|
||||
v ^= v >> 16u;
|
||||
v.x += v.y*v.z;
|
||||
v.y += v.z*v.x;
|
||||
v.z += v.x*v.y;
|
||||
return v;
|
||||
}}
|
||||
|
||||
float hash_float_uint(uint v)
|
||||
{{
|
||||
return (float)( hash_uint_uint(v) ) * UIF;
|
||||
}}
|
||||
|
||||
float2 hash_float2_uint (uint v)
|
||||
{{
|
||||
uint2 q = hash_uint2_uint2( (uint2)(v, 1) );
|
||||
return (float2)(q.x, q.y) * UIF;
|
||||
}}
|
||||
|
||||
float3 hash_float3_uint (uint v)
|
||||
{{
|
||||
uint3 q = hash_uint3_uint3( (uint3)(v, 1, 1) );
|
||||
return (float3)(q.x, q.y, q.z) * UIF;
|
||||
}}
|
||||
|
||||
//---------- Classic hashes used in shaders
|
||||
|
||||
float hash_float_float(float p)
|
||||
{{
|
||||
|
||||
float x = sin(p*12.9898)*43758.5453;
|
||||
return x - floor(x);
|
||||
}}
|
||||
|
||||
float hash_float_float2(float2 p)
|
||||
{{
|
||||
float x = sin( dot(p, (float2)(12.9898, 78.233)) )*43758.5453;
|
||||
return x - floor(x);
|
||||
}}
|
||||
|
||||
****************************/
|
||||
|
||||
|
||||
"""
|
||||
|
||||
__all__ = ['HKernel']
|
22
xlib/avecl/_internal/HTensor.py
Normal file
22
xlib/avecl/_internal/HTensor.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
from typing import List, Union
|
||||
|
||||
from .Tensor import *
|
||||
|
||||
|
||||
class HTensor:
|
||||
"""
|
||||
Helper functions for Tensor
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def all_same_device( tensor_or_list : Union[Tensor, List[Tensor] ] ) -> bool:
|
||||
"""
|
||||
check if all tensors in a list use the same device
|
||||
"""
|
||||
if not isinstance(tensor_or_list, (list,tuple)):
|
||||
tensor_or_list = (tensor_or_list,)
|
||||
|
||||
device = tensor_or_list[0].get_device()
|
||||
return all( device == tensor.get_device() for tensor in tensor_or_list )
|
||||
|
||||
__all__ = ['HTensor']
|
82
xlib/avecl/_internal/HType.py
Normal file
82
xlib/avecl/_internal/HType.py
Normal file
|
@ -0,0 +1,82 @@
|
|||
import operator
|
||||
from typing import Iterable, List
|
||||
import numpy as np
|
||||
|
||||
scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
|
||||
np.float16, np.float32, np.float64, np.bool_]
|
||||
|
||||
np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
|
||||
np.float16, np.float32, np.float64, np.bool_]
|
||||
|
||||
_np_dtype_to_cl = {
|
||||
np.bool_ : 'bool',
|
||||
np.uint8 : 'uchar',
|
||||
np.int8 : 'char',
|
||||
np.uint16 : 'ushort',
|
||||
np.int16 : 'short',
|
||||
np.uint32 : 'uint',
|
||||
np.int32 : 'int',
|
||||
np.uint64 : 'ulong',
|
||||
np.int64 : 'long',
|
||||
np.float16 : 'half',
|
||||
np.float32 : 'float',
|
||||
np.float64 : 'double',
|
||||
}
|
||||
|
||||
_np_dtype_weight = {
|
||||
np.bool_ : 1,
|
||||
np.uint8 : 2,
|
||||
np.int8 : 3,
|
||||
np.uint16 : 4,
|
||||
np.int16 : 5,
|
||||
np.uint32 : 6,
|
||||
np.int32 : 7,
|
||||
np.uint64 : 8,
|
||||
np.int64 : 9,
|
||||
np.float16 : 10,
|
||||
np.float32 : 11,
|
||||
np.float64 : 12,
|
||||
}
|
||||
|
||||
class HType:
|
||||
"""
|
||||
Helper functions for types.
|
||||
"""
|
||||
|
||||
def is_scalar_type(value):
|
||||
return value.__class__ in scalar_types
|
||||
|
||||
def get_np_scalar_types() -> List:
|
||||
return np_scalar_types
|
||||
|
||||
def is_obj_of_np_scalar_type(obj) -> bool:
|
||||
return obj.__class__ in np_scalar_types
|
||||
|
||||
def np_dtype_to_cl(dtype : np.dtype) -> str:
|
||||
return _np_dtype_to_cl[ np.dtype(dtype).type ]
|
||||
|
||||
def get_most_weighted_dtype( dtype_list ):
|
||||
dtype_list = [ np.dtype(dtype) for dtype in dtype_list ]
|
||||
|
||||
dtype_list = [ (_np_dtype_weight[dtype.type], dtype) for dtype in dtype_list ]
|
||||
dtype_list = sorted(dtype_list, key=operator.itemgetter(0), reverse=True)
|
||||
return dtype_list[0][1]
|
||||
|
||||
def hashable_slices(slices):
|
||||
"""
|
||||
Convert list of slice to hashable arg.
|
||||
"""
|
||||
if not isinstance(slices, Iterable):
|
||||
slices = (slices,)
|
||||
normalized_slices = []
|
||||
for x in slices:
|
||||
if isinstance(x, slice):
|
||||
normalized_slices.append( (x.start, x.stop, x.step) )
|
||||
elif x is Ellipsis or x is None:
|
||||
normalized_slices.append(x)
|
||||
else:
|
||||
normalized_slices.append(int(x))
|
||||
return tuple(normalized_slices)
|
||||
|
||||
|
||||
__all__ = ['HType']
|
24
xlib/avecl/_internal/NCore.py
Normal file
24
xlib/avecl/_internal/NCore.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
from .SCacheton import SCacheton
|
||||
from .Tensor import Tensor
|
||||
from .backend import *
|
||||
|
||||
class NCore:
|
||||
"""
|
||||
core functions
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def cleanup():
|
||||
"""
|
||||
try to cleanup all resources consumed by TensorCL.
|
||||
|
||||
can raise Exception
|
||||
"""
|
||||
SCacheton.cleanup()
|
||||
|
||||
if Tensor._object_count != 0:
|
||||
raise Exception(f'Unable to cleanup while {Tensor._object_count} Tensor objects exist.')
|
||||
|
||||
cleanup_devices()
|
||||
|
||||
__all__ = ['NCore']
|
754
xlib/avecl/_internal/NTest.py
Normal file
754
xlib/avecl/_internal/NTest.py
Normal file
|
@ -0,0 +1,754 @@
|
|||
import traceback
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .HType import HType
|
||||
from .NCore import NCore
|
||||
from .backend import get_device, get_default_device, set_default_device
|
||||
from .Tensor import Tensor
|
||||
from . import op
|
||||
from .initializer import InitRandomUniform, InitCoords2DArange
|
||||
from .info import Conv2DInfo
|
||||
|
||||
class NTest():
|
||||
|
||||
def test_all():
|
||||
NCore.cleanup()
|
||||
|
||||
prev_device = get_default_device()
|
||||
|
||||
device = get_device(0)
|
||||
print(f'Using {device.get_description()}')
|
||||
|
||||
set_default_device(device)
|
||||
|
||||
test_funcs = [
|
||||
InitRandomUniform_test,
|
||||
InitCoords2DArange_test,
|
||||
cast_test,
|
||||
transpose_test,
|
||||
pad_test,
|
||||
concat_test,
|
||||
tile_test,
|
||||
stack_test,
|
||||
slice_test,
|
||||
slice_set_test,
|
||||
reduce_test,
|
||||
matmul_test,
|
||||
any_wise_op_test,
|
||||
depthwise_conv2d_test,
|
||||
remap_np_affine_test,
|
||||
remap_test,
|
||||
warp_affine_test,
|
||||
gaussian_blur_test,
|
||||
binary_erode_circle_test,
|
||||
binary_dilate_circle_test,
|
||||
binary_morph_test,
|
||||
]
|
||||
|
||||
for test_func in test_funcs:
|
||||
print(f'{test_func.__name__}()')
|
||||
test_func()
|
||||
device.cleanup()
|
||||
device.print_stat()
|
||||
|
||||
NCore.cleanup()
|
||||
|
||||
set_default_device(prev_device)
|
||||
|
||||
print('Done.')
|
||||
|
||||
def _all_close(x,y, atol=1, btol=1):
|
||||
return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol )
|
||||
|
||||
def cast_test():
|
||||
for in_dtype in HType.get_np_scalar_types():
|
||||
for out_dtype in HType.get_np_scalar_types():
|
||||
shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) )
|
||||
|
||||
print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='')
|
||||
|
||||
val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype)
|
||||
cast_n = val_n.astype(out_dtype)
|
||||
val_t = Tensor.from_value(val_n)
|
||||
cast_t = op.cast(val_t, out_dtype)
|
||||
|
||||
if not _all_close(cast_t.np(), cast_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
def binary_morph_test():
|
||||
for shape_len in range(2,4):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
shape = np.random.randint( 1, 64, size=(shape_len,) )
|
||||
erode_dilate = np.random.randint( -16, 16 )
|
||||
blur = np.random.rand()*16 - 8
|
||||
|
||||
input_n = np.random.randint( 2, size=shape ).astype(dtype)
|
||||
input_t = Tensor.from_value(input_n)
|
||||
|
||||
print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True)
|
||||
|
||||
print('pass')
|
||||
|
||||
def binary_erode_circle_test():
|
||||
for shape_len in range(2,4):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
|
||||
shape = np.random.randint( 1, 64, size=(shape_len,) )
|
||||
radius = np.random.randint( 1, 16 )
|
||||
iterations = np.random.randint( 1, 4 )
|
||||
|
||||
input_n = np.random.randint( 2, size=shape ).astype(dtype)
|
||||
input_t = Tensor.from_value(input_n)
|
||||
|
||||
print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
op.binary_erode_circle(input_t, radius=radius, iterations=iterations)
|
||||
|
||||
print('pass')
|
||||
|
||||
def binary_dilate_circle_test():
|
||||
for shape_len in range(2,4):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
|
||||
shape = np.random.randint( 1, 64, size=(shape_len,) )
|
||||
radius = np.random.randint( 1, 16 )
|
||||
iterations = np.random.randint( 1, 4 )
|
||||
|
||||
input_n = np.random.randint( 2, size=shape ).astype(dtype)
|
||||
input_t = Tensor.from_value(input_n)
|
||||
|
||||
print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
op.binary_dilate_circle(input_t, radius=radius, iterations=iterations)
|
||||
|
||||
print('pass')
|
||||
|
||||
|
||||
def gaussian_blur_test():
|
||||
for shape_len in range(2,5):
|
||||
for dtype in [np.float16, np.float32, np.float64]:
|
||||
|
||||
shape = np.random.randint( 1, 64, size=(shape_len,) )
|
||||
sigma = np.random.rand() * 10
|
||||
print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
|
||||
val_t = Tensor.from_value(val_n)
|
||||
|
||||
op.gaussian_blur(val_t, sigma)
|
||||
|
||||
print('pass')
|
||||
|
||||
def pad_test():
|
||||
for iteration in range(1):
|
||||
for shape_len in range(5,1,-1):
|
||||
for mode in ['constant']:
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
while True:
|
||||
shape = np.random.randint( 1, 8, size=(shape_len,) )
|
||||
|
||||
paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) )
|
||||
|
||||
print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
|
||||
pad_n = np.pad(val_n, paddings, mode=mode)
|
||||
|
||||
val_t = Tensor.from_value(val_n)
|
||||
pad_t = op.pad(val_t, paddings, mode=mode)
|
||||
|
||||
print(f'{pad_n.shape} == {pad_t.shape} ... ', end='')
|
||||
|
||||
if pad_n.shape != pad_t.shape:
|
||||
raise Exception(f'shape is not equal')
|
||||
|
||||
if not _all_close(pad_t.np(), pad_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
break
|
||||
|
||||
def slice_set_test():
|
||||
for iteration in [0,1]:
|
||||
for shape_len in range(5,1,-1):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
while True:
|
||||
shape = np.random.randint( 1, 8, size=(shape_len,) )
|
||||
|
||||
if iteration == 0:
|
||||
slices = [ slice(None,None,None), ] * shape_len
|
||||
axis = np.random.randint(shape_len)
|
||||
shape[axis] = 1
|
||||
slices[axis] = 0
|
||||
else:
|
||||
slices = []
|
||||
for i in range (shape_len):
|
||||
axis_size = shape[i]
|
||||
|
||||
b = np.random.randint(axis_size)
|
||||
e = np.random.randint(axis_size)
|
||||
if b == e:
|
||||
slices.append(b)
|
||||
else:
|
||||
if b < e:
|
||||
s = 1
|
||||
if b == 0:
|
||||
b = None
|
||||
if e == axis_size-1:
|
||||
e = None
|
||||
else:
|
||||
s = -1
|
||||
if b == axis_size-1:
|
||||
b = None
|
||||
if e == 0:
|
||||
e = None
|
||||
slices.append ( slice(b,e,s) )
|
||||
|
||||
if np.random.randint(2) == 0:
|
||||
axis = np.random.randint(shape_len)
|
||||
slices[axis] = Ellipsis
|
||||
|
||||
shape = tuple(shape)
|
||||
slices = tuple(slices)
|
||||
|
||||
print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
|
||||
|
||||
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
|
||||
val_t = Tensor.from_value(val_n)
|
||||
|
||||
sliced_n = val_n[slices]
|
||||
|
||||
v = [0] if sliced_n.ndim > 0 else 0
|
||||
|
||||
val_n[slices] = v
|
||||
val_t[slices] = v
|
||||
|
||||
if not _all_close(val_t.np(), val_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
break
|
||||
|
||||
def depthwise_conv2d_test():
|
||||
def _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=1, DILATION=1, padding='same', dtype=np.float32):
|
||||
N, IC, IH, IW = input_n.shape
|
||||
KI, KH, KW = kernel_n.shape
|
||||
|
||||
ci = Conv2DInfo(IH, IW, KH, KW, STRIDE, DILATION, padding)
|
||||
|
||||
PADL, PADT = ci.PADL, ci.PADT
|
||||
|
||||
OC, OH, OW = IC, ci.OH, ci.OW
|
||||
|
||||
O_IK_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(OH*OW) }
|
||||
K_IO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(KH*KW) }
|
||||
I_KO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(IH*IW) }
|
||||
|
||||
for ow in range(OW):
|
||||
for oh in range(OH):
|
||||
O_idx = oh*OW + ow
|
||||
for kh in range(KH):
|
||||
for kw in range(KW):
|
||||
iw = -PADL + kw*DILATION + ow*STRIDE
|
||||
ih = -PADT + kh*DILATION + oh*STRIDE
|
||||
if (iw >= 0) & (ih >= 0) & (iw < IW) & (ih < IH):
|
||||
|
||||
O_IK_idxs[O_idx][0][0].append (ih)
|
||||
O_IK_idxs[O_idx][0][1].append (iw)
|
||||
O_IK_idxs[O_idx][1][0].append (kh)
|
||||
O_IK_idxs[O_idx][1][1].append (kw)
|
||||
|
||||
K_idx = kh*KW + kw
|
||||
K_IO_idxs[K_idx][0][0].append (ih)
|
||||
K_IO_idxs[K_idx][0][1].append (iw)
|
||||
K_IO_idxs[K_idx][1][0].append (oh)
|
||||
K_IO_idxs[K_idx][1][1].append (ow)
|
||||
|
||||
I_idx = ih*IW + iw
|
||||
I_KO_idxs[I_idx][0][0].append (kh)
|
||||
I_KO_idxs[I_idx][0][1].append (kw)
|
||||
I_KO_idxs[I_idx][1][0].append (oh)
|
||||
I_KO_idxs[I_idx][1][1].append (ow)
|
||||
|
||||
output_shape = (N, OC, OH, OW)
|
||||
output = np.empty( output_shape, dtype)
|
||||
|
||||
for n in range(N):
|
||||
for oc in range(OC):
|
||||
for oh in range(OH):
|
||||
for ow in range(OW):
|
||||
O_idx = oh*OW + ow
|
||||
I_idxs = O_IK_idxs[O_idx][0]
|
||||
K_idxs = O_IK_idxs[O_idx][1]
|
||||
|
||||
v = ( input_n[ n,oc][..., I_idxs[0], I_idxs[1]] *
|
||||
kernel_n [oc][..., K_idxs[0], K_idxs[1]] ).sum()
|
||||
|
||||
output[n,oc,oh,ow] = v
|
||||
return output
|
||||
|
||||
for padding in ['same','valid',2]:
|
||||
for dilation in [1,2]:
|
||||
for stride in [1,2]:
|
||||
for ks in [1,3]:
|
||||
for n in [1,4]:
|
||||
for ic in [1,4]:
|
||||
for ih,iw in zip(*[[4,16]]*2):
|
||||
if padding == 'valid' and iw < ks:
|
||||
continue
|
||||
for dtype in [np.int16, np.float16, np.float32]:
|
||||
input_shape = (n, ic, ih, iw)
|
||||
kernel_shape = (ic, ks, ks)
|
||||
|
||||
print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
input_n = np.random.randint( 64, size=input_shape ).astype(dtype)
|
||||
kernel_n = np.ones(shape=kernel_shape ).astype(dtype)
|
||||
|
||||
input_t = Tensor.from_value(input_n)
|
||||
kernel_t = Tensor.from_value(kernel_n)
|
||||
|
||||
conved_t = op.depthwise_conv2D(input_t, kernel_t, stride=stride, dilation=dilation, padding=padding)
|
||||
conved_n = _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=stride, DILATION=dilation, padding=padding, dtype=dtype)
|
||||
|
||||
if conved_n.shape != conved_t.shape:
|
||||
raise Exception(f'shape is not equal')
|
||||
|
||||
if not all ( np.ndarray.flatten( conved_t.np() == conved_n) ):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
|
||||
|
||||
def warp_affine_test():
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
if dtype == np.bool_:
|
||||
continue
|
||||
H = np.random.randint(8, 64)
|
||||
W = np.random.randint(8, 64)
|
||||
|
||||
print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
|
||||
|
||||
affine_t = Tensor.from_value ( [[1,0,0],
|
||||
[0,1,0]], dtype)
|
||||
|
||||
result_t = op.warp_affine(input_t, affine_t)
|
||||
|
||||
if not _all_close(input_t.np(), result_t.np() ):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
|
||||
def remap_np_affine_test():
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
if dtype == np.bool_:
|
||||
continue
|
||||
H = np.random.randint(8, 64)
|
||||
W = np.random.randint(8, 64)
|
||||
|
||||
print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
|
||||
|
||||
affine_n = np.array ( [[1,0,0],
|
||||
[0,1,0]], dtype)
|
||||
|
||||
result_t = op.remap_np_affine(input_t, affine_n)
|
||||
|
||||
if not _all_close(input_t.np(), result_t.np() ):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
|
||||
def remap_test():
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
if dtype == np.bool_:
|
||||
continue
|
||||
H = np.random.randint(8, 64)
|
||||
W = np.random.randint(8, 64)
|
||||
|
||||
print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
|
||||
|
||||
coords_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) )
|
||||
|
||||
result_t = op.remap(input_t, coords_t)
|
||||
|
||||
if not _all_close(input_t.np(), result_t.np() ):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
def tile_test():
|
||||
for _ in range(3):
|
||||
for shape_len in range(3, 5):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
|
||||
tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1)
|
||||
|
||||
print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
|
||||
tiled_n = np.tile(val_n, tiles)
|
||||
|
||||
val_t = Tensor.from_value(val_n)
|
||||
tiled_t = op.tile(val_t, tiles)
|
||||
|
||||
print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='')
|
||||
|
||||
if tiled_n.shape != tiled_t.shape:
|
||||
raise Exception(f'shape is not equal')
|
||||
|
||||
if not _all_close(tiled_t.np(), tiled_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
def stack_test():
|
||||
for _ in range(3):
|
||||
for shape_len in range(1, 4):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
|
||||
axis = np.random.randint(shape_len+1)
|
||||
stack_count = np.random.randint(4)+1
|
||||
|
||||
print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ]
|
||||
stack_n = np.stack(vals_n, axis)
|
||||
|
||||
vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ]
|
||||
stack_t = op.stack(vals_t, axis)
|
||||
|
||||
print(f'{stack_n.shape} == {stack_t.shape} ... ', end='')
|
||||
|
||||
if stack_n.shape != stack_t.shape:
|
||||
raise Exception('shape is not equal')
|
||||
|
||||
if not _all_close(stack_t.np(), stack_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
def reduce_test():
|
||||
for op_type in ['sum', 'mean', 'min', 'max']:
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
if dtype != np.bool_:
|
||||
for shape_len in range(2, 5):
|
||||
shape = np.random.randint( 8, size=(shape_len,) )+1
|
||||
|
||||
reduction_axes = np.array([*range(shape_len)])
|
||||
np.random.shuffle(reduction_axes)
|
||||
|
||||
# Cut random amount of reduction_axes
|
||||
reduction_axes = tuple(reduction_axes [:np.random.randint(shape_len+1)])
|
||||
if len(reduction_axes) == 0:
|
||||
reduction_axes = None
|
||||
|
||||
keepdims = np.random.randint(2) == 0
|
||||
|
||||
print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='')
|
||||
|
||||
if dtype in [np.float16, np.float32, np.float64]:
|
||||
value_n = np.random.uniform(size=shape).astype(dtype)
|
||||
else:
|
||||
value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype )
|
||||
|
||||
value_t = Tensor.from_value(value_n)
|
||||
|
||||
if op_type == 'sum':
|
||||
reducted_n = value_n.sum(reduction_axes, keepdims=keepdims)
|
||||
reducted_t = value_t.sum(reduction_axes, keepdims=keepdims)
|
||||
elif op_type == 'mean':
|
||||
reducted_n = value_n.mean(reduction_axes, keepdims=keepdims)
|
||||
reducted_t = value_t.mean(reduction_axes, keepdims=keepdims)
|
||||
elif op_type == 'max':
|
||||
reducted_n = value_n.max(reduction_axes, keepdims=keepdims)
|
||||
reducted_t = value_t.max(reduction_axes, keepdims=keepdims)
|
||||
elif op_type == 'min':
|
||||
reducted_n = value_n.min(reduction_axes, keepdims=keepdims)
|
||||
reducted_t = value_t.min(reduction_axes, keepdims=keepdims)
|
||||
|
||||
print(f'{reducted_n.shape} == {reducted_t.shape} ... ')
|
||||
|
||||
if not _all_close(reducted_t.np(), reducted_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
|
||||
def InitRandomUniform_test():
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
for shape_len in range(1, 5):
|
||||
shape = np.random.randint( 8, size=(shape_len,) )+1
|
||||
|
||||
print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
Tensor(shape, dtype, initializer=InitRandomUniform()).np()
|
||||
|
||||
print('pass')
|
||||
|
||||
def InitCoords2DArange_test():
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
for shape_len in range(2, 5):
|
||||
shape = np.random.randint( 1, 60, size=(shape_len,) ).tolist()
|
||||
shape = shape + ([2] if np.random.randint(2) == 0 else [3])
|
||||
h_start = np.random.randint(80)
|
||||
h_stop = h_start + np.random.randint(80)
|
||||
w_start = np.random.randint(80)
|
||||
w_stop = w_start + np.random.randint(80)
|
||||
|
||||
print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np()
|
||||
|
||||
print('pass')
|
||||
|
||||
def concat_test():
|
||||
for shape_len in range(2, 5):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
shape = (np.random.randint( 8, size=(shape_len,) )+1).tolist()
|
||||
axis = np.random.randint(shape_len)
|
||||
count = np.random.randint(4)+1
|
||||
|
||||
shapes = tuple( tuple( dim if i != axis else np.random.randint(8)+1
|
||||
for i,dim in enumerate(shape) )
|
||||
for shape in ([shape] * count) )
|
||||
|
||||
print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ]
|
||||
O_n = np.concatenate(V_n, axis)
|
||||
|
||||
print(f'{O_n.shape} == ', end='')
|
||||
|
||||
V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ]
|
||||
O_t = op.concat(V_t, axis)
|
||||
|
||||
print(f'{O_t.shape} ... ', end='')
|
||||
|
||||
if O_n.shape != O_t.shape:
|
||||
raise Exception('shape is not equal')
|
||||
|
||||
if not all ( np.ndarray.flatten( O_t.np() == O_n ) ):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
def matmul_test():
|
||||
for _ in range(100):
|
||||
for dtype in [np.float32]:
|
||||
BATCH = np.random.randint(8)+1
|
||||
M = np.random.randint(8)+1
|
||||
N = np.random.randint(32768)+1
|
||||
K = np.random.randint(32768)+1
|
||||
|
||||
while K*N > ( 8000000 // BATCH ):
|
||||
K = max(1, K // 2)
|
||||
N = max(1, N // 2)
|
||||
|
||||
if np.random.randint(2) == 0:
|
||||
size = [2,4,8,16][np.random.randint(4)]
|
||||
M = max(1, M // size) * size
|
||||
N = max(1, N // size) * size
|
||||
K = max(1, K // size) * size
|
||||
|
||||
if BATCH == 1:
|
||||
A_shape = (M, K)
|
||||
B_shape = (K, N)
|
||||
else:
|
||||
A_shape = (BATCH, M, K)
|
||||
B_shape = (BATCH, K, N)
|
||||
|
||||
print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='')
|
||||
|
||||
A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype)
|
||||
B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype)
|
||||
|
||||
O_n = np.matmul(A_n, B_n)
|
||||
|
||||
print(f'{O_n.shape} == ', end='')
|
||||
|
||||
A_t = Tensor.from_value(A_n)
|
||||
B_t = Tensor.from_value(B_n)
|
||||
O_t = op.matmul(A_t, B_t)
|
||||
print(f'{O_t.shape} ... ', end='')
|
||||
|
||||
if O_n.shape != O_t.shape:
|
||||
raise Exception('shape is not equal')
|
||||
if not _all_close (O_t.np(), O_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
def slice_test():
|
||||
for iteration in [0,1]:
|
||||
for shape_len in range(5,1,-1):
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
while True:
|
||||
shape = np.random.randint( 1, 8, size=(shape_len,) )
|
||||
|
||||
if iteration == 0:
|
||||
slices = [ slice(None,None,None), ] * shape_len
|
||||
axis = np.random.randint(shape_len)
|
||||
shape[axis] = 1
|
||||
slices[axis] = 0
|
||||
else:
|
||||
slices = []
|
||||
for i in range (shape_len):
|
||||
axis_size = shape[i]
|
||||
b = np.random.randint(axis_size)
|
||||
e = np.random.randint(axis_size)
|
||||
if b == e:
|
||||
slices.append(b)
|
||||
else:
|
||||
if b < e:
|
||||
s = 1
|
||||
if b == 0:
|
||||
b = None
|
||||
if e == axis_size-1:
|
||||
e = None
|
||||
else:
|
||||
s = -1
|
||||
if b == axis_size-1:
|
||||
b = None
|
||||
if e == 0:
|
||||
e = None
|
||||
slices.append ( slice(b,e,s) )
|
||||
|
||||
if np.random.randint(2) == 0:
|
||||
axis = np.random.randint(shape_len)
|
||||
slices[axis] = Ellipsis
|
||||
|
||||
shape = tuple(shape)
|
||||
slices = tuple(slices)
|
||||
|
||||
print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
|
||||
|
||||
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
|
||||
|
||||
sliced_n = val_n[slices]
|
||||
|
||||
print(f'{sliced_n.shape} ... ', end='')
|
||||
|
||||
sliced_t = Tensor.from_value(val_n)[slices]
|
||||
|
||||
print(f'{sliced_t.shape} ... ', end='')
|
||||
|
||||
if 0 in sliced_n.shape:
|
||||
# some cases like 0:1:-1 will produce zero shape and invalid array on numpy
|
||||
# but nn.slice has no such behaviour, thus we have to generate new slice again
|
||||
print('pass (bad case)')
|
||||
continue
|
||||
|
||||
if np.prod(sliced_n.shape) != sliced_t.shape.size:
|
||||
raise Exception(f'shape is not equal')
|
||||
|
||||
if not _all_close(sliced_t.np(), sliced_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
break
|
||||
|
||||
|
||||
def transpose_test():
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
for shape_len in range(2, 5):
|
||||
shape = np.random.randint( 8, size=(shape_len,) )+1
|
||||
axes_order = np.array([*range(shape_len)])
|
||||
np.random.shuffle(axes_order)
|
||||
|
||||
print(f'transpose: {shape} {axes_order} ... ', end='')
|
||||
|
||||
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
|
||||
transposed_n = np.transpose(val_n, axes_order)
|
||||
|
||||
print(f'{transposed_n.shape} ... ', end='')
|
||||
|
||||
val_t = Tensor.from_value(val_n)
|
||||
transposed_t = op.transpose (val_t, axes_order )
|
||||
|
||||
print(f'{transposed_t.shape} ... ', end='')
|
||||
|
||||
if transposed_n.shape != transposed_t.shape:
|
||||
raise Exception('shape is not equal')
|
||||
if not all ( np.ndarray.flatten( transposed_t.np() == transposed_n ) ):
|
||||
raise Exception(f'data is not equal {shape} {axes_order}')
|
||||
|
||||
print('pass')
|
||||
|
||||
|
||||
def any_wise_op_test():
|
||||
for op_type in ['square', '+', '-', '*', '/', 'min', 'max']:
|
||||
for dtype in HType.get_np_scalar_types():
|
||||
if dtype != np.bool_:
|
||||
|
||||
shape_gen = range(1, 5)
|
||||
for shape_len in shape_gen:
|
||||
a_shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
|
||||
|
||||
if np.random.randint(2) == 0:
|
||||
b_shape = tuple(a_shape[np.random.randint(len(a_shape)):])
|
||||
b_shape = (1,) if len(b_shape) == 0 else b_shape
|
||||
else:
|
||||
b_shape = list(a_shape)
|
||||
b_shape[ np.random.randint(len(b_shape)) ] = 1
|
||||
b_shape = tuple(b_shape)
|
||||
|
||||
shapes = [a_shape, b_shape]
|
||||
if np.random.randint(2) == 0:
|
||||
shapes = shapes[::-1]
|
||||
a_shape, b_shape = shapes
|
||||
|
||||
print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='')
|
||||
|
||||
a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
|
||||
b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)
|
||||
a_t = Tensor.from_value(a_n)
|
||||
b_t = Tensor.from_value(b_n)
|
||||
|
||||
if op_type == '+':
|
||||
r_t = a_t + b_t
|
||||
elif op_type == '-':
|
||||
r_t = a_t - b_t
|
||||
elif op_type == '*':
|
||||
r_t = a_t * b_t
|
||||
elif op_type == '/':
|
||||
r_t = a_t / b_t
|
||||
elif op_type == 'min':
|
||||
r_t = op.min_(a_t, b_t)
|
||||
elif op_type == 'max':
|
||||
r_t = op.max_(a_t, b_t)
|
||||
elif op_type == 'square':
|
||||
r_t = op.square(a_t)
|
||||
|
||||
if op_type in ['+','-','*','/']:
|
||||
r_n = eval(f'a_n {op_type} b_n')
|
||||
r_n = r_n.astype(dtype)
|
||||
elif op_type == 'min':
|
||||
r_n = np.minimum(a_n, b_n)
|
||||
elif op_type == 'max':
|
||||
r_n = np.maximum(a_n, b_n)
|
||||
elif op_type == 'square':
|
||||
r_n = np.square(a_n)
|
||||
|
||||
if r_n.shape != r_t.shape:
|
||||
raise Exception(f'shapes are not equal')
|
||||
|
||||
if not _all_close(r_t.np(), r_n):
|
||||
raise Exception(f'data is not equal')
|
||||
|
||||
print('pass')
|
||||
|
||||
|
55
xlib/avecl/_internal/SCacheton.py
Normal file
55
xlib/avecl/_internal/SCacheton.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
class SCacheton:
|
||||
"""
|
||||
Static class for caching classes and vars by hashable arguments
|
||||
"""
|
||||
cachetons = {}
|
||||
cachevars = {}
|
||||
|
||||
@staticmethod
|
||||
def get(cls, *args, **kwargs):
|
||||
"""
|
||||
Get class cached by args/kwargs
|
||||
If it does not exist, creates new with *args,**kwargs
|
||||
All cached data will be freed with cleanup()
|
||||
|
||||
You must not to store Tensor in SCacheton, use per-device cache vars
|
||||
"""
|
||||
cls_multitons = SCacheton.cachetons.get(cls, None)
|
||||
if cls_multitons is None:
|
||||
cls_multitons = SCacheton.cachetons[cls] = {}
|
||||
|
||||
key = (args, tuple(kwargs.items()) )
|
||||
|
||||
data = cls_multitons.get(key, None)
|
||||
if data is None:
|
||||
data = cls_multitons[key] = cls(*args, **kwargs)
|
||||
|
||||
return data
|
||||
|
||||
@staticmethod
|
||||
def set_var(var_name, value):
|
||||
"""
|
||||
Set data cached by var_name
|
||||
All cached data will be freed with cleanup()
|
||||
|
||||
You must not to store Tensor in SCacheton, use per-device cache vars
|
||||
"""
|
||||
SCacheton.cachevars[var_name] = value
|
||||
|
||||
@staticmethod
|
||||
def get_var(var_name):
|
||||
"""
|
||||
Get data cached by var_name
|
||||
All cached data will be freed with cleanup()
|
||||
|
||||
You must not to store Tensor in SCacheton, use per-device cache vars
|
||||
"""
|
||||
return SCacheton.cachevars.get(var_name, None)
|
||||
|
||||
@staticmethod
|
||||
def cleanup():
|
||||
"""
|
||||
Free all cached objects
|
||||
"""
|
||||
SCacheton.cachetons = {}
|
||||
SCacheton.cachevars = {}
|
124
xlib/avecl/_internal/Tensor.py
Normal file
124
xlib/avecl/_internal/Tensor.py
Normal file
|
@ -0,0 +1,124 @@
|
|||
import weakref
|
||||
from typing import Iterable, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .AShape import *
|
||||
from .backend import *
|
||||
from .HType import *
|
||||
|
||||
|
||||
class Tensor:
|
||||
### CONSTRUCTORS
|
||||
|
||||
def __init__(self, shape : Union[AShape, Iterable],
|
||||
dtype : np.dtype,
|
||||
device : Union[None, int, Device, DeviceInfo] = None,
|
||||
initializer : 'Initializer' = None,
|
||||
):
|
||||
Tensor._object_count += 1
|
||||
|
||||
self._shape = shape = AShape(shape)
|
||||
self._dtype = dtype = np.dtype(dtype)
|
||||
self._device = device = get_device(device)
|
||||
if device is None:
|
||||
raise Exception('No device.')
|
||||
|
||||
self._seq_id = Tensor._seq_id = Tensor._seq_id + 1
|
||||
self._buffer = Buffer(device, size=shape.size*dtype.itemsize,
|
||||
on_initialize= lambda initializer=initializer, wself=weakref.ref(self): initializer.initialize_tensor( wself() ) if initializer is not None else None )
|
||||
|
||||
@staticmethod
|
||||
def from_value(value, dtype=None, device:Union[None,int,Device,DeviceInfo]=None) -> 'Tensor':
|
||||
if isinstance(value, Tensor):
|
||||
device = value.get_device()
|
||||
elif not isinstance(value, np.ndarray):
|
||||
if HType.is_scalar_type(value):
|
||||
if dtype is None:
|
||||
raise ValueError('dtype must be specified for single value')
|
||||
value = np.array([value], dtype=dtype)
|
||||
elif isinstance(value, Iterable):
|
||||
if dtype is None:
|
||||
raise ValueError('dtype must be specified for Iterable of values')
|
||||
value = np.array(value, dtype=dtype)
|
||||
else:
|
||||
raise ValueError(f'Unsupported value type {value.__class__}')
|
||||
|
||||
return Tensor(shape=value.shape, dtype=value.dtype, device=device).set(value)
|
||||
|
||||
### PROPERTIES
|
||||
|
||||
@property
|
||||
def shape(self): return self._shape
|
||||
@property
|
||||
def ndim(self): return len(self._shape)
|
||||
@property
|
||||
def dtype(self) -> np.dtype: return self._dtype
|
||||
@property
|
||||
def name(self) -> str: return f'#{self.get_seq_id()}'
|
||||
|
||||
### GETTERS/SETTERS
|
||||
|
||||
def get_buffer(self) -> Buffer: return self._buffer
|
||||
def get_device(self) -> Device: return self._device
|
||||
def get_seq_id(self) -> int: return self._seq_id
|
||||
|
||||
def set(self, value : Union['Tensor',np.ndarray,int,float]) -> 'Tensor':
|
||||
if isinstance(value, Tensor):
|
||||
if self.shape.size != value.shape.size:
|
||||
raise ValueError('Unable to set the data from other tensor: shape.size is not the same.')
|
||||
self.get_buffer().set(value.get_buffer())
|
||||
else:
|
||||
if isinstance(value, np.ndarray):
|
||||
value = value.astype(self.dtype)
|
||||
else:
|
||||
if HType.is_scalar_type(value):
|
||||
value = np.array([value], dtype=self.dtype)
|
||||
elif isinstance(value, Iterable):
|
||||
value = np.array(value, dtype=self.dtype)
|
||||
else:
|
||||
raise ValueError(f'Unsupported value type {value.__class__}')
|
||||
|
||||
self.get_buffer().set(value)
|
||||
return self
|
||||
|
||||
def np(self):
|
||||
"""Returns numpy value of a Tensor"""
|
||||
return self.get_buffer().np(self.shape, self.dtype)
|
||||
|
||||
|
||||
### OPERATORS
|
||||
|
||||
def __add__(self, value) -> 'Tensor': ...
|
||||
def __radd__(self, value) -> 'Tensor': ...
|
||||
def __sub__(self, value) -> 'Tensor': ...
|
||||
def __rsub__(self, value) -> 'Tensor': ...
|
||||
def __mul__(self, value) -> 'Tensor': ...
|
||||
def __rmul__(self, value) -> 'Tensor': ...
|
||||
def __truediv__(self, value) -> 'Tensor': ...
|
||||
def __rtruediv__(self, value) -> 'Tensor': ...
|
||||
def __neg__(self) -> 'Tensor': ...
|
||||
def __getitem__(self, slices) -> 'Tensor': ...
|
||||
def __setitem__(self, slices, value): ...
|
||||
|
||||
def as_shape(self, shape) -> 'Tensor': ...
|
||||
def copy(self) -> 'Tensor': ...
|
||||
def max(self, axes=None, keepdims=False) -> 'Tensor': ...
|
||||
def mean(self, axes=None, keepdims=False) -> 'Tensor': ...
|
||||
def min(self, axes=None, keepdims=False) -> 'Tensor': ...
|
||||
def reshape(self, new_shape) -> 'Tensor': ...
|
||||
def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
|
||||
def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
|
||||
|
||||
@property
|
||||
def T(self): return self.transpose( tuple(range(self.shape.ndim-1,-1,-1)) )
|
||||
|
||||
### INTERNAL METHODS
|
||||
|
||||
def __del__(self):
|
||||
Tensor._object_count -= 1
|
||||
|
||||
_object_count = 0
|
||||
_seq_id = 0
|
||||
|
||||
__all__ = ['Tensor']
|
92
xlib/avecl/_internal/TensorImpl.py
Normal file
92
xlib/avecl/_internal/TensorImpl.py
Normal file
|
@ -0,0 +1,92 @@
|
|||
from .op import *
|
||||
from .AShape import *
|
||||
from .backend import *
|
||||
from .Tensor import Tensor
|
||||
|
||||
def Tensor__str__(self : Tensor): return f"T {self.name} {self.shape} {self.dtype.name}"
|
||||
Tensor.__str__ = Tensor__str__
|
||||
|
||||
def Tensor__repr__(self : Tensor):
|
||||
s = self.__str__() + '\n'
|
||||
s += str(self.np()) + '\n'
|
||||
s += self.__str__()
|
||||
return s
|
||||
Tensor.__repr__ = Tensor__repr__
|
||||
|
||||
def Tensor__add__(self : Tensor, value) -> Tensor:
|
||||
return add(self, value)
|
||||
Tensor.__add__ = Tensor__add__
|
||||
|
||||
def Tensor__radd__(self : Tensor, value) -> Tensor:
|
||||
return add(value, self)
|
||||
Tensor.__radd__ = Tensor__radd__
|
||||
|
||||
def Tensor__sub__(self : Tensor, value) -> Tensor:
|
||||
return sub(self, value)
|
||||
Tensor.__sub__ = Tensor__sub__
|
||||
|
||||
def Tensor__rsub__(self : Tensor, value) -> Tensor:
|
||||
return sub(value, self)
|
||||
Tensor.__rsub__ = Tensor__rsub__
|
||||
|
||||
def Tensor__mul__(self : Tensor, value) -> Tensor:
|
||||
if self == value:
|
||||
return square(self)
|
||||
return mul(self, value)
|
||||
Tensor.__mul__ = Tensor__mul__
|
||||
|
||||
def Tensor__rmul__(self : Tensor, value) -> Tensor:
|
||||
if self == value:
|
||||
return square(self)
|
||||
return mul(value, self)
|
||||
Tensor.__rmul__ = Tensor__rmul__
|
||||
|
||||
def Tensor__truediv__(self : Tensor, value) -> Tensor:
|
||||
return div(self, value)
|
||||
Tensor.__truediv__ = Tensor__truediv__
|
||||
|
||||
def Tensor__rtruediv__(self : Tensor, value) -> Tensor:
|
||||
return div(value, self)
|
||||
Tensor.__rtruediv__ = Tensor__rtruediv__
|
||||
|
||||
def Tensor___neg__(self : Tensor):
|
||||
raise NotImplementedError()
|
||||
Tensor.___neg__ = Tensor___neg__
|
||||
|
||||
Tensor.__getitem__ = slice_
|
||||
Tensor.__setitem__ = slice_set
|
||||
|
||||
def Tensor_as_shape(self : Tensor, shape) -> Tensor:
|
||||
return TensorRef(self, shape)
|
||||
Tensor.as_shape = Tensor_as_shape
|
||||
|
||||
def Tensor_copy(self : Tensor) -> Tensor:
|
||||
return Tensor.from_value(self)
|
||||
Tensor.copy = Tensor_copy
|
||||
|
||||
Tensor.max = reduce_max
|
||||
Tensor.mean = reduce_mean
|
||||
Tensor.min = reduce_min
|
||||
Tensor.reshape = reshape
|
||||
Tensor.sum = reduce_sum
|
||||
Tensor.transpose = transpose
|
||||
|
||||
class TensorRef(Tensor):
|
||||
"""
|
||||
TensorRef used to interpret existing Tensor with different shape.
|
||||
use Tensor.as_ref() method
|
||||
"""
|
||||
|
||||
def __init__(self, t : Tensor, shape):
|
||||
shape = AShape(shape)
|
||||
if t.shape.size != shape.size:
|
||||
raise ValueError(f'Cannot interpet shape {t.shape} as ref shape {shape}')
|
||||
super().__init__(shape, t.dtype, device=t.get_device())
|
||||
self._t = t
|
||||
|
||||
# Forward methods to original tensor
|
||||
def get_seq_id(self) -> int: return self._t.get_seq_id()
|
||||
def get_buffer(self) -> Buffer: return self._t.get_buffer()
|
||||
def get_device(self) -> Device: return self._t.get_device()
|
||||
|
||||
__all__ = []
|
109
xlib/avecl/_internal/backend/Buffer.py
Normal file
109
xlib/avecl/_internal/backend/Buffer.py
Normal file
|
@ -0,0 +1,109 @@
|
|||
from typing import Iterable, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from . import OpenCL as CL
|
||||
|
||||
class Buffer:
|
||||
__slots__ = ['_device','_cl_mem','_size','_on_initialize']
|
||||
|
||||
def __init__(self, device : 'Device', size : int, on_initialize = None):
|
||||
"""
|
||||
represents physical buffer associated with physical device
|
||||
|
||||
device Device
|
||||
|
||||
size int
|
||||
"""
|
||||
|
||||
Buffer._object_count += 1
|
||||
self._device = device
|
||||
self._size = size
|
||||
self._cl_mem = None
|
||||
self._on_initialize = on_initialize
|
||||
|
||||
def __del__(self):
|
||||
#print('Buffer.__del__')
|
||||
Buffer._object_count -= 1
|
||||
self.free_cl_mem()
|
||||
|
||||
def get_device(self) -> 'Device': return self._device
|
||||
def get_size(self) -> int: return self._size
|
||||
|
||||
def has_cl_mem(self) -> bool: return self._cl_mem is not None
|
||||
def get_cl_mem(self) -> CL.cl_mem:
|
||||
if self._cl_mem is None:
|
||||
self._cl_mem = self._device._cl_mem_pool_alloc(self._size)
|
||||
if self._on_initialize is not None:
|
||||
self._on_initialize()
|
||||
|
||||
return self._cl_mem
|
||||
|
||||
def free_cl_mem(self):
|
||||
if self._cl_mem is not None:
|
||||
self._device._cl_mem_pool_free(self._cl_mem)
|
||||
self._cl_mem = None
|
||||
|
||||
def set(self, value : Union['Buffer', np.ndarray]):
|
||||
"""
|
||||
Parameters
|
||||
|
||||
value Buffer copy data from other Buffer.
|
||||
|
||||
np.ndarray copies values from ndarray
|
||||
to Buffer's memory
|
||||
|
||||
"""
|
||||
if isinstance(value, Buffer):
|
||||
if self != value:
|
||||
if self._size != value._size:
|
||||
raise Exception(f'Unable to copy from Buffer with {value._size} size to buffer with {self._size} size.')
|
||||
|
||||
if self._device == value._device:
|
||||
CL.clEnqueueCopyBuffer(self._device._get_ctx_q(), value.get_cl_mem(), self.get_cl_mem(), 0, 0, self._size, 0, None, None)
|
||||
else:
|
||||
# Transfer between devices will cause low performance
|
||||
raise NotImplementedError()
|
||||
else:
|
||||
if not isinstance(value, np.ndarray):
|
||||
raise ValueError (f'Invalid type {value.__class__}. Must be np.ndarray.')
|
||||
|
||||
if value.nbytes != self._size:
|
||||
raise ValueError(f'Value size {value.nbytes} does not match Buffer size {self._size}.')
|
||||
|
||||
if not value.flags.contiguous:
|
||||
value = value.reshape(-1)
|
||||
if not value.flags.contiguous:
|
||||
raise ValueError ("Unable to write from non-contiguous np array.")
|
||||
|
||||
ev = CL.cl_event()
|
||||
|
||||
clr = CL.clEnqueueWriteBuffer(self._device._get_ctx_q(), self.get_cl_mem(), False, 0, value.nbytes, value.ctypes.data_as(CL.c_void_p), 0, None, ev)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clEnqueueWriteBuffer error: {clr}')
|
||||
|
||||
CL.clWaitForEvents(1, ( CL.cl_event * 1 )(ev) )
|
||||
CL.clReleaseEvent(ev)
|
||||
|
||||
def np(self, shape : Iterable, dtype : np.dtype):
|
||||
"""
|
||||
Returns data of buffer as np.ndarray with specified shape and dtype
|
||||
"""
|
||||
out_np_value = np.empty (shape, dtype)
|
||||
|
||||
if out_np_value.nbytes != self._size:
|
||||
raise ValueError(f'Unable to represent Buffer with size {self._size} as shape {shape} with dtype {dtype}')
|
||||
|
||||
clr = CL.clEnqueueReadBuffer(self._device._get_ctx_q(), self.get_cl_mem(), True, 0, self._size, out_np_value.ctypes.data, 0, None, None)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clEnqueueReadBuffer error: {clr}')
|
||||
|
||||
return out_np_value
|
||||
|
||||
def __str__(self):
|
||||
return f'Buffer [{self._size} bytes][{f"{self._cl_mem.value}" if self._cl_mem is not None else "unallocated"}] on {str(self._device)}'
|
||||
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
_object_count = 0
|
522
xlib/avecl/_internal/backend/Device.py
Normal file
522
xlib/avecl/_internal/backend/Device.py
Normal file
|
@ -0,0 +1,522 @@
|
|||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from . import OpenCL as CL
|
||||
from .Buffer import Buffer
|
||||
from .DeviceInfo import DeviceInfo
|
||||
from .Kernel import Kernel
|
||||
|
||||
_np_dtype_to_cl = { np.uint8: CL.cl_uchar,
|
||||
np.int8: CL.cl_char,
|
||||
np.uint16: CL.cl_ushort,
|
||||
np.int16: CL.cl_short,
|
||||
np.uint32: CL.cl_uint,
|
||||
np.int32: CL.cl_int,
|
||||
np.uint64: CL.cl_ulong,
|
||||
np.int64: CL.cl_long,
|
||||
np.float16: CL.cl_half,
|
||||
np.float32: CL.cl_float,
|
||||
np.float64: CL.cl_double }
|
||||
|
||||
_opencl_device_ids = None
|
||||
_default_device = None
|
||||
_devices = {}
|
||||
|
||||
class Device:
|
||||
"""
|
||||
Represents physical TensorCL device
|
||||
"""
|
||||
|
||||
def __init__(self, device_info : DeviceInfo, **kwargs):
|
||||
if kwargs.get('_check', None) is None:
|
||||
raise Exception('You should not to create Device from constructor. Use get_device()')
|
||||
|
||||
self._cached_data = {} # cached data (per device) by key
|
||||
self._pooled_buffers = {} # Pool of cached device buffers.
|
||||
self._compiled_kernels = {} # compiled kernels by key
|
||||
self._ctx_q = None # CL command queue
|
||||
self._ctx = None # CL context
|
||||
|
||||
self._total_memory_allocated = 0
|
||||
self._total_buffers_allocated = 0
|
||||
self._total_memory_pooled = 0
|
||||
self._total_buffers_pooled = 0
|
||||
|
||||
self._device_info = device_info
|
||||
self._device_id = _get_opencl_device_ids()[device_info.get_index()]
|
||||
|
||||
def __del__(self):
|
||||
self.cleanup()
|
||||
|
||||
def __eq__(self, other):
|
||||
if self is not None and other is not None and isinstance(self, Device) and isinstance(other, Device):
|
||||
return self._device_id.value == other._device_id.value
|
||||
return False
|
||||
|
||||
def __hash__(self):
|
||||
return self._device_id.value
|
||||
|
||||
def _get_ctx(self) -> CL.cl_context:
|
||||
# Create OpenCL context on demand
|
||||
if self._ctx is None:
|
||||
clr = CL.CLRESULT()
|
||||
ctx = CL.clCreateContext( None, 1, (CL.cl_device_id * 1)( self._device_id ), None, None, clr)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception('Unable to create OpenCL context.')
|
||||
self._ctx = ctx
|
||||
return self._ctx
|
||||
|
||||
def _get_ctx_q(self) -> CL.cl_command_queue:
|
||||
# Create CommandQueue on demand
|
||||
if self._ctx_q is None:
|
||||
clr = CL.CLRESULT()
|
||||
ctx_q = CL.clCreateCommandQueue(self._get_ctx(), self._device_id, CL.cl_command_queue_properties(0), clr)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception('Unable to create OpenCL CommandQueue.')
|
||||
self._ctx_q = ctx_q
|
||||
return self._ctx_q
|
||||
|
||||
def get_description(self) -> str:
|
||||
return f"{self._device_info.get_name()} [{(self._device_info.get_total_memory() / 1024**3) :.3}Gb]"
|
||||
|
||||
def __str__(self):
|
||||
return self.get_description()
|
||||
|
||||
def __repr__(self):
|
||||
return f'{self.__class__.__name__} object: ' + self.__str__()
|
||||
|
||||
def set_cached_data(self, key, value):
|
||||
"""
|
||||
|
||||
All cached data will be freed with cleanup()
|
||||
"""
|
||||
self._cached_data[key] = value
|
||||
|
||||
def get_cached_data(self, key):
|
||||
return self._cached_data.get(key, None)
|
||||
|
||||
def get_total_allocated_memory(self):
|
||||
return self._total_memory_allocated
|
||||
|
||||
def get_max_malloc_size(self) -> int:
|
||||
size = CL.cl_ulong()
|
||||
clr = CL.clGetDeviceInfo(self._device_id, CL.CL_DEVICE_MAX_MEM_ALLOC_SIZE, CL.sizeof(size), CL.byref(size), None)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clGetDeviceInfo error: {clr}')
|
||||
return size.value
|
||||
|
||||
def _compile_kernel(self, key, kernel_text) -> CL.cl_kernel:
|
||||
"""
|
||||
compile or get cached kernel
|
||||
"""
|
||||
|
||||
compiled_krn, prog = self._compiled_kernels.get(key, (None, None) )
|
||||
|
||||
if compiled_krn is None:
|
||||
clr = CL.CLRESULT()
|
||||
prog = CL.clCreateProgramWithSource(self._get_ctx(), 1, CL.c_char_p(kernel_text.encode()), None, clr )
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clCreateProgramWithSource error {clr}, with kernel_text:\n{kernel_text}')
|
||||
|
||||
clr = CL.clBuildProgram(prog, 1, (CL.cl_device_id*1)(self._device_id), CL.c_char_p('-cl-std=CL1.2 -cl-single-precision-constant'.encode()), None, None )
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
build_log_size = CL.c_size_t()
|
||||
clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, 0, None, CL.byref(build_log_size) )
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clGetProgramBuildInfo,error: {clr}')
|
||||
|
||||
build_log = CL.create_string_buffer(build_log_size.value)
|
||||
clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, build_log_size.value, build_log, None )
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clGetProgramBuildInfo error: {clr}')
|
||||
|
||||
build_log = str(build_log.value, 'utf-8')
|
||||
raise Exception(f'clBuildProgram error:\n\n{build_log}')
|
||||
|
||||
num_kernels = CL.cl_uint()
|
||||
clr = CL.clCreateKernelsInProgram(prog, 0, None, CL.byref(num_kernels))
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clCreateKernelsInProgram error: {clr}')
|
||||
|
||||
if num_kernels.value != 1:
|
||||
raise Exception(f'Kernel must contain only one __kernel:\n\n{kernel_text}')
|
||||
|
||||
kernels = (CL.cl_kernel * num_kernels.value)()
|
||||
clr = CL.clCreateKernelsInProgram(prog, num_kernels.value, kernels, None)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clCreateKernelsInProgram error: {clr}')
|
||||
|
||||
compiled_krn = kernels[0]
|
||||
self._compiled_kernels[key] = (compiled_krn, prog)
|
||||
|
||||
return compiled_krn
|
||||
|
||||
def _cl_mem_alloc(self, size) -> CL.cl_mem:
|
||||
clr = CL.CLRESULT()
|
||||
mem = CL.clCreateBuffer(self._get_ctx(), CL.CL_MEM_READ_WRITE, size, None, clr)
|
||||
if clr == CL.CLERROR.SUCCESS:
|
||||
# Fill one byte to check memory availability
|
||||
ev = CL.cl_event()
|
||||
clr = CL.clEnqueueFillBuffer (self._get_ctx_q(), mem, (CL.c_char * 1)(), 1, 0, 1, 0, None, ev )
|
||||
if clr == CL.CLERROR.SUCCESS:
|
||||
CL.clReleaseEvent(ev)
|
||||
self._total_memory_allocated += size
|
||||
self._total_buffers_allocated += 1
|
||||
return mem
|
||||
return None
|
||||
|
||||
def _cl_mem_free(self, mem : CL.cl_mem):
|
||||
size = CL.c_size_t()
|
||||
clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clGetMemObjectInfo error: {clr}')
|
||||
size = size.value
|
||||
self._total_memory_allocated -= size
|
||||
self._total_buffers_allocated -= 1
|
||||
clr = CL.clReleaseMemObject(mem)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clReleaseMemObject error: {clr}')
|
||||
|
||||
def _cl_mem_pool_alloc(self, size):
|
||||
"""
|
||||
allocate or get cl_mem from pool
|
||||
"""
|
||||
pool = self._pooled_buffers
|
||||
|
||||
# First try to get pooled buffer
|
||||
ar = pool.get(size, None)
|
||||
if ar is not None and len(ar) != 0:
|
||||
mem = ar.pop(-1)
|
||||
self._total_memory_pooled -= size
|
||||
self._total_buffers_pooled -= 1
|
||||
else:
|
||||
# No pooled buffer, try to allocate new
|
||||
while True:
|
||||
mem = self._cl_mem_alloc(size)
|
||||
if mem is None:
|
||||
# MemoryError. Finding largest pooled buffer to release
|
||||
buf_to_release = None
|
||||
for size_key in sorted(list(pool.keys()), reverse=True):
|
||||
ar = pool[size_key]
|
||||
if len(ar) != 0:
|
||||
buf_to_release = ar.pop(-1)
|
||||
break
|
||||
|
||||
if buf_to_release is not None:
|
||||
# Release pooled buffer and try to allocate again
|
||||
self._cl_mem_free(buf_to_release)
|
||||
continue
|
||||
|
||||
raise Exception(f'Unable to allocate {size // 1024**2}Mb on {str(self)}')
|
||||
|
||||
|
||||
break
|
||||
|
||||
return mem
|
||||
|
||||
def _cl_mem_pool_free(self, mem : CL.cl_mem):
|
||||
"""
|
||||
Put cl_mem to pool for reuse in future.
|
||||
"""
|
||||
size = CL.c_size_t()
|
||||
clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clGetMemObjectInfo error: {clr}')
|
||||
size = size.value
|
||||
|
||||
pool = self._pooled_buffers
|
||||
ar = pool.get(size, None)
|
||||
if ar is None:
|
||||
ar = pool[size] = []
|
||||
ar.append(mem)
|
||||
|
||||
self._total_memory_pooled += size
|
||||
self._total_buffers_pooled += 1
|
||||
|
||||
def print_stat(self):
|
||||
s = f'''
|
||||
Total memory allocated: {self._total_memory_allocated}
|
||||
Total buffers allocated: {self._total_buffers_allocated}
|
||||
Total memory pooled: {self._total_memory_pooled}
|
||||
Total buffers pooled: {self._total_buffers_pooled}
|
||||
N of compiled kernels: {len(self._compiled_kernels)}
|
||||
N of cacheddata: {len(self._cached_data)}
|
||||
'''
|
||||
print(s)
|
||||
|
||||
def run_kernel(self, kernel : Kernel, *args, global_shape=None, local_shape=None, global_shape_offsets=None, wait=False):
|
||||
"""
|
||||
Run kernel on Device
|
||||
|
||||
Arguments
|
||||
|
||||
*args arguments will be passed to OpenCL kernel
|
||||
allowed types:
|
||||
|
||||
Buffer
|
||||
np single value
|
||||
|
||||
global_shape(None) tuple of ints, up to 3 dims
|
||||
amount of parallel kernel executions.
|
||||
in OpenCL kernel,
|
||||
id can be obtained via get_global_id(dim)
|
||||
|
||||
local_shape(None) tuple of ints, up to 3 dims
|
||||
specifies local groups of every dim of global_shape.
|
||||
in OpenCL kernel,
|
||||
id can be obtained via get_local_id(dim)
|
||||
|
||||
global_shape_offsets(None) tuple of ints
|
||||
offsets for global_shape
|
||||
|
||||
wait(False) wait execution to complete
|
||||
"""
|
||||
ckernel = self._compile_kernel(kernel, kernel.get_kernel_text())
|
||||
|
||||
if global_shape is None:
|
||||
global_shape = kernel.get_global_shape()
|
||||
if global_shape is None:
|
||||
raise ValueError('global_shape must be defined.')
|
||||
|
||||
work_dim = len(global_shape)
|
||||
global_shape_ar = (CL.c_size_t*work_dim)()
|
||||
for i,v in enumerate(global_shape):
|
||||
global_shape_ar[i] = v
|
||||
|
||||
local_shape_ar = None
|
||||
if local_shape is None:
|
||||
local_shape = kernel.get_local_shape()
|
||||
if local_shape is not None:
|
||||
if len(local_shape) != work_dim:
|
||||
raise ValueError('len of local_shape must match global_shape')
|
||||
|
||||
local_shape_ar = (CL.c_size_t*work_dim)()
|
||||
for i,v in enumerate(local_shape):
|
||||
local_shape_ar[i] = v
|
||||
|
||||
|
||||
global_shape_offsets_ar = None
|
||||
if global_shape_offsets is not None:
|
||||
if len(global_shape_offsets) != work_dim:
|
||||
raise ValueError('len of global_shape_offsets must match global_shape')
|
||||
|
||||
global_shape_offsets_ar = (CL.c_size_t*work_dim)()
|
||||
for i,v in enumerate(local_shape):
|
||||
global_shape_offsets_ar[i] = v
|
||||
|
||||
for i, arg in enumerate(args):
|
||||
|
||||
if isinstance(arg, Buffer):
|
||||
arg = arg.get_cl_mem()
|
||||
else:
|
||||
cl_type = _np_dtype_to_cl.get(arg.__class__, None)
|
||||
if cl_type is None:
|
||||
raise ValueError(f'Cannot convert type {arg.__class__} to OpenCL type.')
|
||||
arg = cl_type(arg)
|
||||
|
||||
clr = CL.clSetKernelArg(ckernel, i, CL.sizeof(arg), CL.byref(arg))
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clSetKernelArg error: {clr}')
|
||||
|
||||
ev = CL.cl_event() if wait else None
|
||||
|
||||
clr = CL.clEnqueueNDRangeKernel(self._get_ctx_q(), ckernel, work_dim, global_shape_offsets_ar, global_shape_ar, local_shape_ar, 0, None, ev)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clEnqueueNDRangeKernel error: {clr}')
|
||||
|
||||
if wait:
|
||||
CL.clWaitForEvents(1, (CL.cl_event*1)(ev) )
|
||||
CL.clReleaseEvent(ev)
|
||||
|
||||
def wait(self):
|
||||
"""
|
||||
Wait to finish all queued operations on this Device
|
||||
"""
|
||||
clr = CL.clFinish(self._get_ctx_q())
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clFinish error: {clr}')
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
Frees all resources from this Device.
|
||||
"""
|
||||
self._cached_data = {}
|
||||
|
||||
pool = self._pooled_buffers
|
||||
for size_key in pool.keys():
|
||||
for mem in pool[size_key]:
|
||||
self._cl_mem_free(mem)
|
||||
self._pooled_buffers = {}
|
||||
self._total_memory_pooled = 0
|
||||
self._total_buffers_pooled = 0
|
||||
|
||||
if self._total_memory_allocated != 0:
|
||||
raise Exception('Unable to cleanup CLDevice, while not all Buffers are deallocated.')
|
||||
|
||||
for kernel, prog in self._compiled_kernels.values():
|
||||
clr = CL.clReleaseKernel(kernel)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clReleaseKernel error: {clr}')
|
||||
|
||||
clr = CL.clReleaseProgram(prog)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clReleaseProgram error: {clr}')
|
||||
self._compiled_kernels = {}
|
||||
|
||||
if self._ctx_q is not None:
|
||||
clr = CL.clReleaseCommandQueue(self._ctx_q)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clReleaseCommandQueue error: {clr}')
|
||||
self._ctx_q = None
|
||||
|
||||
if self._ctx is not None:
|
||||
clr = CL.clReleaseContext(self._ctx)
|
||||
if clr != CL.CLERROR.SUCCESS:
|
||||
raise Exception(f'clReleaseContext error: {clr}')
|
||||
self._ctx = None
|
||||
|
||||
def _get_opencl_device_ids() -> List[CL.cl_device_id]:
|
||||
global _opencl_device_ids
|
||||
if _opencl_device_ids is None:
|
||||
_opencl_device_ids = []
|
||||
device_types = CL.CL_DEVICE_TYPE_CPU | CL.CL_DEVICE_TYPE_ACCELERATOR | CL.CL_DEVICE_TYPE_GPU
|
||||
|
||||
while True:
|
||||
num_platforms = CL.cl_uint()
|
||||
if CL.clGetPlatformIDs(0, None, num_platforms) != CL.CLERROR.SUCCESS or \
|
||||
num_platforms.value == 0:
|
||||
break
|
||||
|
||||
platforms = (CL.cl_platform_id * num_platforms.value) ()
|
||||
if CL.clGetPlatformIDs(num_platforms.value, platforms, None) != CL.CLERROR.SUCCESS:
|
||||
break
|
||||
|
||||
for i_platform in range(num_platforms.value):
|
||||
platform = platforms[i_platform]
|
||||
num_devices = CL.cl_uint(0)
|
||||
if CL.clGetDeviceIDs(platform, device_types, 0, None, num_devices) != CL.CLERROR.SUCCESS or \
|
||||
num_devices.value == 0:
|
||||
continue
|
||||
|
||||
device_ids = (CL.cl_device_id * num_devices.value)()
|
||||
if CL.clGetDeviceIDs(platform, device_types, num_devices.value, device_ids, None) != CL.CLERROR.SUCCESS:
|
||||
continue
|
||||
|
||||
for i in range(num_devices.value):
|
||||
device_id = device_ids[i]
|
||||
# Check OpenCL version.
|
||||
if device_id is not None:
|
||||
device_version_size = CL.c_size_t()
|
||||
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, 0, None, device_version_size) == CL.CLERROR.SUCCESS:
|
||||
device_version = CL.create_string_buffer(device_version_size.value)
|
||||
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, device_version_size.value, device_version, None) == CL.CLERROR.SUCCESS:
|
||||
device_version = str(device_version.value, 'ascii')
|
||||
|
||||
major, minor = device_version.split(' ')[1].split('.')
|
||||
opencl_version = int(major)*10+int(minor)
|
||||
if opencl_version >= 12:
|
||||
_opencl_device_ids.append(device_id)
|
||||
break
|
||||
return _opencl_device_ids
|
||||
|
||||
def get_available_devices_info() -> List[DeviceInfo]:
|
||||
"""
|
||||
returns a list of available picklable DeviceInfo's
|
||||
"""
|
||||
devices = []
|
||||
for device_index, device_id in enumerate(_get_opencl_device_ids()):
|
||||
device_name = 'undefined'
|
||||
device_total_memory = 0
|
||||
|
||||
name_size = CL.c_size_t()
|
||||
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, 0, None, name_size) == CL.CLERROR.SUCCESS:
|
||||
name_value = CL.create_string_buffer(name_size.value)
|
||||
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, name_size.value, name_value, None) == CL.CLERROR.SUCCESS:
|
||||
device_name = str(name_value.value, 'ascii')
|
||||
|
||||
global_mem_size = CL.cl_ulong()
|
||||
if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_GLOBAL_MEM_SIZE, CL.sizeof(global_mem_size), CL.byref(global_mem_size), None) == CL.CLERROR.SUCCESS:
|
||||
device_total_memory = global_mem_size.value
|
||||
|
||||
vendor_id = CL.cl_uint()
|
||||
CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VENDOR_ID, CL.sizeof(vendor_id), CL.byref(vendor_id), None)
|
||||
vendor_id = vendor_id.value
|
||||
|
||||
max_compute_units = CL.cl_uint()
|
||||
CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_MAX_COMPUTE_UNITS, CL.sizeof(max_compute_units), CL.byref(max_compute_units), None)
|
||||
max_compute_units = max_compute_units.value
|
||||
|
||||
performance_level = max_compute_units
|
||||
|
||||
if vendor_id == 0x8086: # Intel device
|
||||
performance_level -= 1000
|
||||
|
||||
devices.append( DeviceInfo(index=device_index, name=device_name, total_memory=device_total_memory, performance_level=performance_level ) )
|
||||
|
||||
return devices
|
||||
|
||||
def get_default_device() -> Union[Device, None]:
|
||||
global _default_device
|
||||
if _default_device is None:
|
||||
_default_device = get_device(0)
|
||||
return _default_device
|
||||
|
||||
def set_default_device(device : Device):
|
||||
if not isinstance(device, Device):
|
||||
raise ValueError('device must be an instance of Device')
|
||||
|
||||
global _default_device
|
||||
_default_device = device
|
||||
|
||||
def get_device(arg : Union[None, int, Device, DeviceInfo]) -> Union[Device, None]:
|
||||
"""
|
||||
get physical TensorCL device.
|
||||
|
||||
arg None - get best device
|
||||
int - by index
|
||||
DeviceInfo - by device info
|
||||
Device - returns the same
|
||||
"""
|
||||
global _devices
|
||||
|
||||
if arg is None:
|
||||
return get_best_device()
|
||||
elif isinstance(arg, int):
|
||||
devices_info = get_available_devices_info()
|
||||
if arg < len(devices_info):
|
||||
arg = devices_info[arg]
|
||||
else:
|
||||
return None
|
||||
elif isinstance(arg, Device):
|
||||
return arg
|
||||
elif not isinstance(arg, DeviceInfo):
|
||||
raise ValueError(f'Unknown type of arg {arg.__class__}')
|
||||
|
||||
device = _devices.get(arg, None)
|
||||
if device is None:
|
||||
device = _devices[arg] = Device(arg, _check=1)
|
||||
|
||||
return device
|
||||
|
||||
def get_best_device() -> Union[Device, None]:
|
||||
"""
|
||||
returns best device from available.
|
||||
"""
|
||||
perf_level = -999999
|
||||
result = None
|
||||
for device_info in get_available_devices_info():
|
||||
dev_perf_level = device_info.get_performance_level()
|
||||
if perf_level < dev_perf_level:
|
||||
perf_level = dev_perf_level
|
||||
result = device_info
|
||||
if result is not None:
|
||||
result = get_device(result)
|
||||
return result
|
||||
|
||||
def cleanup_devices():
|
||||
global _devices
|
||||
|
||||
for device in list(_devices.values()):
|
||||
device.cleanup()
|
||||
_devices = {}
|
44
xlib/avecl/_internal/backend/DeviceInfo.py
Normal file
44
xlib/avecl/_internal/backend/DeviceInfo.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
class DeviceInfo:
|
||||
"""
|
||||
Represents picklable OpenCL device info
|
||||
"""
|
||||
|
||||
def __init__(self, index : int = None, name : str = None, total_memory : int = None, performance_level : int = None):
|
||||
self._index = index
|
||||
self._name = name
|
||||
self._total_memory = total_memory
|
||||
self._performance_level = performance_level
|
||||
|
||||
def __getstate__(self):
|
||||
return self.__dict__.copy()
|
||||
|
||||
def __setstate__(self, d):
|
||||
self.__init__()
|
||||
self.__dict__.update(d)
|
||||
|
||||
def get_index(self) -> int:
|
||||
return self._index
|
||||
|
||||
def get_name(self) -> str:
|
||||
return self._name
|
||||
|
||||
def get_total_memory(self) -> int:
|
||||
return self._total_memory
|
||||
|
||||
def get_performance_level(self) -> int:
|
||||
return self._performance_level
|
||||
|
||||
def __eq__(self, other):
|
||||
if self is not None and other is not None and isinstance(self, DeviceInfo) and isinstance(other, DeviceInfo):
|
||||
return self._index == other._index
|
||||
return False
|
||||
|
||||
def __hash__(self):
|
||||
return self._index
|
||||
|
||||
def __str__(self):
|
||||
return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]"
|
||||
|
||||
def __repr__(self):
|
||||
return f'{self.__class__.__name__} object: ' + self.__str__()
|
||||
|
26
xlib/avecl/_internal/backend/Kernel.py
Normal file
26
xlib/avecl/_internal/backend/Kernel.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
class Kernel:
|
||||
"""
|
||||
TensorCL kernel.
|
||||
|
||||
It does not allocate any resources, thus can be used as static variable within class.
|
||||
|
||||
arguments
|
||||
|
||||
kernel_text OpenCL text of kernel. Must contain only one __kernel
|
||||
|
||||
global_shape default global_shape for .run()
|
||||
|
||||
local_shape default local_shape for .run()
|
||||
"""
|
||||
def __init__(self, kernel_text, global_shape=None, local_shape=None):
|
||||
self._kernel_text = kernel_text
|
||||
self._global_shape = global_shape
|
||||
self._local_shape = local_shape
|
||||
|
||||
def get_kernel_text(self) -> str: return self._kernel_text
|
||||
def get_global_shape(self): return self._global_shape
|
||||
def get_local_shape(self): return self._local_shape
|
||||
|
||||
def __str__(self): return f'Kernel: \n{self._kernel_text}'
|
||||
def __repr__(self): return self.__str__()
|
||||
|
278
xlib/avecl/_internal/backend/OpenCL/OpenCL.py
Normal file
278
xlib/avecl/_internal/backend/OpenCL/OpenCL.py
Normal file
|
@ -0,0 +1,278 @@
|
|||
"""
|
||||
Minimal OpenCL 1.2 low level ctypes API.
|
||||
"""
|
||||
import ctypes
|
||||
from ctypes import POINTER, create_string_buffer, sizeof, c_char_p, c_char, c_size_t, c_void_p, byref
|
||||
from ctypes.util import find_library
|
||||
from enum import IntEnum
|
||||
|
||||
dlls_by_name = {}
|
||||
def dll_import(dll_name):
|
||||
dll = dlls_by_name.get(dll_name, None)
|
||||
if dll is None:
|
||||
try:
|
||||
dll = ctypes.cdll.LoadLibrary(find_library(dll_name))
|
||||
except:
|
||||
pass
|
||||
if dll is None:
|
||||
raise RuntimeError(f'Unable to load {dll_name} library.')
|
||||
dlls_by_name[dll_name] = dll
|
||||
|
||||
def decorator(func):
|
||||
dll_func = getattr(dll, func.__name__)
|
||||
anno = list(func.__annotations__.values())
|
||||
dll_func.argtypes = anno[:-1]
|
||||
dll_func.restype = anno[-1]
|
||||
def wrapper(*args):
|
||||
return dll_func(*args)
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
class cl_char(ctypes.c_int8): pass
|
||||
class cl_uchar(ctypes.c_uint8): pass
|
||||
class cl_short(ctypes.c_int16): pass
|
||||
class cl_ushort(ctypes.c_uint16): pass
|
||||
class cl_int(ctypes.c_int32): pass
|
||||
class cl_uint(ctypes.c_uint32): pass
|
||||
class cl_long(ctypes.c_int64): pass
|
||||
class cl_ulong(ctypes.c_uint64): pass
|
||||
class cl_half(ctypes.c_uint16): pass
|
||||
class cl_float(ctypes.c_float): pass
|
||||
class cl_double(ctypes.c_double): pass
|
||||
class cl_bool(cl_uint): pass
|
||||
class cl_bitfield(cl_ulong):
|
||||
def __or__(self, other):
|
||||
assert isinstance(other, self.__class__)
|
||||
return self.__class__(self.value | other.value)
|
||||
def __and__(self, other):
|
||||
assert isinstance(other, self.__class__)
|
||||
return self.__class__(self.value & other.value)
|
||||
def __xor__(self, other):
|
||||
assert isinstance(other, self.__class__)
|
||||
return self.__class__(self.value ^ other.value)
|
||||
def __not__(self):
|
||||
return self.__class__(~self.value)
|
||||
def __contains__(self, other):
|
||||
assert isinstance(other, self.__class__)
|
||||
return (self.value & other.value) == other.value
|
||||
def __hash__(self):
|
||||
return self.value.__hash__()
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, self.__class__):
|
||||
return False
|
||||
else:
|
||||
return self.value == other.value
|
||||
def __ne__(self, other):
|
||||
return not(self == other)
|
||||
def __repr__(self):
|
||||
return f'cl_bitfield: {self.value}'
|
||||
|
||||
class CLERROR(IntEnum):
|
||||
SUCCESS = 0
|
||||
DEVICE_NOT_FOUND = -1
|
||||
DEVICE_NOT_AVAILABLE = -2
|
||||
COMPILER_NOT_AVAILABLE = -3
|
||||
MEM_OBJECT_ALLOCATION_FAILURE = -4
|
||||
OUT_OF_RESOURCES = -5
|
||||
OUT_OF_HOST_MEMORY = -6
|
||||
PROFILING_INFO_NOT_AVAILABLE = -7
|
||||
MEM_COPY_OVERLAP = -8
|
||||
IMAGE_FORMAT_MISMATCH = -9
|
||||
IMAGE_FORMAT_NOT_SUPPORTED = -10
|
||||
BUILD_PROGRAM_FAILURE = -11
|
||||
MAP_FAILURE = -12
|
||||
MISALIGNED_SUB_BUFFER_OFFSET = -13
|
||||
EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST = -14
|
||||
INVALID_VALUE = -30
|
||||
INVALID_DEVICE_TYPE = -31
|
||||
INVALID_PLATFORM = -32
|
||||
INVALID_DEVICE = -33
|
||||
INVALID_CONTEXT = -34
|
||||
INVALID_QUEUE_PROPERTIES = -35
|
||||
INVALID_COMMAND_QUEUE = -36
|
||||
INVALID_HOST_PTR = -37
|
||||
INVALID_MEM_OBJECT = -38
|
||||
INVALID_IMAGE_FORMAT_DESCRIPTOR = -39
|
||||
INVALID_IMAGE_SIZE = -40
|
||||
INVALID_SAMPLER = -41
|
||||
INVALID_BINARY = -42
|
||||
INVALID_BUILD_OPTIONS = -43
|
||||
INVALID_PROGRAM = -44
|
||||
INVALID_PROGRAM_EXECUTABLE = -45
|
||||
INVALID_KERNEL_NAME = -46
|
||||
INVALID_KERNEL_DEFINITION = -47
|
||||
INVALID_KERNEL = -48
|
||||
INVALID_ARG_INDEX = -49
|
||||
INVALID_ARG_VALUE = -50
|
||||
INVALID_ARG_SIZE = -51
|
||||
INVALID_KERNEL_ARGS = -52
|
||||
INVALID_WORK_DIMENSION = -53
|
||||
INVALID_WORK_GROUP_SIZE = -54
|
||||
INVALID_WORK_ITEM_SIZE = -55
|
||||
INVALID_GLOBAL_OFFSET = -56
|
||||
INVALID_EVENT_WAIT_LIST = -57
|
||||
INVALID_EVENT = -58
|
||||
INVALID_OPERATION = -59
|
||||
INVALID_GL_OBJECT = -60
|
||||
INVALID_BUFFER_SIZE = -61
|
||||
INVALID_MIP_LEVEL = -62
|
||||
INVALID_GLOBAL_WORK_SIZE = -63
|
||||
INVALID_PROPERTY = -64
|
||||
INVALID_GL_SHAREGROUP_REFERENCE_KHR = -1000
|
||||
PLATFORM_NOT_FOUND_KHR = -1001
|
||||
|
||||
class CLRESULT(cl_int):
|
||||
def __eq__(self, other):
|
||||
if isinstance(other, int):
|
||||
return self.value == other
|
||||
elif isinstance(other, self.__class__):
|
||||
return self.value == other.value
|
||||
else:
|
||||
return False
|
||||
def __ne__(self, other):
|
||||
return not(self == other)
|
||||
def __hash__(self):
|
||||
return self.value.__hash__()
|
||||
def __str__(self):
|
||||
try:
|
||||
return f'CLRESULT ({str(CLERROR(self.value))})'
|
||||
except:
|
||||
return f'CLRESULT ({self.value})'
|
||||
def __repr__(self):
|
||||
return self.__str__()
|
||||
|
||||
class cl_platform_id(c_void_p): ...
|
||||
class cl_platform_info(cl_uint): ...
|
||||
class cl_device_id(c_void_p): ...
|
||||
class cl_device_type(cl_bitfield): ...
|
||||
class cl_device_info(cl_uint): ...
|
||||
class cl_context(c_void_p): ...
|
||||
class cl_context_properties(c_void_p): ...
|
||||
class cl_command_queue(c_void_p): ...
|
||||
class cl_command_queue_properties(cl_bitfield): ...
|
||||
class cl_event(c_void_p): ...
|
||||
class cl_mem(c_void_p): ...
|
||||
class cl_mem_info(cl_uint): ...
|
||||
class cl_mem_flags(cl_bitfield): ...
|
||||
class cl_program(c_void_p): ...
|
||||
class cl_program_build_info(cl_uint): ...
|
||||
class cl_kernel(c_void_p): ...
|
||||
|
||||
# https://github.com/KhronosGroup/OpenCL-Headers/blob/master/CL/cl.h
|
||||
CL_PLATFORM_PROFILE = cl_platform_info(0x0900)
|
||||
CL_PLATFORM_VERSION = cl_platform_info(0x0901)
|
||||
CL_PLATFORM_NAME = cl_platform_info(0x0902)
|
||||
CL_PLATFORM_VENDOR = cl_platform_info(0x0903)
|
||||
CL_PLATFORM_EXTENSIONS = cl_platform_info(0x0904)
|
||||
|
||||
CL_DEVICE_TYPE_DEFAULT = cl_device_type( (1 << 0) )
|
||||
CL_DEVICE_TYPE_CPU = cl_device_type( (1 << 1) )
|
||||
CL_DEVICE_TYPE_GPU = cl_device_type( (1 << 2) )
|
||||
CL_DEVICE_TYPE_ACCELERATOR = cl_device_type( (1 << 3) )
|
||||
CL_DEVICE_TYPE_ALL = cl_device_type( 0xFFFFFFFF )
|
||||
|
||||
CL_DEVICE_TYPE = cl_device_info (0x1000)
|
||||
CL_DEVICE_VENDOR_ID = cl_device_info (0x1001)
|
||||
CL_DEVICE_MAX_COMPUTE_UNITS = cl_device_info (0x1002)
|
||||
CL_DEVICE_GLOBAL_MEM_SIZE = cl_device_info (0x101F)
|
||||
CL_DEVICE_NAME = cl_device_info (0x102B)
|
||||
CL_DEVICE_VERSION = cl_device_info (0x102F)
|
||||
CL_DEVICE_MAX_MEM_ALLOC_SIZE = cl_device_info (0x1010)
|
||||
CL_DEVICE_MAX_WORK_GROUP_SIZE = cl_device_info (0x1004)
|
||||
CL_DRIVER_VERSION = cl_device_info (0x102D)
|
||||
CL_DEVICE_EXTENSIONS = cl_device_info (0x1030)
|
||||
|
||||
# cl_mem_flags
|
||||
CL_MEM_READ_WRITE = cl_mem_flags( (1 << 0) )
|
||||
CL_MEM_WRITE_ONLY = cl_mem_flags( (1 << 1) )
|
||||
CL_MEM_READ_ONLY = cl_mem_flags( (1 << 2) )
|
||||
CL_MEM_USE_HOST_PTR = cl_mem_flags( (1 << 3) )
|
||||
CL_MEM_ALLOC_HOST_PTR = cl_mem_flags( (1 << 4) )
|
||||
CL_MEM_COPY_HOST_PTR = cl_mem_flags( (1 << 5) )
|
||||
|
||||
# cl_mem_info
|
||||
CL_MEM_SIZE = cl_mem_info(0x1102)
|
||||
|
||||
# cl_program_build_info
|
||||
CL_PROGRAM_BUILD_STATUS = cl_program_build_info(0x1181)
|
||||
CL_PROGRAM_BUILD_OPTIONS = cl_program_build_info(0x1182)
|
||||
CL_PROGRAM_BUILD_LOG = cl_program_build_info(0x1183)
|
||||
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clGetPlatformIDs (num_entries : cl_uint, platforms : POINTER(cl_platform_id), num_platforms : POINTER(cl_uint) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clGetPlatformInfo (platform : cl_platform_id, param_name : cl_platform_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clGetDeviceIDs (platform : cl_platform_id, device_type : cl_device_type, num_entries : cl_uint, devices : POINTER(cl_device_id), num_devices : POINTER(cl_uint)) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clGetDeviceInfo(device : cl_device_id, param_name : cl_device_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clCreateContext(properties : cl_context_properties, num_devices : cl_uint, devices : POINTER(cl_device_id), pfn_notify : c_void_p, user_data : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_context: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clReleaseContext(context : cl_context) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clCreateCommandQueue(context : cl_context, device : cl_device_id, properties : cl_command_queue_properties, errcode_ret : POINTER(CLRESULT) ) -> cl_command_queue: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clReleaseCommandQueue(command_queue : cl_command_queue) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clFinish(command_queue : cl_command_queue) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clWaitForEvents(num_events : cl_uint, event_list : POINTER(cl_event) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clReleaseEvent(event : cl_event) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clCreateBuffer(context : cl_context, flags : cl_mem_flags, size : c_size_t, host_ptr : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_mem: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clGetMemObjectInfo(memobj : cl_mem, param_name : cl_mem_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clReleaseMemObject(memobj : cl_mem) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clEnqueueReadBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_read : cl_bool, offset : c_size_t, cb : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clEnqueueWriteBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_write : cl_bool, offset : c_size_t, size : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clEnqueueCopyBuffer (command_queue : cl_command_queue, src_buffer : cl_mem, dst_buffer : cl_mem, src_offset : c_size_t, dst_offset : c_size_t, cb : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : cl_event) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clEnqueueFillBuffer (command_queue : cl_command_queue, buffer : cl_mem, pattern : c_void_p, pattern_size : c_size_t, offset : c_size_t, size : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clCreateProgramWithSource (context : cl_context, count : cl_uint, strings : POINTER(c_char_p), lengths : POINTER(c_size_t), errcode_ret : POINTER(CLRESULT) ) -> cl_program: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clReleaseProgram (program : cl_program) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clBuildProgram (program : cl_program, num_devices : cl_uint, device_list : POINTER(cl_device_id), options : c_char_p, pfn_notify : c_void_p, user_data : c_void_p) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clGetProgramBuildInfo (program : cl_program, device : cl_device_id, param_name : cl_program_build_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clCreateKernelsInProgram (program : cl_program, num_kernels : cl_uint, kernels : POINTER(cl_kernel), num_kernels_ret : POINTER(cl_uint) ) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clReleaseKernel (program : cl_kernel) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clSetKernelArg (kernel : cl_kernel, arg_index : cl_uint, arg_size : c_size_t, arg_value : c_void_p) -> CLRESULT: ...
|
||||
|
||||
@dll_import('OpenCL')
|
||||
def clEnqueueNDRangeKernel (command_queue : cl_command_queue, kernel : cl_kernel, work_dim : cl_uint, global_work_offset : POINTER(c_size_t), global_work_size : POINTER(c_size_t), local_work_size : POINTER(c_size_t), num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
|
37
xlib/avecl/_internal/backend/OpenCL/__init__.py
Normal file
37
xlib/avecl/_internal/backend/OpenCL/__init__.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
Minimal OpenCL 1.2 low level ctypes API.
|
||||
"""
|
||||
from .OpenCL import (CL_DEVICE_EXTENSIONS, CL_DEVICE_GLOBAL_MEM_SIZE,
|
||||
CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
|
||||
CL_DEVICE_MAX_WORK_GROUP_SIZE, CL_DEVICE_NAME,
|
||||
CL_DEVICE_TYPE, CL_DEVICE_TYPE_ACCELERATOR,
|
||||
CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU,
|
||||
CL_DEVICE_TYPE_DEFAULT, CL_DEVICE_TYPE_GPU,
|
||||
CL_DEVICE_VENDOR_ID, CL_DEVICE_VERSION, CL_DRIVER_VERSION,
|
||||
CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR,
|
||||
CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_SIZE,
|
||||
CL_MEM_USE_HOST_PTR, CL_MEM_WRITE_ONLY,
|
||||
CL_PLATFORM_EXTENSIONS, CL_PLATFORM_NAME,
|
||||
CL_PLATFORM_PROFILE, CL_PLATFORM_VENDOR,
|
||||
CL_PLATFORM_VERSION, CL_PROGRAM_BUILD_LOG,
|
||||
CL_PROGRAM_BUILD_OPTIONS, CL_PROGRAM_BUILD_STATUS,
|
||||
CLERROR, CLRESULT, byref, c_char, c_char_p, c_size_t,
|
||||
c_void_p, cl_bitfield, cl_bool, cl_char, cl_command_queue,
|
||||
cl_command_queue_properties, cl_context,
|
||||
cl_context_properties, cl_device_id, cl_device_info,
|
||||
cl_device_type, cl_double, cl_event, cl_float, cl_half,
|
||||
cl_int, cl_kernel, cl_long, cl_mem, cl_mem_info,
|
||||
cl_platform_id, cl_platform_info, cl_program,
|
||||
cl_program_build_info, cl_short, cl_uchar, cl_uint,
|
||||
cl_ulong, cl_ushort, clBuildProgram, clCreateBuffer,
|
||||
clCreateCommandQueue, clCreateContext,
|
||||
clCreateKernelsInProgram, clCreateProgramWithSource,
|
||||
clEnqueueCopyBuffer, clEnqueueFillBuffer,
|
||||
clEnqueueNDRangeKernel, clEnqueueReadBuffer,
|
||||
clEnqueueWriteBuffer, clFinish, clGetDeviceIDs,
|
||||
clGetDeviceInfo, clGetMemObjectInfo, clGetPlatformIDs,
|
||||
clGetPlatformInfo, clGetProgramBuildInfo,
|
||||
clReleaseCommandQueue, clReleaseContext, clReleaseEvent,
|
||||
clReleaseKernel, clReleaseMemObject, clReleaseProgram,
|
||||
clSetKernelArg, clWaitForEvents, create_string_buffer,
|
||||
ctypes, sizeof)
|
6
xlib/avecl/_internal/backend/__init__.py
Normal file
6
xlib/avecl/_internal/backend/__init__.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
from .Buffer import Buffer
|
||||
from .Device import (Device, cleanup_devices, get_available_devices_info,
|
||||
get_best_device, get_default_device, get_device,
|
||||
set_default_device)
|
||||
from .DeviceInfo import DeviceInfo
|
||||
from .Kernel import Kernel
|
74
xlib/avecl/_internal/info/BroadcastInfo.py
Normal file
74
xlib/avecl/_internal/info/BroadcastInfo.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..AAxes import AAxes
|
||||
|
||||
class BroadcastInfo:
|
||||
"""
|
||||
Broadcast info for multi shapes.
|
||||
|
||||
arguments
|
||||
|
||||
shapes List[AShape]
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
Example:
|
||||
shapes = ( 3,),
|
||||
(3, 1,)
|
||||
#Example | comment
|
||||
|
||||
o_shape #(3, 3) broadcasted together o_shape
|
||||
|
||||
br_shapes #(1, 3) shape[0] broadcasted to o_shape ndim
|
||||
#(3, 1) shape[1] broadcasted to o_shape ndim
|
||||
|
||||
shapes_tiles #(3, 1) tiles to make o_shape from br_shapes[0]
|
||||
#(1, 3) tiles to make o_shape from br_shapes[1]
|
||||
|
||||
shapes_reduction_axes #(0,) reduction_axes to make shapes[0] from o_shape
|
||||
#(1,) reduction_axes to make shapes[1] from o_shape
|
||||
"""
|
||||
|
||||
__slots__ = ['o_shape', 'br_shapes', 'shapes_tiles', 'shapes_reduction_axes']
|
||||
|
||||
def __init__(self, shapes : List[AShape]):
|
||||
|
||||
highest_rank = sorted([shape.ndim for shape in shapes])[-1]
|
||||
|
||||
# Broadcasted all shapes to highest ndim with (1,)-s in from left side
|
||||
br_shapes = [ (1,)*(highest_rank-shape.ndim) + shape for shape in shapes ]
|
||||
|
||||
# Determine o_shape from all shapes
|
||||
o_shape = AShape( np.max( [ [ br_shape.shape[axis] for br_shape in br_shapes] for axis in range(highest_rank) ], axis=1 ) )
|
||||
|
||||
shapes_tiles = []
|
||||
shapes_reduction_axes = []
|
||||
for br_shape in br_shapes:
|
||||
shape_tile = []
|
||||
shape_reduction_axes = []
|
||||
for axis, (x,y) in enumerate(zip(br_shape.shape, o_shape.shape)):
|
||||
if x != y:
|
||||
if x == 1 and y != 1:
|
||||
shape_tile.append(y)
|
||||
shape_reduction_axes.append(axis)
|
||||
elif x != 1 and y == 1:
|
||||
shape_tile.append(1)
|
||||
else:
|
||||
raise ValueError(f'operands could not be broadcast together with shapes {br_shape} {o_shape}')
|
||||
else:
|
||||
shape_tile.append(1)
|
||||
|
||||
shapes_tiles.append(shape_tile)
|
||||
shapes_reduction_axes.append( AAxes(shape_reduction_axes) )
|
||||
|
||||
|
||||
self.o_shape : AShape = AShape(o_shape)
|
||||
self.br_shapes : List[AShape] = br_shapes
|
||||
self.shapes_tiles : List[List] = shapes_tiles
|
||||
self.shapes_reduction_axes : List [AAxes] = shapes_reduction_axes
|
||||
|
62
xlib/avecl/_internal/info/ConcatInfo.py
Normal file
62
xlib/avecl/_internal/info/ConcatInfo.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
from ..AShape import AShape
|
||||
|
||||
class ConcatInfo:
|
||||
__slots__ = ['o_shape', 'axis', 'axis_sizes', 'axis_offsets']
|
||||
|
||||
def __init__(self, shapes, axis):
|
||||
"""
|
||||
Concat info
|
||||
|
||||
arguments
|
||||
|
||||
shapes Iterable of shapes
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
result
|
||||
|
||||
.o_shape AShape
|
||||
|
||||
.axis Int fixed axis argument
|
||||
|
||||
.axis_sizes List[Int] axis sizes for every shape in shapes
|
||||
|
||||
.axis_offsets List[Int] axis offset in o_shape
|
||||
for every shape in shapes
|
||||
"""
|
||||
|
||||
shapes = tuple(shapes)
|
||||
|
||||
if len(shapes) == 0:
|
||||
raise ValueError('shapes is empty')
|
||||
|
||||
|
||||
shape = shapes[0]
|
||||
|
||||
if axis < 0:
|
||||
axis = shape.ndim + axis
|
||||
if axis < 0 or axis >= shape.ndim:
|
||||
raise ValueError(f'Wrong axis {axis}')
|
||||
|
||||
fixed_shapes = [ tuple(a for i,a in enumerate(shape) if i != axis) for shape in shapes ]
|
||||
req_shape = fixed_shapes[0]
|
||||
if not all(shape == req_shape for shape in fixed_shapes[1:]):
|
||||
raise ValueError(f'All shapes must match shape {tuple(a if i != axis else "*" for i,a in enumerate(shape))}')
|
||||
|
||||
axis_sizes = [ shape[axis] for shape in shapes ]
|
||||
|
||||
axis_offset = 0
|
||||
axis_offsets = []
|
||||
for axis_size in axis_sizes:
|
||||
axis_offsets.append(axis_offset)
|
||||
axis_offset += axis_size
|
||||
|
||||
self.o_shape = AShape( tuple(shape)[0:axis] + (sum(axis_sizes),) + tuple(shape)[axis+1:] )
|
||||
self.axis = axis
|
||||
self.axis_sizes = axis_sizes
|
||||
self.axis_offsets = tuple(axis_offsets)
|
||||
|
||||
|
||||
|
78
xlib/avecl/_internal/info/Conv2DInfo.py
Normal file
78
xlib/avecl/_internal/info/Conv2DInfo.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
from collections import Iterable
|
||||
import math
|
||||
|
||||
class Conv2DInfo:
|
||||
"""
|
||||
Conv2DInfo
|
||||
|
||||
arguments
|
||||
|
||||
H, W int axes sizes
|
||||
KH, KW int kernel sizes
|
||||
stride int
|
||||
dilation int
|
||||
|
||||
padding 'valid' no padding
|
||||
'same' output size will be the same
|
||||
or divided by stride
|
||||
int padding value for all sides
|
||||
|
||||
Iterable of 4 ints
|
||||
paddings for left,top,right,bottom sides
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
result:
|
||||
|
||||
.PADL .PADR paddings for W axis
|
||||
.PADT .PADB paddings for H axis
|
||||
|
||||
.OH .OW result axes
|
||||
|
||||
.OH_T .OW_T result transposed axes.
|
||||
it is None if padding != 'valid','same'
|
||||
"""
|
||||
|
||||
__slots__ = ['PADL', 'PADR', 'PADT', 'PADB', 'OH', 'OW', 'OH_T', 'OW_T']
|
||||
|
||||
def __init__(self, H, W, KH, KW, stride, dilation, padding):
|
||||
# Effective kernel sizes with dilation
|
||||
EKH = (KH-1)*dilation + 1
|
||||
EKW = (KW-1)*dilation + 1
|
||||
|
||||
# Determine pad size of sides
|
||||
OH_T = OW_T = None
|
||||
if padding == 'valid':
|
||||
PADL = PADT = PADR = PADB = 0
|
||||
OH_T = H * stride + max(EKH - stride, 0)
|
||||
OW_T = W * stride + max(EKW - stride, 0)
|
||||
elif padding == 'same':
|
||||
PADL = int(math.floor((EKW - 1)/2))
|
||||
PADT = int(math.floor((EKH - 1)/2))
|
||||
PADR = int(math.ceil((EKW - 1)/2))
|
||||
PADB = int(math.ceil((EKH - 1)/2))
|
||||
OH_T = H * stride
|
||||
OW_T = W * stride
|
||||
elif isinstance(padding, int):
|
||||
PADL = PADT = PADR = PADB = padding
|
||||
elif isinstance(padding, Iterable):
|
||||
padding = tuple(int(x) for x in padding)
|
||||
if len(padding) != 4:
|
||||
raise ValueError("Invalid paddings list length.")
|
||||
PADL, PADT, PADR, PADB = padding
|
||||
else:
|
||||
raise ValueError("Invalid padding value.")
|
||||
|
||||
self.PADL = PADL
|
||||
self.PADT = PADT
|
||||
self.PADR = PADR
|
||||
self.PADB = PADB
|
||||
|
||||
self.OH = max(1, int((H + PADT + PADB - EKH) / stride + 1) )
|
||||
self.OW = max(1, int((W + PADL + PADR - EKW) / stride + 1) )
|
||||
self.OH_T = OH_T
|
||||
self.OW_T = OW_T
|
||||
|
||||
|
56
xlib/avecl/_internal/info/PadInfo.py
Normal file
56
xlib/avecl/_internal/info/PadInfo.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
|
||||
|
||||
class PadInfo:
|
||||
"""
|
||||
Pad info.
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
|
||||
axes_paddings list of (l_pad, r_pad)
|
||||
|
||||
if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes
|
||||
if [-1] == ... , same for ride-side
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
result:
|
||||
|
||||
.o_shape AShape
|
||||
|
||||
"""
|
||||
|
||||
__slots__ = ['o_shape','axes_paddings']
|
||||
|
||||
def __init__(self, shape, axes_paddings : List):
|
||||
|
||||
if Ellipsis in axes_paddings:
|
||||
if sum(1 if x == Ellipsis else 0 for x in axes_paddings) > 1:
|
||||
raise ValueError('only 1 ...(ellipsis) allowed in axes_paddings')
|
||||
if axes_paddings[0] == Ellipsis:
|
||||
axes_paddings = ((0,0),)*(shape.ndim-(len(axes_paddings)-1))+ axes_paddings[1:]
|
||||
elif axes_paddings[-1] == Ellipsis:
|
||||
axes_paddings = axes_paddings[:-1] + ((0,0),)*(shape.ndim-(len(axes_paddings)-1))
|
||||
else:
|
||||
raise ValueError('...(ellipsis) must be at the begin or the end of axes_paddings')
|
||||
|
||||
if len(axes_paddings) != shape.ndim:
|
||||
raise ValueError(f'axes_paddings should match shape.ndim {shape.ndim}')
|
||||
|
||||
|
||||
self.axes_paddings = axes_paddings
|
||||
o_shape = []
|
||||
|
||||
for axis, (axis_size, (l_pad, r_pad)) in enumerate(zip(shape, axes_paddings)):
|
||||
new_axis_size = l_pad + axis_size + r_pad
|
||||
o_shape.append(new_axis_size)
|
||||
|
||||
self.o_shape = AShape(o_shape)
|
50
xlib/avecl/_internal/info/ReductionInfo.py
Normal file
50
xlib/avecl/_internal/info/ReductionInfo.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
from ..AShape import AShape
|
||||
|
||||
class ReductionInfo:
|
||||
"""
|
||||
Reduction info
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
|
||||
axes AAxes
|
||||
|
||||
keepdims bool
|
||||
|
||||
can raise ValueError, TypeError during the construction
|
||||
"""
|
||||
|
||||
__slots__ = [
|
||||
'reduction_axes', # sorted reduction AAxes
|
||||
'o_axes', # remain AAxes after reduction
|
||||
'o_shape', # result AShape of reduction
|
||||
'o_shape_kd', # result AShape of reduction with keepdims
|
||||
]
|
||||
|
||||
def __init__(self, shape, axes, keepdims):
|
||||
shape_axes = shape.axes_arange()
|
||||
|
||||
if axes.is_none_axes():
|
||||
axes = shape_axes
|
||||
|
||||
# Check correctness of axes
|
||||
for axis in axes:
|
||||
if axis not in shape_axes:
|
||||
raise ValueError(f'Wrong axis {axis} not in {shape_axes}')
|
||||
|
||||
self.reduction_axes = reduction_axes = axes.sorted()
|
||||
|
||||
# Output axes. Remove axes from shape_axes
|
||||
self.o_axes = o_axes = shape_axes - axes
|
||||
|
||||
if o_axes.is_none_axes():
|
||||
o_shape = AShape( (1,) )
|
||||
else:
|
||||
o_shape = shape[o_axes]
|
||||
|
||||
self.o_shape = o_shape
|
||||
self.o_shape_kd = AShape( 1 if axis in reduction_axes else shape[axis] for axis in range(shape.ndim))
|
||||
|
||||
if keepdims:
|
||||
self.o_shape = self.o_shape_kd
|
44
xlib/avecl/_internal/info/ReshapeInfo.py
Normal file
44
xlib/avecl/_internal/info/ReshapeInfo.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
from ..AShape import AShape
|
||||
|
||||
class ReshapeInfo:
|
||||
"""
|
||||
Reshape info.
|
||||
can raise ValueError,TypeError during the construction
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
|
||||
target_shape Iterable of ints
|
||||
can be any len and contains only one '-1'
|
||||
Example
|
||||
|
||||
shape (2, 512, 8, 8, 64)
|
||||
target_shape (2, 512, -1)
|
||||
o_shape (2, 512, 4096)
|
||||
"""
|
||||
|
||||
__slots__ = ['o_shape']
|
||||
|
||||
def __init__(self, shape, target_shape):
|
||||
o_shape = []
|
||||
|
||||
remain_size = shape.size
|
||||
|
||||
unk_axis = None
|
||||
for t_size in target_shape:
|
||||
t_size = int(t_size)
|
||||
if t_size != -1:
|
||||
mod = remain_size % t_size
|
||||
if mod != 0:
|
||||
raise ValueError(f'Cannot reshape {shape} to {target_shape}.')
|
||||
remain_size /= t_size
|
||||
else:
|
||||
if unk_axis is not None:
|
||||
raise ValueError('Can specify only one unknown dimension.')
|
||||
unk_axis = len(o_shape)
|
||||
o_shape.append( t_size )
|
||||
|
||||
if unk_axis is not None:
|
||||
o_shape[unk_axis] = int(remain_size)
|
||||
self.o_shape = AShape(o_shape)
|
148
xlib/avecl/_internal/info/SliceInfo.py
Normal file
148
xlib/avecl/_internal/info/SliceInfo.py
Normal file
|
@ -0,0 +1,148 @@
|
|||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
|
||||
|
||||
class SliceInfo:
|
||||
__slots__ = ['o_shape', 'o_shape_kd', 'just_reshaped','axes_bes','axes_abs_bes']
|
||||
|
||||
def __init__(self, shape : AShape, slices):
|
||||
"""
|
||||
Slice info.
|
||||
can raise ValueError,TypeError during the construction
|
||||
|
||||
arguments
|
||||
|
||||
slices
|
||||
|
||||
Example
|
||||
|
||||
|
||||
o_shape result shape after slice
|
||||
|
||||
axes_bes list of (begin,step,end) per axis
|
||||
if s==0, then single axis element is fetched from position b
|
||||
s can be negative.
|
||||
"""
|
||||
# Validate slices argument for given shape.
|
||||
new_slices = []
|
||||
before_ellipsis = None
|
||||
|
||||
for s in slices:
|
||||
if s is Ellipsis:
|
||||
before_ellipsis = new_slices
|
||||
new_slices = []
|
||||
continue
|
||||
elif s is not None and not isinstance(s, (int,tuple) ):
|
||||
raise ValueError(f'unknown slice argument {s} of type {s.__class__}')
|
||||
|
||||
new_slices.append(s)
|
||||
|
||||
if before_ellipsis is not None:
|
||||
# Process Ellipsis separator
|
||||
new_slices_n_axes = sum([ 1 for x in new_slices if x != None])
|
||||
before_ellipsis_n_axes = sum([ 1 for x in before_ellipsis if x != None])
|
||||
|
||||
# Expand slices by filling intermediate (None,None,None) for each remaining axis
|
||||
new_slices = before_ellipsis + \
|
||||
[(None,None,None)]*max(0, shape.ndim-before_ellipsis_n_axes-new_slices_n_axes) + \
|
||||
new_slices
|
||||
|
||||
new_slices_n_axes = sum([ 1 for x in new_slices if x != None])
|
||||
if new_slices_n_axes > shape.ndim:
|
||||
raise ValueError('slices arguments more than shape axes')
|
||||
elif new_slices_n_axes < shape.ndim:
|
||||
# Fill remaining axes
|
||||
new_slices += [(None,None,None)]*( shape.ndim - new_slices_n_axes )
|
||||
|
||||
slices = tuple(new_slices)
|
||||
|
||||
# Compute shapes
|
||||
output_is_reshaped = True # Flag determines that output_tensor
|
||||
# can be just reshaped without any computation
|
||||
o_shape = [] # output tensor shape
|
||||
o_shape_kd = [] # output shape used in kernel, must match input shape
|
||||
axes_bes = []
|
||||
axes_abs_bes = []
|
||||
|
||||
i_axis = 0
|
||||
|
||||
# Process slices arguments
|
||||
for v in slices:
|
||||
if v is None:
|
||||
# None is new axis
|
||||
# We can add unlimited number of (1,) axes at any place of shape
|
||||
o_shape.append(1)
|
||||
continue
|
||||
|
||||
i_axis_size = shape[i_axis]
|
||||
i_axis += 1
|
||||
|
||||
if isinstance(v, int):
|
||||
if v < 0:
|
||||
v += i_axis_size
|
||||
if v < 0 or v >= i_axis_size:
|
||||
raise ValueError(f'index {v} is out of bounds for axis {i_axis} with size {i_axis_size}')
|
||||
b,e,s = v,v,0
|
||||
else:
|
||||
b,e,s = v
|
||||
if s == 0:
|
||||
raise ValueError(f'slice step cannot be zero')
|
||||
|
||||
# Fix begin, end, step values
|
||||
if s is None:
|
||||
s = 1
|
||||
if b is None:
|
||||
b = 0 if s >= 0 else i_axis_size-1
|
||||
if e is None:
|
||||
e = i_axis_size if s >= 0 else -1
|
||||
elif e < 0:
|
||||
e += i_axis_size
|
||||
if b < 0:
|
||||
b += i_axis_size
|
||||
|
||||
if s >= 0:
|
||||
b = np.clip(b, 0, i_axis_size)
|
||||
e = np.clip(e, 0, i_axis_size)
|
||||
if b > e:
|
||||
raise ValueError('for positive step, begin cannot be > end.')
|
||||
|
||||
abs_b, abs_e, abs_s = b,e,s
|
||||
else:
|
||||
b = np.clip(b, 0, i_axis_size-1)
|
||||
e = np.clip(e, -1, i_axis_size)
|
||||
|
||||
if b <= e:
|
||||
raise ValueError('for negative step, begin cannot be <= end.')
|
||||
|
||||
abs_s = -s
|
||||
abs_e = b + 1
|
||||
abs_b = b - (math.ceil( (b-e) / abs_s ) -1) * abs_s
|
||||
|
||||
# for every o_shape_kd axis
|
||||
# we have exact begin,step values to fetch value from input
|
||||
axes_bes.append( (b,e,s))
|
||||
axes_abs_bes.append( (abs_b, abs_e, abs_s))
|
||||
|
||||
if i_axis_size != 1 and not (b == 0 and e == i_axis_size and s == 1):
|
||||
# Such params of axis slice will change input, thus output cannot be as just reshaped input
|
||||
output_is_reshaped = False
|
||||
|
||||
# Compute output_axis_size based on begin,end,step
|
||||
o_axis_size = max(0, math.ceil ( (e-b) / (s if s != 0 else 1) ) )
|
||||
|
||||
if o_axis_size >= 1:
|
||||
# >= 1 : select range of indexes, axis will remain
|
||||
o_shape.append(o_axis_size)
|
||||
# ^ othwerwise axis will be supressed
|
||||
|
||||
# o_shape with keepdims, must match ndim of input shape
|
||||
o_shape_kd.append( max(1,o_axis_size) )
|
||||
|
||||
self.just_reshaped = output_is_reshaped
|
||||
self.o_shape = AShape(o_shape)
|
||||
self.o_shape_kd = AShape(o_shape_kd)
|
||||
self.axes_bes = axes_bes
|
||||
self.axes_abs_bes = axes_abs_bes
|
37
xlib/avecl/_internal/info/StackInfo.py
Normal file
37
xlib/avecl/_internal/info/StackInfo.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
from ..AShape import AShape
|
||||
|
||||
class StackInfo:
|
||||
__slots__ = ['o_shape', 'axis']
|
||||
|
||||
def __init__(self, shape, axis, stack_count):
|
||||
"""
|
||||
Stack info
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
|
||||
axis Int
|
||||
|
||||
stack_count Int
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
result:
|
||||
|
||||
.o_shape AShape
|
||||
|
||||
.axis Int positive axis argument
|
||||
"""
|
||||
if axis < 0:
|
||||
axis = shape.ndim + 1 + axis
|
||||
if axis < 0 or axis > shape.ndim:
|
||||
raise ValueError(f'Wrong axis {axis}')
|
||||
|
||||
if stack_count <= 0:
|
||||
raise ValueError(f'Invalid stack_count {stack_count}')
|
||||
|
||||
self.o_shape = AShape( tuple(shape)[0:axis] + (stack_count,) + tuple(shape)[axis:] )
|
||||
self.axis = axis
|
52
xlib/avecl/_internal/info/TileInfo.py
Normal file
52
xlib/avecl/_internal/info/TileInfo.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
import numpy as np
|
||||
from ..AShape import AShape, AShape
|
||||
|
||||
class TileInfo:
|
||||
"""
|
||||
Tile info.
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
|
||||
tiles Iterable of ints
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
result:
|
||||
|
||||
.o_shape AShape
|
||||
|
||||
.axes_slices list of slice() to fetch original shape
|
||||
from o_shape for each tile
|
||||
"""
|
||||
|
||||
__slots__ = ['o_shape', 'axes_slices']
|
||||
|
||||
def __init__(self, shape, tiles):
|
||||
if len(tiles) != shape.ndim:
|
||||
raise ValueError(f'tiles should match shape.ndim {shape.ndim}')
|
||||
|
||||
self.o_shape = AShape(dim*tiles[i] for i,dim in enumerate(shape))
|
||||
|
||||
c = [0]*shape.ndim
|
||||
|
||||
axes_offsets = []
|
||||
for n in range(np.prod(tiles)):
|
||||
axes_offsets.append( c.copy() )
|
||||
for i in range(shape.ndim-1,-1,-1):
|
||||
c[i] += 1
|
||||
if c[i] < tiles[i]:
|
||||
break
|
||||
c[i] = 0
|
||||
|
||||
axes_slices = []
|
||||
for axes_offset in axes_offsets:
|
||||
sl = []
|
||||
for axis,tile in enumerate(axes_offset):
|
||||
axis_size = shape[axis]
|
||||
sl.append( slice(axis_size*tile, axis_size*(tile+1)) )
|
||||
axes_slices.append(tuple(sl))
|
||||
self.axes_slices = tuple(axes_slices)
|
37
xlib/avecl/_internal/info/TransposeInfo.py
Normal file
37
xlib/avecl/_internal/info/TransposeInfo.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
from ..AShape import AShape
|
||||
from ..AAxes import AAxes
|
||||
|
||||
class TransposeInfo:
|
||||
"""
|
||||
TransposeInfo
|
||||
|
||||
arguments
|
||||
|
||||
shape AShape
|
||||
|
||||
axes_order AAxes
|
||||
|
||||
errors during the construction:
|
||||
|
||||
ValueError
|
||||
|
||||
result
|
||||
|
||||
.o_shape AShape
|
||||
|
||||
.no_changes bool transpose changes nothing
|
||||
|
||||
"""
|
||||
|
||||
__slots__ = ['no_changes', 'o_shape']
|
||||
|
||||
def __init__(self, shape : AShape, axes_order : AAxes):
|
||||
if shape.ndim != axes_order.ndim:
|
||||
raise ValueError('axes must match the shape')
|
||||
|
||||
# Axes order changes nothing?
|
||||
self.o_shape = shape[axes_order]
|
||||
self.no_changes = axes_order == shape.axes_arange()
|
||||
|
||||
|
||||
|
10
xlib/avecl/_internal/info/__init__.py
Normal file
10
xlib/avecl/_internal/info/__init__.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
from .BroadcastInfo import BroadcastInfo
|
||||
from .ConcatInfo import ConcatInfo
|
||||
from .PadInfo import PadInfo
|
||||
from .ReductionInfo import ReductionInfo
|
||||
from .ReshapeInfo import ReshapeInfo
|
||||
from .SliceInfo import SliceInfo
|
||||
from .StackInfo import StackInfo
|
||||
from .TileInfo import TileInfo
|
||||
from .TransposeInfo import TransposeInfo
|
||||
from .Conv2DInfo import Conv2DInfo
|
89
xlib/avecl/_internal/initializer/InitCoords2DArange.py
Normal file
89
xlib/avecl/_internal/initializer/InitCoords2DArange.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
import numpy as np
|
||||
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
from .Initializer import Initializer
|
||||
|
||||
class InitCoords2DArange(Initializer):
|
||||
|
||||
def __init__(self, h_start, h_stop, w_start, w_stop):
|
||||
"""
|
||||
Initialize (...,H,W,D) tensor with coords arange
|
||||
D == 2(x,y) or 3 (x,y,1)
|
||||
|
||||
arguments
|
||||
|
||||
h_start float height start value (inclusive)
|
||||
|
||||
h_stop float height stop value (inclusive)
|
||||
|
||||
w_start float width start value (inclusive)
|
||||
|
||||
w_stop float width stop value (inclusive)
|
||||
|
||||
"""
|
||||
super().__init__()
|
||||
self._h_start = h_start
|
||||
self._h_stop = h_stop
|
||||
self._w_start = w_start
|
||||
self._w_stop = w_stop
|
||||
|
||||
def initialize_tensor(self, tensor : Tensor):
|
||||
shape = tensor.shape
|
||||
dtype = tensor.dtype
|
||||
|
||||
if shape.ndim < 3:
|
||||
raise ValueError(f'tensor.shape.ndim must be >= 3 (...,H,W,D)')
|
||||
|
||||
OH,OW,OD = shape[-3:]
|
||||
if OD not in [2,3]:
|
||||
raise ValueError(f'last dim D {OD} must == 2 or 3')
|
||||
|
||||
if OH > 1:
|
||||
h_step = (self._h_stop-self._h_start) / (OH-1)
|
||||
else:
|
||||
h_step = 0
|
||||
|
||||
if OW > 1:
|
||||
w_step = (self._w_stop-self._w_start) / (OW-1)
|
||||
else:
|
||||
w_step = 0
|
||||
|
||||
key = (InitCoords2DArange, dtype)
|
||||
kernel = SCacheton.get_var(key)
|
||||
if kernel is None:
|
||||
kernel = Kernel(kernel_text=f"""
|
||||
|
||||
{HKernel.define_tensor_type('O', tensor.dtype)}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME , float h_start, float h_step
|
||||
, float w_start, float w_step,
|
||||
uint O0, uint O1, uint O2)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', 3)}
|
||||
|
||||
O_TYPE v;
|
||||
if (o2 == 0)
|
||||
v = w_start+o1*w_step;
|
||||
else
|
||||
if (o2 == 1)
|
||||
v = h_start+o0*h_step;
|
||||
else
|
||||
v = 1;
|
||||
|
||||
O_GLOBAL_STORE(gid, v);
|
||||
}}
|
||||
""")
|
||||
SCacheton.set_var(key, kernel)
|
||||
|
||||
tensor.get_device().run_kernel( kernel, tensor.get_buffer(),
|
||||
np.float32(self._h_start), np.float32(h_step),
|
||||
np.float32(self._w_start), np.float32(w_step),
|
||||
np.uint32(OH), np.uint32(OW), np.uint32(OD),
|
||||
global_shape=(shape.size,) )
|
||||
|
||||
def __str__(self): return f'CoordsArange'
|
62
xlib/avecl/_internal/initializer/InitRandomUniform.py
Normal file
62
xlib/avecl/_internal/initializer/InitRandomUniform.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
import numpy as np
|
||||
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
from .Initializer import Initializer
|
||||
|
||||
|
||||
class InitRandomUniform(Initializer):
|
||||
|
||||
def __init__(self, low=0, high=1):
|
||||
"""
|
||||
arguments
|
||||
|
||||
low(0) low value
|
||||
|
||||
high(1) high value (exclusive)
|
||||
"""
|
||||
super().__init__()
|
||||
self._low = low
|
||||
self._high = high
|
||||
|
||||
def initialize_tensor(self, tensor : Tensor):
|
||||
|
||||
key = (InitRandomUniform, self._low, self._high, tensor.dtype)
|
||||
kernel = SCacheton.get_var(key)
|
||||
if kernel is None:
|
||||
|
||||
hl = self._high-self._low
|
||||
l = self._low
|
||||
|
||||
if tensor.dtype in [np.bool_,np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32]:
|
||||
gen_expression = f'hash_uint_from_uint(gid+seed32) % {int(hl)} + {int(l)}'
|
||||
elif tensor.dtype in [np.int64]:
|
||||
gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
|
||||
elif tensor.dtype in [np.uint64]:
|
||||
gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
|
||||
elif tensor.dtype in [np.float16, np.float32]:
|
||||
gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}'
|
||||
elif tensor.dtype in [np.float64]:
|
||||
gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}'
|
||||
|
||||
kernel = Kernel(kernel_text=f"""
|
||||
{HKernel.include_hash()}
|
||||
{HKernel.define_tensor('O', (tensor.shape.size,), tensor.dtype )}
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, uint seed32, ulong seed64)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
O_GLOBAL_STORE(gid, {gen_expression} );
|
||||
}}
|
||||
""")
|
||||
SCacheton.set_var(key, kernel)
|
||||
|
||||
|
||||
tensor.get_device().run_kernel( kernel, tensor.get_buffer(),
|
||||
np.uint32(np.random.randint(np.iinfo(np.uint32).max, dtype=np.uint32)),
|
||||
np.uint64(np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64)),
|
||||
global_shape=(tensor.shape.size,) )
|
||||
|
||||
def __str__(self): return f'InitRandomUniform low={self._low}, high={self._high}'
|
||||
|
17
xlib/avecl/_internal/initializer/Initializer.py
Normal file
17
xlib/avecl/_internal/initializer/Initializer.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
from ..Tensor import Tensor
|
||||
|
||||
class Initializer:
|
||||
"""
|
||||
Base class for tensor inititalizers
|
||||
"""
|
||||
def initialize_tensor(self, tensor : Tensor):
|
||||
"""
|
||||
Implement initialization of tensor
|
||||
|
||||
You can compute the data using python and then call buffer.set(numpy_value)
|
||||
or call kernel.run( tensor.get_device(), tensor.get_buffer, ...) to initialize the data using OpenCL
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def __str__(self): return 'Initializer'
|
||||
def __repr__(self): return self.__str__()
|
3
xlib/avecl/_internal/initializer/__init__.py
Normal file
3
xlib/avecl/_internal/initializer/__init__.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from .InitCoords2DArange import InitCoords2DArange
|
||||
from .Initializer import Initializer
|
||||
from .InitRandomUniform import InitRandomUniform
|
21
xlib/avecl/_internal/op/__init__.py
Normal file
21
xlib/avecl/_internal/op/__init__.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
|
||||
from .binary_dilate_circle import binary_dilate_circle
|
||||
from .binary_erode_circle import binary_erode_circle
|
||||
from .binary_morph import binary_morph
|
||||
from .cast import cast
|
||||
from .concat import concat
|
||||
from .depthwise_conv2D import depthwise_conv2D
|
||||
from .gaussian_blur import gaussian_blur
|
||||
from .matmul import matmul, matmulc
|
||||
from .pad import pad
|
||||
from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
|
||||
reduce_sum, reduce_variance)
|
||||
from .remap import remap
|
||||
from .remap_np_affine import remap_np_affine
|
||||
from .reshape import reshape
|
||||
from .slice_ import slice_
|
||||
from .slice_set import slice_set
|
||||
from .stack import stack
|
||||
from .tile import tile
|
||||
from .transpose import transpose
|
||||
from .warp_affine import warp_affine
|
111
xlib/avecl/_internal/op/any_wise.py
Normal file
111
xlib/avecl/_internal/op/any_wise.py
Normal file
|
@ -0,0 +1,111 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HArgs import HArgs
|
||||
from ..HKernel import HKernel
|
||||
from ..HType import HType
|
||||
from ..info import BroadcastInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def any_wise(op_text : str,
|
||||
*args,
|
||||
dtype : np.dtype = None,
|
||||
output_t:Tensor=None) -> Tensor:
|
||||
"""
|
||||
operator for N-wise ops with N inputs
|
||||
|
||||
arguments
|
||||
op_text example: O=(2*I0*I1)+I2
|
||||
|
||||
*args List[ Tensor | number ]
|
||||
|
||||
dtype
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size.
|
||||
"""
|
||||
HArgs.check_zero_get_length(args)
|
||||
tensor_args = HArgs.filter_tensor(args, raise_on_empty=True)
|
||||
device = HArgs.check_get_same_device(tensor_args)
|
||||
|
||||
shape_list, dtype_list, krn_args = HArgs.decompose(args)
|
||||
|
||||
op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
|
||||
elif output_t.shape.size != op.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.o_shape.size}')
|
||||
|
||||
device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args)
|
||||
|
||||
return output_t
|
||||
|
||||
class _AnyWiseOp:
|
||||
def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
|
||||
if len(shape_list) != len(dtype_list):
|
||||
raise ValueError('len(shape_list) != len(dtype_list)')
|
||||
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
|
||||
|
||||
if len(shape_list) == 1:
|
||||
# element-wise.
|
||||
i_shape, i_dtype = shape_list[0], dtype_list[0]
|
||||
self.o_shape = o_shape = i_shape
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('IN', i_shape, i_dtype)}
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
O_TYPE O = O_GLOBAL_LOAD(gid);
|
||||
IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
|
||||
{op_text};
|
||||
O_GLOBAL_STORE(gid, O);
|
||||
}}
|
||||
""")
|
||||
else:
|
||||
# Multi arg.
|
||||
self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
|
||||
|
||||
self.o_shape = o_shape = info.o_shape
|
||||
|
||||
defs, arg_defs, impls = [], [], []
|
||||
for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
|
||||
t_name = f'I{i}'
|
||||
if t_shape is not None:
|
||||
defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
|
||||
arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
|
||||
impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
|
||||
else:
|
||||
arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
|
||||
|
||||
defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
{defs}
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
|
||||
{impls}
|
||||
O_TYPE O;
|
||||
{op_text};
|
||||
O_GLOBAL_STORE(gid, O);
|
||||
}}
|
||||
""")
|
||||
|
||||
def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t)
|
||||
def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t)
|
||||
def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t)
|
||||
def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t)
|
||||
def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
|
||||
def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
|
||||
def square(a_t : Tensor) -> Tensor: return any_wise('O=I0*I0', a_t)
|
||||
def sqrt(a_t : Tensor) -> Tensor: return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t)
|
91
xlib/avecl/_internal/op/binary_dilate_circle.py
Normal file
91
xlib/avecl/_internal/op/binary_dilate_circle.py
Normal file
|
@ -0,0 +1,91 @@
|
|||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import Conv2DInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def binary_dilate_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
|
||||
"""
|
||||
Binary dilate operator using circle kernel with radius.
|
||||
|
||||
input_t Tensor (...,H,W)
|
||||
|
||||
per-element of H,W, set 1 if any neighbor elements inside circle with radius != 0.
|
||||
otherwise set 0.
|
||||
"""
|
||||
op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)
|
||||
|
||||
device = input_t.get_device()
|
||||
|
||||
if radius <= 0 or iterations <= 0:
|
||||
return input_t.copy()
|
||||
else:
|
||||
for i in range(iterations):
|
||||
if i == 0:
|
||||
buf_in = input_t
|
||||
else:
|
||||
buf_in, buf_out = buf_out, buf_in
|
||||
if i <= 1:
|
||||
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
|
||||
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
|
||||
|
||||
return buf_out
|
||||
|
||||
class _BinaryDilateOp():
|
||||
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
if i_shape.ndim < 2:
|
||||
raise ValueError(f'i_shape.ndim must be >= 2')
|
||||
|
||||
KS = radius*2+1
|
||||
IH,IW = i_shape[-2:]
|
||||
|
||||
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
|
||||
|
||||
self.o_shape = o_shape = i_shape
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype)}
|
||||
|
||||
#define PADL {ci.PADL}
|
||||
#define PADT {ci.PADT}
|
||||
|
||||
#define RADIUS {radius}
|
||||
#define KS {KS}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
|
||||
|
||||
{'#pragma unroll' if KS <= 16 else ''}
|
||||
for (int kh=0; kh<KS; ++kh)
|
||||
{'#pragma unroll' if KS <= 16 else ''}
|
||||
for (int kw=0; kw<KS; ++kw)
|
||||
{{
|
||||
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
|
||||
{{
|
||||
int im2 = -PADT + kh + om2;
|
||||
int im1 = -PADL + kw + om1;
|
||||
|
||||
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
|
||||
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
|
||||
: 0;
|
||||
|
||||
if (i_val != (I_TYPE)0)
|
||||
{{
|
||||
O_GLOBAL_STORE(gid, (O_TYPE) 1);
|
||||
return;
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
|
||||
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
|
||||
}}
|
||||
""")
|
||||
|
||||
|
91
xlib/avecl/_internal/op/binary_erode_circle.py
Normal file
91
xlib/avecl/_internal/op/binary_erode_circle.py
Normal file
|
@ -0,0 +1,91 @@
|
|||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import Conv2DInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def binary_erode_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
|
||||
"""
|
||||
Binary erode operator using circle kernel with radius.
|
||||
|
||||
input_t Tensor (...,H,W)
|
||||
|
||||
per-element of H,W, set 1 if all neighbor elements inside circle with radius != 0.
|
||||
otherwise set 0.
|
||||
"""
|
||||
op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype)
|
||||
|
||||
device = input_t.get_device()
|
||||
|
||||
if radius <= 0 or iterations <= 0:
|
||||
return input_t.copy()
|
||||
else:
|
||||
for i in range(iterations):
|
||||
if i == 0:
|
||||
buf_in = input_t
|
||||
else:
|
||||
buf_in, buf_out = buf_out, buf_in
|
||||
if i <= 1:
|
||||
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
|
||||
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
|
||||
|
||||
return buf_out
|
||||
|
||||
class _BinaryErodeOp():
|
||||
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
if i_shape.ndim < 2:
|
||||
raise ValueError(f'i_shape.ndim must be >= 2')
|
||||
|
||||
KS = radius*2+1
|
||||
IH,IW = i_shape[-2:]
|
||||
|
||||
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
|
||||
|
||||
self.o_shape = o_shape = i_shape
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype)}
|
||||
|
||||
#define PADL {ci.PADL}
|
||||
#define PADT {ci.PADT}
|
||||
|
||||
#define RADIUS {radius}
|
||||
#define KS {KS}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
|
||||
|
||||
{'#pragma unroll' if KS <= 16 else ''}
|
||||
for (int kh=0; kh<KS; ++kh)
|
||||
{'#pragma unroll' if KS <= 16 else ''}
|
||||
for (int kw=0; kw<KS; ++kw)
|
||||
{{
|
||||
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
|
||||
{{
|
||||
int im2 = -PADT + kh + om2;
|
||||
int im1 = -PADL + kw + om1;
|
||||
|
||||
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
|
||||
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
|
||||
: 0;
|
||||
|
||||
if (i_val == (I_TYPE)0)
|
||||
{{
|
||||
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
|
||||
return;
|
||||
}}
|
||||
}}
|
||||
}}
|
||||
|
||||
O_GLOBAL_STORE(gid, (O_TYPE) 1 );
|
||||
}}
|
||||
""")
|
||||
|
||||
|
43
xlib/avecl/_internal/op/binary_morph.py
Normal file
43
xlib/avecl/_internal/op/binary_morph.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
from ..Tensor import Tensor
|
||||
from .binary_dilate_circle import binary_dilate_circle
|
||||
from .binary_erode_circle import binary_erode_circle
|
||||
from .gaussian_blur import gaussian_blur
|
||||
from .pad import pad
|
||||
|
||||
|
||||
def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor:
|
||||
"""
|
||||
Apply optional binary erode/dilate and optional blur.
|
||||
|
||||
input_t (...,H,W) tensor. Non zero values will be treated as 1.
|
||||
|
||||
erode_dilate int >= 0 amount of pixels to dilate
|
||||
|
||||
blur float >= 0 amount of pixels to blur
|
||||
|
||||
fade_to_border(False) clip the image in order
|
||||
to fade smoothly to the border with specified blur amount
|
||||
"""
|
||||
x = input_t
|
||||
|
||||
H,W = input_t.shape[-2:]
|
||||
|
||||
x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)
|
||||
|
||||
if erode_dilate > 0:
|
||||
x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
|
||||
elif erode_dilate < 0:
|
||||
x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )
|
||||
|
||||
if fade_to_border:
|
||||
h_clip_size = H + blur // 2
|
||||
w_clip_size = W + blur // 2
|
||||
x[...,:h_clip_size,:] = 0
|
||||
x[...,-h_clip_size:,:] = 0
|
||||
x[...,:,:w_clip_size] = 0
|
||||
x[...,:,-w_clip_size:] = 0
|
||||
|
||||
if blur > 0:
|
||||
x = gaussian_blur(x, blur * 0.250, dtype=dtype)
|
||||
|
||||
return x[...,H:-H,W:-W]
|
17
xlib/avecl/_internal/op/cast.py
Normal file
17
xlib/avecl/_internal/op/cast.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
from ..Tensor import Tensor
|
||||
|
||||
from .any_wise import any_wise
|
||||
|
||||
def cast(input_t : Tensor, dtype, output_t:Tensor=None) -> Tensor:
|
||||
"""
|
||||
cast operator
|
||||
|
||||
arguments
|
||||
input_t
|
||||
|
||||
dtype
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size.
|
||||
"""
|
||||
return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)
|
70
xlib/avecl/_internal/op/concat.py
Normal file
70
xlib/avecl/_internal/op/concat.py
Normal file
|
@ -0,0 +1,70 @@
|
|||
from ..backend import Kernel
|
||||
from ..HArgs import HArgs
|
||||
from ..HType import HType
|
||||
from ..HKernel import HKernel
|
||||
from ..info import ConcatInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
arguments
|
||||
|
||||
tensor_list Iterable
|
||||
|
||||
axis Int
|
||||
|
||||
dtype np.dtype
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape,
|
||||
but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
"""
|
||||
tensor_list = tuple(tensor_list)
|
||||
HArgs.check_zero_get_length(tensor_list)
|
||||
HArgs.check_all_tensors(tensor_list)
|
||||
|
||||
device = HArgs.check_get_same_device(tensor_list)
|
||||
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
|
||||
|
||||
op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output)
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
|
||||
elif output_t.shape.size != op.info.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
|
||||
|
||||
for forward_krn,t in zip(op.forward_krns,tensor_list):
|
||||
device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) )
|
||||
|
||||
return output_t
|
||||
|
||||
class _ConcatOp:
|
||||
def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output):
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
|
||||
|
||||
self.info = info = ConcatInfo(shape_list, axis)
|
||||
|
||||
self.forward_krns = forward_krns = []
|
||||
|
||||
for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)):
|
||||
forward_krn = Kernel(f"""
|
||||
|
||||
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
|
||||
{HKernel.define_tensor('I', shape, dtype)}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)}
|
||||
|
||||
i{info.axis} += {info.axis_offsets[i]};
|
||||
|
||||
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) );
|
||||
}}
|
||||
""")
|
||||
forward_krns.append(forward_krn)
|
107
xlib/avecl/_internal/op/depthwise_conv2D.py
Normal file
107
xlib/avecl/_internal/op/depthwise_conv2D.py
Normal file
|
@ -0,0 +1,107 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import BroadcastInfo, Conv2DInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def depthwise_conv2D (input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None):
|
||||
"""
|
||||
Depthwise Conv2D operator.
|
||||
|
||||
input_t Tensor (...,H,W)
|
||||
|
||||
kernel_t Tensor (...,H,W)
|
||||
|
||||
stride(1) int
|
||||
|
||||
dilation(1) int
|
||||
|
||||
padding(same) 'valid' no padding
|
||||
'same' output size will be the same
|
||||
or divided by stride
|
||||
int padding value for all sides
|
||||
Iterable of 4 ints
|
||||
paddings for left,top,right,bottom sides
|
||||
|
||||
...-head part of shapes will be broadcasted to each other
|
||||
"""
|
||||
|
||||
op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)
|
||||
|
||||
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
|
||||
output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())
|
||||
|
||||
return output_t
|
||||
|
||||
class _DepthwiseConv2DOp():
|
||||
def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
if i_shape.ndim < 2:
|
||||
raise ValueError(f'i_shape.ndim must be >= 2')
|
||||
|
||||
if k_shape.ndim < 2:
|
||||
raise ValueError(f'k_shape.ndim must be >= 2')
|
||||
|
||||
IH,IW = i_shape[-2:]
|
||||
KH,KW = k_shape[-2:]
|
||||
|
||||
ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)
|
||||
|
||||
if i_shape.ndim == 2 and k_shape.ndim == 2:
|
||||
# nothing to broadcast
|
||||
i_br_shape = i_shape
|
||||
k_br_shape = k_shape
|
||||
|
||||
o_shape = AShape([ci.OH, ci.OW])
|
||||
else:
|
||||
op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])
|
||||
|
||||
i_br_shape = op.br_shapes[0] + i_shape[-2:]
|
||||
k_br_shape = op.br_shapes[1] + k_shape[-2:]
|
||||
|
||||
o_shape = op.o_shape + [ci.OH, ci.OW]
|
||||
|
||||
self.o_shape = o_shape
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
|
||||
{HKernel.define_tensor('K', k_br_shape, k_dtype)}
|
||||
|
||||
#define PADL {ci.PADL}
|
||||
#define PADT {ci.PADT}
|
||||
|
||||
#define STRIDE {stride}
|
||||
#define DILATION {dilation}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
|
||||
|
||||
float v = 0.0;
|
||||
{'#pragma unroll' if KH <= 9 else ''}
|
||||
for (int km2=0; km2<Km2; ++km2)
|
||||
{{
|
||||
int im2 = -PADT + km2*DILATION + om2*STRIDE;
|
||||
if (im2 >= 0 & im2 < Im2)
|
||||
{'#pragma unroll' if KW <= 9 else ''}
|
||||
for (int km1=0; km1<Km1; ++km1)
|
||||
{{
|
||||
int im1 = -PADL + km1*DILATION + om1*STRIDE;
|
||||
if (im1 >= 0 & im1 < Im1)
|
||||
v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
|
||||
*K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
|
||||
}}
|
||||
}}
|
||||
|
||||
O_GLOBAL_STORE(gid, (O_TYPE) v);
|
||||
}}
|
||||
""")
|
||||
|
||||
|
39
xlib/avecl/_internal/op/gaussian_blur.py
Normal file
39
xlib/avecl/_internal/op/gaussian_blur.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
import numpy as np
|
||||
|
||||
from ..Tensor import Tensor
|
||||
from .depthwise_conv2D import depthwise_conv2D
|
||||
|
||||
|
||||
def gaussian_blur (input_t : Tensor, sigma, dtype=None) -> Tensor:
|
||||
"""
|
||||
arguments
|
||||
|
||||
input_t Tensor(...,H,W)
|
||||
|
||||
sigma float
|
||||
|
||||
|
||||
"""
|
||||
if sigma <= 0.0:
|
||||
return input_t.copy() #TODO
|
||||
|
||||
device = input_t.get_device()
|
||||
|
||||
key = (gaussian_blur, sigma)
|
||||
kernel_t = device.get_cached_data(key)
|
||||
if kernel_t is None:
|
||||
kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
|
||||
device.set_cached_data(key, kernel_t)
|
||||
|
||||
output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
|
||||
return output_t
|
||||
|
||||
def _make_gaussian_kernel(sigma : float, dtype):
|
||||
kernel_size = max(3, int(2 * 2 * sigma))
|
||||
if kernel_size % 2 == 0:
|
||||
kernel_size += 1
|
||||
mean = np.floor(0.5 * kernel_size)
|
||||
kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)])
|
||||
np_kernel = np.outer(kernel_1d, kernel_1d)
|
||||
kernel = np_kernel / np.sum(np_kernel)
|
||||
return kernel.astype(dtype)
|
158
xlib/avecl/_internal/op/matmul.py
Normal file
158
xlib/avecl/_internal/op/matmul.py
Normal file
|
@ -0,0 +1,158 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HArgs import HArgs
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def matmul(a_t : Tensor, b_t : Tensor, output_t: Tensor=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
matmul operator in row-major format
|
||||
|
||||
A(...,M,K) x
|
||||
B(...,K,N) =
|
||||
(...,M,N)
|
||||
|
||||
arguments
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape,
|
||||
but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
"""
|
||||
|
||||
return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output)
|
||||
|
||||
|
||||
|
||||
def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
matmul operator in col-major format
|
||||
|
||||
A(...,K,M) x
|
||||
B(...,N,K) =
|
||||
(...,N,M)
|
||||
|
||||
arguments
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape,
|
||||
but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
"""
|
||||
device = HArgs.check_get_same_device([a_t, b_t])
|
||||
|
||||
op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output)
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor (op.o_shape, op.o_dtype, device=device )
|
||||
elif output_t.shape.size != op.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.o_shape.size}')
|
||||
|
||||
device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer(), )
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _MatmulOp:
|
||||
def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output):
|
||||
a_dtype = np.dtype(a_dtype).type
|
||||
b_dtype = np.dtype(b_dtype).type
|
||||
|
||||
if a_dtype != np.float32 or b_dtype != np.float32:
|
||||
raise ValueError('matmul works only with float32 tensors.')
|
||||
|
||||
if a_shape.ndim != b_shape.ndim:
|
||||
raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}')
|
||||
|
||||
ndim = a_shape.ndim
|
||||
if ndim < 2:
|
||||
raise ValueError('Tensors ndim must be at least 2.')
|
||||
|
||||
K, M = a_shape[-2], a_shape[-1]
|
||||
N, B_COLS = b_shape[-2], b_shape[-1]
|
||||
|
||||
if K != B_COLS:
|
||||
raise ValueError('A_ROWS != B_COLS')
|
||||
|
||||
BATCH = a_shape[0:-2].size
|
||||
B_BATCH = b_shape[0:-2].size
|
||||
|
||||
if BATCH != B_BATCH:
|
||||
raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}')
|
||||
|
||||
if ndim == 2:
|
||||
self.o_shape = AShape( (N, M) )
|
||||
else:
|
||||
self.o_shape = AShape( a_shape[:-2]+(N, M) )
|
||||
self.o_dtype = np.float32
|
||||
|
||||
self.M = M
|
||||
self.N = N
|
||||
self.K = K
|
||||
|
||||
# Determining optimal tile widths
|
||||
for MW in [8,4,2,1]:
|
||||
if M % MW == 0:
|
||||
break
|
||||
for KW in [8,4,2,1]:
|
||||
if N % KW == 0 and K % KW == 0:
|
||||
break
|
||||
NW = KW
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f"""
|
||||
#define K {K}
|
||||
#define N {N}
|
||||
#define MW {MW} // M tile Width
|
||||
#define NW {NW} // N tile Width -- NW & KW should be the same !
|
||||
#define KW {KW} // K tile Width
|
||||
#define MT {M//MW} // MT is max for 'mt' (M tile count)
|
||||
#define KT {K//KW} // KT is max for 'kt' (K tile count)
|
||||
|
||||
#define floatMW { f'float{MW}' if MW != 1 else 'float'}
|
||||
#define floatKW { f'float{KW}' if KW != 1 else 'float'}
|
||||
|
||||
__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
|
||||
{{
|
||||
size_t mt = get_global_id(0); //global M-tile id
|
||||
size_t nc = get_global_id(1); //global N-tile id
|
||||
size_t batch = get_global_id(2);
|
||||
|
||||
float AT[KW][MW]; // sub tiles
|
||||
float BT[NW][KW];
|
||||
float CT[NW][MW];
|
||||
|
||||
#pragma unroll
|
||||
for (uint i=0; i<NW*MW; ++i) // zero CT tile
|
||||
((float*) CT)[i] = 0.0;
|
||||
|
||||
for (uint kt=0; kt<KT; ++kt) // iterate over K-dim tiles
|
||||
{{
|
||||
#pragma unroll
|
||||
for (uint k=0; k<KW; ++k) // every k-element inside K-dim tile
|
||||
*( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats
|
||||
|
||||
#pragma unroll
|
||||
for (uint n=0; n<NW; ++n) // every n-element inside N-dim tile
|
||||
*( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats
|
||||
|
||||
#pragma unroll
|
||||
for (uint k=0; k<KW; ++k)
|
||||
#pragma unroll
|
||||
for (uint n=0; n<NW; ++n) // sub tiles multiplication
|
||||
#pragma unroll
|
||||
for (uint m=0; m<MW; ++m)
|
||||
CT[n][m] += AT[k][m] * BT[n][k];
|
||||
}}
|
||||
|
||||
#pragma unroll
|
||||
for (uint n=0; n<NW; ++n)
|
||||
O[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='}
|
||||
*( (floatMW*) CT[n]);
|
||||
}}""")
|
||||
|
71
xlib/avecl/_internal/op/pad.py
Normal file
71
xlib/avecl/_internal/op/pad.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..HType import HType
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import PadInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def pad(input_t : Tensor, axes_paddings : List, mode : str = 'constant', constant_value=0, dtype : np.dtype = None, output_t : Tensor=None) -> Tensor:
|
||||
"""
|
||||
arguments:
|
||||
|
||||
axes_paddings list of (l_pad, r_pad),
|
||||
|
||||
if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes
|
||||
if [-1] == ... , same for ride-side
|
||||
|
||||
dtype cast to dtype
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size
|
||||
|
||||
"""
|
||||
op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
|
||||
elif output_t.shape.size != op.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.o_shape.size}')
|
||||
|
||||
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _PadOp:
|
||||
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
|
||||
_allow_modes = ['constant']
|
||||
if mode not in _allow_modes:
|
||||
raise ValueError(f'Allowed pads modes: {_allow_modes}')
|
||||
|
||||
if mode == 'constant':
|
||||
if not HType.is_scalar_type(constant_value):
|
||||
raise ValueError('constan_value must be scalar')
|
||||
|
||||
info = PadInfo(i_shape, axes_paddings)
|
||||
|
||||
self.o_shape = o_shape = info.o_shape
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype)}
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
|
||||
|
||||
if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})' for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
|
||||
O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
|
||||
else
|
||||
O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
|
||||
//O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX_MOD({ ','.join(f' I{i} + ( (o{i}-{l_pad})*( ((o{i}-{l_pad})/I{i}) % 2 == 0 ? 1: -1) ) % I{i} ' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
|
||||
}}""")
|
||||
#print(self.forward_krn)
|
214
xlib/avecl/_internal/op/reduce.py
Normal file
214
xlib/avecl/_internal/op/reduce.py
Normal file
|
@ -0,0 +1,214 @@
|
|||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..AAxes import AAxes
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import ReductionInfo, TransposeInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
from .slice_ import slice_
|
||||
from .transpose import transpose
|
||||
from .any_wise import square, sqrt
|
||||
|
||||
|
||||
def reduce_mean (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
Reduce mean operator.
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
return reduce_op ('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
|
||||
|
||||
def reduce_std(input_t, axes=None, keepdims=False):
|
||||
"""
|
||||
Reduce std operator.
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
return sqrt(reduce_variance(input_t, axes, keepdims))
|
||||
|
||||
|
||||
def reduce_variance(input_t, axes=None, keepdims=False):
|
||||
"""
|
||||
Reduce variance operator.
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
mean = reduce_mean(input_t, axes, keepdims=True)
|
||||
return reduce_mean(square(input_t - mean), axes, keepdims)
|
||||
|
||||
def moments(input_t, axes=None, keepdims=False):
|
||||
"""
|
||||
Returns (mean, variance) of input_t
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
mean = reduce_mean(input_t, axes, keepdims)
|
||||
mean_shape_keepdims = mean._op.info.o_shape_kd
|
||||
var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
|
||||
return mean, var
|
||||
|
||||
def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
Reduce min operator.
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
return reduce_op ('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
|
||||
|
||||
def reduce_max (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
Reduce max operator.
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
return reduce_op ('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
|
||||
|
||||
def reduce_sum (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
Reduce sum operator.
|
||||
|
||||
input_t Tensor
|
||||
|
||||
axes(None) int
|
||||
Iterable of ints.
|
||||
None - all axes
|
||||
|
||||
keepdims(False) keep reduced axes
|
||||
"""
|
||||
return reduce_op ('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
|
||||
|
||||
def reduce_op (op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False):
|
||||
"""
|
||||
arguments
|
||||
|
||||
op_type 'sum' 'mean' 'min' 'max'
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape,
|
||||
but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
"""
|
||||
|
||||
op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims)
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor ( op.info.o_shape, input_t.dtype, device=input_t.get_device() )
|
||||
elif output_t.shape.size != op.info.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
|
||||
|
||||
# Make an intermediate tensor
|
||||
input_t_inter = transpose(input_t, op.intermediate_transpose_axes)
|
||||
|
||||
# Perform multistage inplace operation in intermediate tensor
|
||||
for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)):
|
||||
input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS),
|
||||
global_shape=shape)
|
||||
|
||||
if op_type == 'mean':
|
||||
# divide values in ROWS by number of COLS
|
||||
input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) )
|
||||
|
||||
# Fetch final tensor from zero indexes using slices argument
|
||||
slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output)
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _ReduceOp:
|
||||
def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False):
|
||||
self.op_type = op_type
|
||||
self.info = info = ReductionInfo(i_shape, axes, keepdims)
|
||||
|
||||
# Determine transpose order for intermediate tensor, where reduction axes will be at the end
|
||||
self.intermediate_transpose_axes = info.o_axes + info.reduction_axes
|
||||
self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape
|
||||
|
||||
# slices argument to fetch processed tensor from zero indexes
|
||||
self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim
|
||||
|
||||
# COLS are reduction axes, ROWS are remaining axes
|
||||
rows_ndim = info.o_axes.ndim
|
||||
self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size
|
||||
self.COLS = COLS = self.intermediate_shape[rows_ndim:].size
|
||||
|
||||
# Number of stages to operate COLS
|
||||
n_stages = (COLS-1).bit_length()
|
||||
self.forward_krn_shapes = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ]
|
||||
self.forward_krn_stage_cols = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
|
||||
self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage ) ) for stage in range(n_stages) ]
|
||||
|
||||
self.forward_krn = Kernel(f"""
|
||||
{HKernel.define_tensor('I', (1,), i_dtype)}
|
||||
|
||||
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
size_t col = gid % STAGE_COLS;
|
||||
size_t row = gid / STAGE_COLS;
|
||||
size_t i_idx = row*COLS + col;
|
||||
|
||||
size_t other_col = col + STAGE_COLS;
|
||||
if (other_col < STAGE_VALID_COLS)
|
||||
{{
|
||||
I_TYPE val_a = I_GLOBAL_LOAD(i_idx);
|
||||
I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col);
|
||||
|
||||
{'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else
|
||||
'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else
|
||||
'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else ''
|
||||
}
|
||||
I_GLOBAL_STORE(i_idx, val_x);
|
||||
}}
|
||||
}}
|
||||
""")
|
||||
self.mean_div_forward_krn = Kernel(f"""
|
||||
{HKernel.define_tensor('I', (1,), i_dtype)}
|
||||
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS)
|
||||
{{
|
||||
size_t row = get_global_id(0);
|
||||
I_GLOBAL_STORE(row*COLS, I_GLOBAL_LOAD(row*COLS) / COLS );
|
||||
}}
|
||||
""")
|
103
xlib/avecl/_internal/op/remap.py
Normal file
103
xlib/avecl/_internal/op/remap.py
Normal file
|
@ -0,0 +1,103 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import BroadcastInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
def remap (input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor:
|
||||
"""
|
||||
remap input_t in spatial axes using coords_t
|
||||
|
||||
arguments
|
||||
|
||||
input_t Tensor( ...,IH,IW )
|
||||
|
||||
coords_t Tensor( ...,OH,OW,D )
|
||||
OH - output height
|
||||
OW - output width
|
||||
D is (2)[x,y] coords
|
||||
|
||||
dtype
|
||||
|
||||
...-head part of shapes will be broadcasted to each other
|
||||
"""
|
||||
|
||||
op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype)
|
||||
|
||||
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
|
||||
|
||||
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer())
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _RemapOp():
|
||||
def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype):
|
||||
if np.dtype(i_dtype).type == np.bool_:
|
||||
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
|
||||
if np.dtype(c_dtype).type == np.bool_:
|
||||
raise ValueError('np.bool_ dtype of c_dtype is not supported.')
|
||||
if i_shape.ndim < 2:
|
||||
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
|
||||
if c_shape.ndim < 3:
|
||||
raise ValueError(f'Coords shape ndim must be >= 3(...,H,W,D)')
|
||||
if c_shape[-1] != 2:
|
||||
raise ValueError('Last coords dim must be == 2 (x,y)')
|
||||
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
if i_shape.ndim == 2 and c_shape.ndim == 3:
|
||||
# nothing to broadcast
|
||||
|
||||
i_br_shape = i_shape
|
||||
c_br_shape = c_shape
|
||||
|
||||
o_shape = c_shape[-3:-1]
|
||||
else:
|
||||
op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ])
|
||||
|
||||
i_br_shape = op.br_shapes[0] + i_shape[-2:]
|
||||
c_br_shape = op.br_shapes[1] + c_shape[-3:]
|
||||
|
||||
o_shape = op.o_shape + c_shape[-3:-1]
|
||||
|
||||
self.o_shape = o_shape
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
|
||||
{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
|
||||
|
||||
C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)}));
|
||||
|
||||
float cx01 = (float) c_value.x;
|
||||
float cy01 = (float) c_value.y;
|
||||
|
||||
float cx0f = floor(cx01); int cx0 = (int)cx0f;
|
||||
float cy0f = floor(cy01); int cy0 = (int)cy0f;
|
||||
float cx1f = cx0f+1; int cx1 = (int)cx1f;
|
||||
float cy1f = cy0f+1; int cy1 = (int)cy1f;
|
||||
|
||||
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
|
||||
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
|
||||
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
|
||||
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
|
||||
|
||||
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
|
||||
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
|
||||
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
|
||||
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
|
||||
|
||||
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
|
||||
}}
|
||||
""")
|
96
xlib/avecl/_internal/op/remap_np_affine.py
Normal file
96
xlib/avecl/_internal/op/remap_np_affine.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor:
|
||||
"""
|
||||
remap affine operator for all channels using single numpy affine mat
|
||||
|
||||
arguments
|
||||
|
||||
input_t Tensor (...,H,W)
|
||||
|
||||
affine_n np.array (2,3)
|
||||
|
||||
dtype
|
||||
"""
|
||||
if affine_n.shape != (2,3):
|
||||
raise ValueError('affine_n.shape must be (2,3)')
|
||||
|
||||
op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)
|
||||
|
||||
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
|
||||
|
||||
((a, b, c),
|
||||
(d, e, f)) = affine_n
|
||||
if not inverse:
|
||||
# do inverse by default, match cv2.warpAffine behaviour
|
||||
D = a*e - b*d
|
||||
D = 1.0 / D if D != 0.0 else 0.0
|
||||
a, b, c, d, e, f = ( e*D, -b*D, (b*f-e*c)*D ,
|
||||
-d*D, a*D, (d*c-a*f)*D )
|
||||
|
||||
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
|
||||
np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _RemapAffineOp():
|
||||
def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
|
||||
if np.dtype(i_dtype).type == np.bool_:
|
||||
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
|
||||
if i_shape.ndim < 2:
|
||||
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
|
||||
|
||||
IH,IW = i_shape[-2:]
|
||||
if o_size is not None:
|
||||
OH,OW = o_size
|
||||
else:
|
||||
OH,OW = IH,IW
|
||||
|
||||
o_shape = AShape( (OH,OW) )
|
||||
if i_shape.ndim > 2:
|
||||
o_shape = i_shape[:-2] + o_shape
|
||||
|
||||
self.o_shape = o_shape
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
|
||||
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype)}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
|
||||
float a, float b, float c,
|
||||
float d, float e, float f)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
|
||||
|
||||
float cx01 = om1*a + om2*b + c;
|
||||
float cy01 = om1*d + om2*e + f;
|
||||
|
||||
float cx0f = floor(cx01); int cx0 = (int)cx0f;
|
||||
float cy0f = floor(cy01); int cy0 = (int)cy0f;
|
||||
float cx1f = cx0f+1; int cx1 = (int)cx1f;
|
||||
float cy1f = cy0f+1; int cy1 = (int)cy1f;
|
||||
|
||||
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
|
||||
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
|
||||
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
|
||||
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
|
||||
|
||||
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
|
||||
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
|
||||
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
|
||||
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
|
||||
|
||||
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
|
||||
}}
|
||||
""")
|
26
xlib/avecl/_internal/op/reshape.py
Normal file
26
xlib/avecl/_internal/op/reshape.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
from typing import Iterable
|
||||
|
||||
from ..Tensor import Tensor
|
||||
from ..SCacheton import SCacheton
|
||||
from ..info import ReshapeInfo
|
||||
|
||||
|
||||
def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
|
||||
"""
|
||||
reshape operator
|
||||
|
||||
arguments
|
||||
|
||||
new_shape Iterable of ints
|
||||
|
||||
copy(True) if True, produces new Tensor
|
||||
otherwise result tensor points to the same memory
|
||||
|
||||
Produces reference Tensor with new shape.
|
||||
"""
|
||||
info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )
|
||||
|
||||
if copy:
|
||||
return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
|
||||
return input_t.as_shape( info.o_shape )
|
||||
|
67
xlib/avecl/_internal/op/slice_.py
Normal file
67
xlib/avecl/_internal/op/slice_.py
Normal file
|
@ -0,0 +1,67 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..HType import HType
|
||||
from ..info import SliceInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
arguments:
|
||||
|
||||
input_t input tensor
|
||||
slices argument received from class.__getitem__(slices)
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
|
||||
Remark.
|
||||
|
||||
Slicing logic is not the same as numpy:
|
||||
For example np[2:0:1] slice will produce invalid array with zero index,
|
||||
but nn.slice() will select 2 index, same as val_t[2].
|
||||
"""
|
||||
op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output )
|
||||
o_shape = op.slice_info.o_shape
|
||||
|
||||
if output_t is None:
|
||||
if op.slice_info.just_reshaped:
|
||||
return input_t.reshape(o_shape)
|
||||
else:
|
||||
output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device())
|
||||
|
||||
elif output_t.shape.size != o_shape.size:
|
||||
raise ValueError(f'output_t must have size {o_shape.size}')
|
||||
|
||||
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _SliceOp:
|
||||
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output):
|
||||
self.slice_info = slice_info = SliceInfo(i_shape, slices)
|
||||
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype )}
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)}
|
||||
|
||||
{chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes) ) }
|
||||
|
||||
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) );
|
||||
}}
|
||||
""")
|
73
xlib/avecl/_internal/op/slice_set.py
Normal file
73
xlib/avecl/_internal/op/slice_set.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..HType import HType
|
||||
from ..info import BroadcastInfo, SliceInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
|
||||
def slice_set(input_t : Tensor, slices, value) -> Tensor:
|
||||
"""
|
||||
arguments:
|
||||
|
||||
input_t input tensor
|
||||
slices argument received from class.__getitem__(slices)
|
||||
value
|
||||
|
||||
|
||||
Remark.
|
||||
|
||||
"""
|
||||
if HType.is_scalar_type(value):
|
||||
v_shape = None
|
||||
v_dtype = None
|
||||
v_scalar = value
|
||||
elif not isinstance(value, Tensor):
|
||||
value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device())
|
||||
v_shape = value.shape
|
||||
v_dtype = value.dtype
|
||||
v_scalar = None
|
||||
|
||||
op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) )
|
||||
|
||||
if v_scalar is not None:
|
||||
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() )
|
||||
else:
|
||||
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() )
|
||||
|
||||
return input_t
|
||||
|
||||
class _SliceSetOp:
|
||||
def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices):
|
||||
slice_info = SliceInfo(i_shape, slices)
|
||||
|
||||
if v_scalar is None:
|
||||
if v_shape.ndim > i_shape.ndim:
|
||||
raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}')
|
||||
|
||||
# Check that v_shape can broadcast with slice_info.shape
|
||||
br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape])
|
||||
|
||||
v_br_shape = br_info.br_shapes[1]
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', i_shape, i_dtype )}
|
||||
|
||||
{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME
|
||||
{', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''})
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)}
|
||||
|
||||
if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] +
|
||||
[f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } )
|
||||
|
||||
O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} );
|
||||
}}
|
||||
""")
|
74
xlib/avecl/_internal/op/stack.py
Normal file
74
xlib/avecl/_internal/op/stack.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
import numpy as np
|
||||
from typing import List
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HArgs import HArgs
|
||||
from ..HKernel import HKernel
|
||||
from ..HType import HType
|
||||
from ..info import StackInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False):
|
||||
"""
|
||||
Stack operator.
|
||||
|
||||
arguments:
|
||||
|
||||
tensor_list List of Tensors
|
||||
|
||||
axis Int
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
"""
|
||||
HArgs.check_zero_get_length(tensor_list)
|
||||
HArgs.check_all_tensors(tensor_list)
|
||||
|
||||
device = HArgs.check_get_same_device(tensor_list)
|
||||
|
||||
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
|
||||
|
||||
op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output)
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
|
||||
elif output_t.shape.size != op.info.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
|
||||
|
||||
for i, krn in enumerate(op.forward_krns):
|
||||
device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) )
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _StackOp:
|
||||
def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output):
|
||||
self.stack_count = stack_count = len(shape_list)
|
||||
|
||||
i_shape = shape_list[0]
|
||||
if not all (s == i_shape for s in shape_list):
|
||||
raise ValueError('All shapes must be the same')
|
||||
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
|
||||
self.info = info = StackInfo(i_shape, axis, stack_count)
|
||||
|
||||
self.forward_krns = forward_krns = []
|
||||
for i_dtype in dtype_list:
|
||||
forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype )}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)}
|
||||
|
||||
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) );
|
||||
}}
|
||||
"""))
|
58
xlib/avecl/_internal/op/tile.py
Normal file
58
xlib/avecl/_internal/op/tile.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import numpy as np
|
||||
from typing import List
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import TileInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False):
|
||||
"""
|
||||
Tile operator
|
||||
|
||||
arguments
|
||||
|
||||
tiles Iterable of ints
|
||||
|
||||
dtype
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size.
|
||||
gradfn will not be set.
|
||||
|
||||
is_add_to_output add result to output_t if output_t is set.
|
||||
"""
|
||||
|
||||
op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output)
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device())
|
||||
elif output_t.shape.size != op.info.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
|
||||
|
||||
input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer())
|
||||
|
||||
return output_t
|
||||
|
||||
class _TileOp:
|
||||
def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output):
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
self.info = info = TileInfo(i_shape, tiles)
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f"""
|
||||
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype)}
|
||||
{HKernel.define_tensor('O', info.o_shape, o_dtype)}
|
||||
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
size_t gid = get_global_id(0);
|
||||
{HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)}
|
||||
|
||||
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) );
|
||||
}}
|
||||
""")
|
||||
|
68
xlib/avecl/_internal/op/transpose.py
Normal file
68
xlib/avecl/_internal/op/transpose.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AAxes import AAxes
|
||||
from ..AShape import AShape
|
||||
from ..backend import Kernel
|
||||
from ..HKernel import HKernel
|
||||
from ..info import TransposeInfo
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
|
||||
def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor:
|
||||
"""
|
||||
arguments:
|
||||
|
||||
axes_order Int
|
||||
Iterable of ints
|
||||
None
|
||||
|
||||
dtype cast to dtype
|
||||
|
||||
op_text(None) optional op with value during transpose.
|
||||
'O = I'
|
||||
|
||||
output_t compute result to this Tensor.
|
||||
Tensor may be with different shape, but should match total size
|
||||
|
||||
"""
|
||||
op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output )
|
||||
|
||||
if output_t is None:
|
||||
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
|
||||
elif output_t.shape.size != op.o_shape.size:
|
||||
raise ValueError(f'output_t must have size {op.o_shape.size}')
|
||||
|
||||
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
|
||||
|
||||
return output_t
|
||||
|
||||
|
||||
class _TransposeOp:
|
||||
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ):
|
||||
self.axes_order = axes_order
|
||||
self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape
|
||||
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
|
||||
|
||||
if op_text is None:
|
||||
op_text = 'O = I'
|
||||
|
||||
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
|
||||
{HKernel.define_tensor('O', o_shape, o_dtype)}
|
||||
{HKernel.define_tensor('I', i_shape, i_dtype)}
|
||||
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
|
||||
{{
|
||||
|
||||
size_t gid = get_global_id(0);
|
||||
|
||||
{HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)}
|
||||
|
||||
I_TYPE I = I_GLOBAL_LOAD(gid);
|
||||
O_TYPE O;
|
||||
|
||||
{op_text};
|
||||
|
||||
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O );
|
||||
|
||||
}}""")
|
||||
|
||||
|
BIN
xlib/avecl/_internal/op/unused.zip
Normal file
BIN
xlib/avecl/_internal/op/unused.zip
Normal file
Binary file not shown.
72
xlib/avecl/_internal/op/warp_affine.py
Normal file
72
xlib/avecl/_internal/op/warp_affine.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
import numpy as np
|
||||
|
||||
from ..AShape import AShape
|
||||
from ..initializer import InitCoords2DArange
|
||||
from ..SCacheton import SCacheton
|
||||
from ..Tensor import Tensor
|
||||
from .matmul import matmul
|
||||
from .remap import remap
|
||||
|
||||
def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor:
|
||||
"""
|
||||
arguments
|
||||
|
||||
input_t Tensor(...,H,W)
|
||||
|
||||
affine_t Tensor(...,2,3)
|
||||
affine matrix
|
||||
|
||||
example of identity affine matrix
|
||||
[1,0,0],
|
||||
[0,1,0]
|
||||
|
||||
...-head part of shapes will be broadcasted to each other
|
||||
|
||||
output_size(None)
|
||||
|
||||
tuple of 2 ints (HW)
|
||||
of output size
|
||||
if None , size will not be changed
|
||||
|
||||
"""
|
||||
op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype, affine_t.shape, affine_t.dtype, output_size)
|
||||
|
||||
affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) )
|
||||
|
||||
coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init )
|
||||
coords_t = coords_t.reshape(op.coords_reshape)
|
||||
coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape)
|
||||
|
||||
output_t = remap(input_t, coords_t, dtype=dtype)
|
||||
return output_t
|
||||
|
||||
|
||||
class _WarpAffineOp():
|
||||
def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size):
|
||||
if np.dtype(i_dtype).type == np.bool_:
|
||||
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
|
||||
if np.dtype(a_dtype).type == np.bool_:
|
||||
raise ValueError('np.bool_ dtype of a_dtype is not supported.')
|
||||
if i_shape.ndim < 2:
|
||||
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
|
||||
if a_shape.ndim < 2:
|
||||
raise ValueError(f'a_shape.ndim must be >= 2 (...,2,3)')
|
||||
if a_shape[-2] != 2 or a_shape[-1] != 3:
|
||||
raise ValueError('Last a_shape dims must be == (...,2,3)')
|
||||
|
||||
IH,IW = i_shape[-2:]
|
||||
if o_size is not None:
|
||||
OH,OW = o_size
|
||||
else:
|
||||
OH,OW = IH,IW
|
||||
|
||||
self.coords_shape = AShape( (OH,OW,3) )
|
||||
self.coords_affined_shape = AShape( (OH,OW,2) )
|
||||
|
||||
if a_shape.ndim > 2:
|
||||
self.coords_shape = a_shape[:-2] + self.coords_shape
|
||||
self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape
|
||||
|
||||
self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1)
|
||||
self.coords_reshape = (-1,OH*OW,3)
|
||||
self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)
|
Loading…
Add table
Add a link
Reference in a new issue