add xlib.avecl

iperov 2021-09-30 18:21:30 +04:00
commit 0058474da7
56 changed files with 5569 additions and 0 deletions

View file

@@ -0,0 +1,21 @@
from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .binary_morph import binary_morph
from .cast import cast
from .concat import concat
from .depthwise_conv2D import depthwise_conv2D
from .gaussian_blur import gaussian_blur
from .matmul import matmul, matmulc
from .pad import pad
from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
reduce_sum, reduce_variance)
from .remap import remap
from .remap_np_affine import remap_np_affine
from .reshape import reshape
from .slice_ import slice_
from .slice_set import slice_set
from .stack import stack
from .tile import tile
from .transpose import transpose
from .warp_affine import warp_affine

View file

@@ -0,0 +1,111 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def any_wise(op_text : str,
*args,
dtype : np.dtype = None,
output_t:Tensor=None) -> Tensor:
"""
operator for N-wise ops with N inputs
arguments
op_text example: O=(2*I0*I1)+I2
*args List[ Tensor | number ]
dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
HArgs.check_zero_get_length(args)
tensor_args = HArgs.filter_tensor(args, raise_on_empty=True)
device = HArgs.check_get_same_device(tensor_args)
shape_list, dtype_list, krn_args = HArgs.decompose(args)
op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
if output_t is None:
output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args)
return output_t
class _AnyWiseOp:
def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
if len(shape_list) != len(dtype_list):
raise ValueError('len(shape_list) != len(dtype_list)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
if len(shape_list) == 1:
# element-wise.
i_shape, i_dtype = shape_list[0], dtype_list[0]
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('IN', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
{{
size_t gid = get_global_id(0);
O_TYPE O = O_GLOBAL_LOAD(gid);
IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
else:
# Multi arg.
self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
self.o_shape = o_shape = info.o_shape
defs, arg_defs, impls = [], [], []
for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
t_name = f'I{i}'
if t_shape is not None:
defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
else:
arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{defs}
{HKernel.define_tensor('O', o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
{impls}
O_TYPE O;
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t)
def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t)
def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t)
def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t)
def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def square(a_t : Tensor) -> Tensor: return any_wise('O=I0*I0', a_t)
def sqrt(a_t : Tensor) -> Tensor: return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t)
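As an illustration of the op_text semantics (not the library code), here is what an expression such as 'O=(2*I0*I1)+I2' computes with NumPy broadcasting over a mix of tensor and scalar arguments:

import numpy as np

# Reference sketch of the any_wise semantics using NumPy broadcasting (illustration only).
I0 = np.arange(6, dtype=np.float32).reshape(2, 3)   # tensor argument
I1 = np.float32(0.5)                                # scalar argument
I2 = np.arange(3, dtype=np.float32)                 # broadcastable (3,) argument
O = (2 * I0 * I1) + I2                              # same formula as op_text 'O=(2*I0*I1)+I2'
print(O.shape)                                      # (2, 3) -- the broadcast output shape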

View file

@@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def binary_dilate_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
"""
Binary dilate operator using circle kernel with radius.
input_t Tensor (...,H,W)
For each element of H,W: set 1 if any neighbor element inside the circle of the given radius is != 0,
otherwise set 0.
"""
op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)
device = input_t.get_device()
if radius <= 0 or iterations <= 0:
return input_t.copy()
else:
for i in range(iterations):
if i == 0:
buf_in = input_t
else:
buf_in, buf_out = buf_out, buf_in
if i <= 1:
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
return buf_out
class _BinaryDilateOp():
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
KS = radius*2+1
IH,IW = i_shape[-2:]
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define RADIUS {radius}
#define KS {KS}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
{'#pragma unroll' if KS <= 16 else ''}
for (int kh=0; kh<KS; ++kh)
{'#pragma unroll' if KS <= 16 else ''}
for (int kw=0; kw<KS; ++kw)
{{
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
{{
int im2 = -PADT + kh + om2;
int im1 = -PADL + kw + om1;
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
: 0;
if (i_val != (I_TYPE)0)
{{
O_GLOBAL_STORE(gid, (O_TYPE) 1);
return;
}}
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
}}
""")

View file

@@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def binary_erode_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
"""
Binary erode operator using circle kernel with radius.
input_t Tensor (...,H,W)
For each element of H,W: set 1 if all neighbor elements inside the circle of the given radius are != 0,
otherwise set 0.
"""
op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype)
device = input_t.get_device()
if radius <= 0 or iterations <= 0:
return input_t.copy()
else:
for i in range(iterations):
if i == 0:
buf_in = input_t
else:
buf_in, buf_out = buf_out, buf_in
if i <= 1:
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
return buf_out
class _BinaryErodeOp():
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
KS = radius*2+1
IH,IW = i_shape[-2:]
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define RADIUS {radius}
#define KS {KS}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
{'#pragma unroll' if KS <= 16 else ''}
for (int kh=0; kh<KS; ++kh)
{'#pragma unroll' if KS <= 16 else ''}
for (int kw=0; kw<KS; ++kw)
{{
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
{{
int im2 = -PADT + kh + om2;
int im1 = -PADL + kw + om1;
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
: 0;
if (i_val == (I_TYPE)0)
{{
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
return;
}}
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) 1 );
}}
""")

View file

@@ -0,0 +1,43 @@
from ..Tensor import Tensor
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .gaussian_blur import gaussian_blur
from .pad import pad
def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor:
"""
Apply optional binary erode/dilate and optional blur.
input_t (...,H,W) tensor. Non zero values will be treated as 1.
erode_dilate int, amount of pixels to erode (>0) or dilate (<0)
blur float >= 0 amount of pixels to blur
fade_to_border(False) zero out a border band so the result
fades smoothly to the border with the specified blur amount
"""
x = input_t
H,W = input_t.shape[-2:]
x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)
if erode_dilate > 0:
x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
elif erode_dilate < 0:
x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )
if fade_to_border:
h_clip_size = H + blur // 2
w_clip_size = W + blur // 2
x[...,:h_clip_size,:] = 0
x[...,-h_clip_size:,:] = 0
x[...,:,:w_clip_size] = 0
x[...,:,-w_clip_size:] = 0
if blur > 0:
x = gaussian_blur(x, blur * 0.250, dtype=dtype)
return x[...,H:-H,W:-W]
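A rough NumPy/SciPy sketch of the same pipeline for a single 2D mask (the erode/dilate step is omitted; the point is the pad -> optional border clipping -> blur -> crop order, assuming SciPy's gaussian_filter as a stand-in for gaussian_blur):

import numpy as np
from scipy.ndimage import gaussian_filter   # stand-in for gaussian_blur in this sketch

def binary_morph_np(mask, blur, fade_to_border=False):
    # mask: 2D array; nonzero treated as 1. Reference sketch only, not the library code.
    H, W = mask.shape
    x = np.pad((mask != 0).astype(np.float32), ((H, H), (W, W)))   # pad by a full image size
    if fade_to_border:
        h_clip = int(H + blur // 2)
        w_clip = int(W + blur // 2)
        x[:h_clip, :] = 0;  x[-h_clip:, :] = 0
        x[:, :w_clip] = 0;  x[:, -w_clip:] = 0
    if blur > 0:
        x = gaussian_filter(x, blur * 0.250)
    return x[H:-H, W:-W]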

View file

@@ -0,0 +1,17 @@
from ..Tensor import Tensor
from .any_wise import any_wise
def cast(input_t : Tensor, dtype, output_t:Tensor=None) -> Tensor:
"""
cast operator
arguments
input_t
dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)

View file

@@ -0,0 +1,70 @@
from ..backend import Kernel
from ..HArgs import HArgs
from ..HType import HType
from ..HKernel import HKernel
from ..info import ConcatInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments
tensor_list Iterable
axis Int
dtype np.dtype
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
tensor_list = tuple(tensor_list)
HArgs.check_zero_get_length(tensor_list)
HArgs.check_all_tensors(tensor_list)
device = HArgs.check_get_same_device(tensor_list)
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
for forward_krn,t in zip(op.forward_krns,tensor_list):
device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) )
return output_t
class _ConcatOp:
def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = ConcatInfo(shape_list, axis)
self.forward_krns = forward_krns = []
for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)):
forward_krn = Kernel(f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', shape, dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)}
i{info.axis} += {info.axis_offsets[i]};
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) );
}}
""")
forward_krns.append(forward_krn)

View file

@@ -0,0 +1,107 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo, Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def depthwise_conv2D (input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None):
"""
Depthwise Conv2D operator.
input_t Tensor (...,H,W)
kernel_t Tensor (...,H,W)
stride(1) int
dilation(1) int
padding(same) 'valid' no padding
'same' output size will be the same as the input
(or the input size divided by stride)
int padding value for all sides
Iterable of 4 ints
paddings for left,top,right,bottom sides
...-head part of shapes will be broadcasted to each other
"""
op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())
return output_t
class _DepthwiseConv2DOp():
def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
if k_shape.ndim < 2:
raise ValueError(f'k_shape.ndim must be >= 2')
IH,IW = i_shape[-2:]
KH,KW = k_shape[-2:]
ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)
if i_shape.ndim == 2 and k_shape.ndim == 2:
# nothing to broadcast
i_br_shape = i_shape
k_br_shape = k_shape
o_shape = AShape([ci.OH, ci.OW])
else:
op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])
i_br_shape = op.br_shapes[0] + i_shape[-2:]
k_br_shape = op.br_shapes[1] + k_shape[-2:]
o_shape = op.o_shape + [ci.OH, ci.OW]
self.o_shape = o_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('K', k_br_shape, k_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define STRIDE {stride}
#define DILATION {dilation}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
float v = 0.0;
{'#pragma unroll' if KH <= 9 else ''}
for (int km2=0; km2<Km2; ++km2)
{{
int im2 = -PADT + km2*DILATION + om2*STRIDE;
if (im2 >= 0 & im2 < Im2)
{'#pragma unroll' if KW <= 9 else ''}
for (int km1=0; km1<Km1; ++km1)
{{
int im1 = -PADL + km1*DILATION + om1*STRIDE;
if (im1 >= 0 & im1 < Im1)
v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
*K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) v);
}}
""")

View file

@@ -0,0 +1,39 @@
import numpy as np
from ..Tensor import Tensor
from .depthwise_conv2D import depthwise_conv2D
def gaussian_blur (input_t : Tensor, sigma, dtype=None) -> Tensor:
"""
arguments
input_t Tensor(...,H,W)
sigma float
"""
if sigma <= 0.0:
return input_t.copy() #TODO
device = input_t.get_device()
key = (gaussian_blur, sigma)
kernel_t = device.get_cached_data(key)
if kernel_t is None:
kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
device.set_cached_data(key, kernel_t)
output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
return output_t
def _make_gaussian_kernel(sigma : float, dtype):
kernel_size = max(3, int(2 * 2 * sigma))
if kernel_size % 2 == 0:
kernel_size += 1
mean = np.floor(0.5 * kernel_size)
kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)])
np_kernel = np.outer(kernel_1d, kernel_1d)
kernel = np_kernel / np.sum(np_kernel)
return kernel.astype(dtype)
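The kernel construction can be checked on its own; a standalone copy of the same math:

import numpy as np

def make_gaussian_kernel(sigma, dtype=np.float32):
    # Same construction as _make_gaussian_kernel above: odd size ~ 4*sigma, separable outer product.
    kernel_size = max(3, int(2 * 2 * sigma))
    if kernel_size % 2 == 0:
        kernel_size += 1
    mean = np.floor(0.5 * kernel_size)
    kernel_1d = np.array([np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2))
                          for x in range(kernel_size)])
    kernel = np.outer(kernel_1d, kernel_1d)
    return (kernel / kernel.sum()).astype(dtype)

k = make_gaussian_kernel(2.0)
print(k.shape, k.sum())   # (9, 9) and ~1.0 -- a normalized 2D Gaussian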

View file

@@ -0,0 +1,158 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def matmul(a_t : Tensor, b_t : Tensor, output_t: Tensor=None, is_add_to_output=False) -> Tensor:
"""
matmul operator in row-major format
A(...,M,K) x
B(...,K,N) =
(...,M,N)
arguments
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output)
def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
"""
matmul operator in col-major format
A(...,K,M) x
B(...,N,K) =
(...,N,M)
arguments
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
device = HArgs.check_get_same_device([a_t, b_t])
op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=device )
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer(), )
return output_t
class _MatmulOp:
def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output):
a_dtype = np.dtype(a_dtype).type
b_dtype = np.dtype(b_dtype).type
if a_dtype != np.float32 or b_dtype != np.float32:
raise ValueError('matmul works only with float32 tensors.')
if a_shape.ndim != b_shape.ndim:
raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}')
ndim = a_shape.ndim
if ndim < 2:
raise ValueError('Tensors ndim must be at least 2.')
K, M = a_shape[-2], a_shape[-1]
N, B_COLS = b_shape[-2], b_shape[-1]
if K != B_COLS:
raise ValueError('A_ROWS != B_COLS')
BATCH = a_shape[0:-2].size
B_BATCH = b_shape[0:-2].size
if BATCH != B_BATCH:
raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}')
if ndim == 2:
self.o_shape = AShape( (N, M) )
else:
self.o_shape = AShape( a_shape[:-2]+(N, M) )
self.o_dtype = np.float32
self.M = M
self.N = N
self.K = K
# Determining optimal tile widths
for MW in [8,4,2,1]:
if M % MW == 0:
break
for KW in [8,4,2,1]:
if N % KW == 0 and K % KW == 0:
break
NW = KW
self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f"""
#define K {K}
#define N {N}
#define MW {MW} // M tile Width
#define NW {NW} // N tile Width -- NW & KW should be the same !
#define KW {KW} // K tile Width
#define MT {M//MW} // MT is max for 'mt' (M tile count)
#define KT {K//KW} // KT is max for 'kt' (K tile count)
#define floatMW { f'float{MW}' if MW != 1 else 'float'}
#define floatKW { f'float{KW}' if KW != 1 else 'float'}
__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
{{
size_t mt = get_global_id(0); //global M-tile id
size_t nc = get_global_id(1); //global N-tile id
size_t batch = get_global_id(2);
float AT[KW][MW]; // sub tiles
float BT[NW][KW];
float CT[NW][MW];
#pragma unroll
for (uint i=0; i<NW*MW; ++i) // zero CT tile
((float*) CT)[i] = 0.0;
for (uint kt=0; kt<KT; ++kt) // iterate over K-dim tiles
{{
#pragma unroll
for (uint k=0; k<KW; ++k) // every k-element inside K-dim tile
*( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats
#pragma unroll
for (uint n=0; n<NW; ++n) // every n-element inside N-dim tile
*( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats
#pragma unroll
for (uint k=0; k<KW; ++k)
#pragma unroll
for (uint n=0; n<NW; ++n) // sub tiles multiplication
#pragma unroll
for (uint m=0; m<MW; ++m)
CT[n][m] += AT[k][m] * BT[n][k];
}}
#pragma unroll
for (uint n=0; n<NW; ++n)
O[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='}
*( (floatMW*) CT[n]);
}}""")

View file

@@ -0,0 +1,71 @@
from typing import List
import numpy as np
from ..HType import HType
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import PadInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def pad(input_t : Tensor, axes_paddings : List, mode : str = 'constant', constant_value=0, dtype : np.dtype = None, output_t : Tensor=None) -> Tensor:
"""
arguments:
axes_paddings list of (l_pad, r_pad),
if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for the remaining axes
if [-1] == ... , same for the right side
dtype cast to dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _PadOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
_allow_modes = ['constant']
if mode not in _allow_modes:
raise ValueError(f'Allowed pads modes: {_allow_modes}')
if mode == 'constant':
if not HType.is_scalar_type(constant_value):
raise ValueError('constant_value must be scalar')
info = PadInfo(i_shape, axes_paddings)
self.o_shape = o_shape = info.o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})' for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
else
O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
//O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX_MOD({ ','.join(f' I{i} + ( (o{i}-{l_pad})*( ((o{i}-{l_pad})/I{i}) % 2 == 0 ? 1: -1) ) % I{i} ' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
}}""")
#print(self.forward_krn)
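The Ellipsis convention in axes_paddings, illustrated with NumPy (the expansion below mirrors the docstring's description; PadInfo in the library does the real work):

import numpy as np

def expand_paddings(axes_paddings, ndim):
    # Expand a spec like (..., (1,1), (2,2)) into one (before, after) pair per axis for np.pad.
    pads = list(axes_paddings)
    if pads and pads[0] is Ellipsis:
        pads = [(0, 0)] * (ndim - (len(pads) - 1)) + pads[1:]
    elif pads and pads[-1] is Ellipsis:
        pads = pads[:-1] + [(0, 0)] * (ndim - (len(pads) - 1))
    return pads

x = np.ones((2, 3, 4), dtype=np.float32)
print(np.pad(x, expand_paddings((..., (1, 1), (2, 2)), x.ndim)).shape)   # (2, 5, 8)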

View file

@@ -0,0 +1,214 @@
import math
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import ReductionInfo, TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .slice_ import slice_
from .transpose import transpose
from .any_wise import square, sqrt
def reduce_mean (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce mean operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_std(input_t, axes=None, keepdims=False):
"""
Reduce std operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return sqrt(reduce_variance(input_t, axes, keepdims))
def reduce_variance(input_t, axes=None, keepdims=False):
"""
Reduce variance operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims=True)
return reduce_mean(square(input_t - mean), axes, keepdims)
def moments(input_t, axes=None, keepdims=False):
"""
Returns (mean, variance) of input_t
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims)
mean_shape_keepdims = mean._op.info.o_shape_kd
var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
return mean, var
def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce min operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_max (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce max operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_sum (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce sum operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_op (op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False):
"""
arguments
op_type 'sum' 'mean' 'min' 'max'
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims)
if output_t is None:
output_t = Tensor ( op.info.o_shape, input_t.dtype, device=input_t.get_device() )
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
# Make an intermediate tensor
input_t_inter = transpose(input_t, op.intermediate_transpose_axes)
# Perform multistage inplace operation in intermediate tensor
for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)):
input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS),
global_shape=shape)
if op_type == 'mean':
# divide values in ROWS by number of COLS
input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) )
# Fetch final tensor from zero indexes using slices argument
slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output)
return output_t
class _ReduceOp:
def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False):
self.op_type = op_type
self.info = info = ReductionInfo(i_shape, axes, keepdims)
# Determine transpose order for intermediate tensor, where reduction axes will be at the end
self.intermediate_transpose_axes = info.o_axes + info.reduction_axes
self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape
# slices argument to fetch processed tensor from zero indexes
self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim
# COLS are reduction axes, ROWS are remaining axes
rows_ndim = info.o_axes.ndim
self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size
self.COLS = COLS = self.intermediate_shape[rows_ndim:].size
# Number of stages to operate COLS
n_stages = (COLS-1).bit_length()
self.forward_krn_shapes = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ]
self.forward_krn_stage_cols = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage ) ) for stage in range(n_stages) ]
self.forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS)
{{
size_t gid = get_global_id(0);
size_t col = gid % STAGE_COLS;
size_t row = gid / STAGE_COLS;
size_t i_idx = row*COLS + col;
size_t other_col = col + STAGE_COLS;
if (other_col < STAGE_VALID_COLS)
{{
I_TYPE val_a = I_GLOBAL_LOAD(i_idx);
I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col);
{'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else
'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else
'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else ''
}
I_GLOBAL_STORE(i_idx, val_x);
}}
}}
""")
self.mean_div_forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS)
{{
size_t row = get_global_id(0);
I_GLOBAL_STORE(row*COLS, I_GLOBAL_LOAD(row*COLS) / COLS );
}}
""")

View file

@@ -0,0 +1,103 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def remap (input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor:
"""
remap input_t in spatial axes using coords_t
arguments
input_t Tensor( ...,IH,IW )
coords_t Tensor( ...,OH,OW,D )
OH - output height
OW - output width
D is (2)[x,y] coords
dtype
...-head part of shapes will be broadcasted to each other
"""
op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer())
return output_t
class _RemapOp():
def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if np.dtype(c_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of c_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
if c_shape.ndim < 3:
raise ValueError(f'Coords shape ndim must be >= 3(...,H,W,D)')
if c_shape[-1] != 2:
raise ValueError('Last coords dim must be == 2 (x,y)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim == 2 and c_shape.ndim == 3:
# nothing to broadcast
i_br_shape = i_shape
c_br_shape = c_shape
o_shape = c_shape[-3:-1]
else:
op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ])
i_br_shape = op.br_shapes[0] + i_shape[-2:]
c_br_shape = op.br_shapes[1] + c_shape[-3:]
o_shape = op.o_shape + c_shape[-3:-1]
self.o_shape = o_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)}));
float cx01 = (float) c_value.x;
float cy01 = (float) c_value.y;
float cx0f = floor(cx01); int cx0 = (int)cx0f;
float cy0f = floor(cy01); int cy0 = (int)cy0f;
float cx1f = cx0f+1; int cx1 = (int)cx1f;
float cy1f = cy0f+1; int cy1 = (int)cy1f;
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")

View file

@@ -0,0 +1,96 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor:
"""
remap affine operator for all channels using single numpy affine mat
arguments
input_t Tensor (...,H,W)
affine_n np.array (2,3)
dtype
"""
if affine_n.shape != (2,3):
raise ValueError('affine_n.shape must be (2,3)')
op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
((a, b, c),
(d, e, f)) = affine_n
if not inverse:
# do inverse by default, match cv2.warpAffine behaviour
D = a*e - b*d
D = 1.0 / D if D != 0.0 else 0.0
a, b, c, d, e, f = ( e*D, -b*D, (b*f-e*c)*D ,
-d*D, a*D, (d*c-a*f)*D )
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )
return output_t
class _RemapAffineOp():
def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
IH,IW = i_shape[-2:]
if o_size is not None:
OH,OW = o_size
else:
OH,OW = IH,IW
o_shape = AShape( (OH,OW) )
if i_shape.ndim > 2:
o_shape = i_shape[:-2] + o_shape
self.o_shape = o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
float a, float b, float c,
float d, float e, float f)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
float cx01 = om1*a + om2*b + c;
float cy01 = om1*d + om2*e + f;
float cx0f = floor(cx01); int cx0 = (int)cx0f;
float cy0f = floor(cy01); int cy0 = (int)cy0f;
float cx1f = cx0f+1; int cx1 = (int)cx1f;
float cy1f = cy0f+1; int cy1 = (int)cy1f;
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")

View file

@@ -0,0 +1,26 @@
from typing import Iterable
from ..Tensor import Tensor
from ..SCacheton import SCacheton
from ..info import ReshapeInfo
def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
"""
reshape operator
arguments
new_shape Iterable of ints
copy(True) if True, produces new Tensor
otherwise result tensor points to the same memory
Produces reference Tensor with new shape.
"""
info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )
if copy:
return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
return input_t.as_shape( info.o_shape )

View file

@@ -0,0 +1,67 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments:
input_t input tensor
slices argument received from class.__getitem__(slices)
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
Remark.
Slicing logic is not the same as numpy:
For example the numpy slice [2:0:1] produces an empty array,
but this slice_ operator will select index 2, same as val_t[2].
"""
op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output )
o_shape = op.slice_info.o_shape
if output_t is None:
if op.slice_info.just_reshaped:
return input_t.reshape(o_shape)
else:
output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != o_shape.size:
raise ValueError(f'output_t must have size {o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _SliceOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output):
self.slice_info = slice_info = SliceInfo(i_shape, slices)
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f"""
{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)}
{chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes) ) }
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) );
}}
""")

View file

@@ -0,0 +1,73 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo, SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def slice_set(input_t : Tensor, slices, value) -> Tensor:
"""
arguments:
input_t input tensor
slices argument received from class.__getitem__(slices)
value
Remark.
"""
if HType.is_scalar_type(value):
v_shape = None
v_dtype = None
v_scalar = value
elif not isinstance(value, Tensor):
value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device())
v_shape = value.shape
v_dtype = value.dtype
v_scalar = None
op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) )
if v_scalar is not None:
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() )
else:
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() )
return input_t
class _SliceSetOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices):
slice_info = SliceInfo(i_shape, slices)
if v_scalar is None:
if v_shape.ndim > i_shape.ndim:
raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}')
# Check that v_shape can broadcast with slice_info.shape
br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape])
v_br_shape = br_info.br_shapes[1]
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', i_shape, i_dtype )}
{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME
{', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)}
if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] +
[f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } )
O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} );
}}
""")

View file

@@ -0,0 +1,74 @@
import numpy as np
from typing import List
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import StackInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False):
"""
Stack operator.
arguments:
tensor_list List of Tensors
axis Int
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
HArgs.check_zero_get_length(tensor_list)
HArgs.check_all_tensors(tensor_list)
device = HArgs.check_get_same_device(tensor_list)
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
for i, krn in enumerate(op.forward_krns):
device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) )
return output_t
class _StackOp:
def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output):
self.stack_count = stack_count = len(shape_list)
i_shape = shape_list[0]
if not all (s == i_shape for s in shape_list):
raise ValueError('All shapes must be the same')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = StackInfo(i_shape, axis, stack_count)
self.forward_krns = forward_krns = []
for i_dtype in dtype_list:
forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)}
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) );
}}
"""))

View file

@@ -0,0 +1,58 @@
import numpy as np
from typing import List
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TileInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False):
"""
Tile operator
arguments
tiles Iterable of ints
dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer())
return output_t
class _TileOp:
def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.info = info = TileInfo(i_shape, tiles)
self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('I', i_shape, i_dtype)}
{HKernel.define_tensor('O', info.o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)}
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) );
}}
""")

View file

@@ -0,0 +1,68 @@
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor:
"""
arguments:
axes_order Int
Iterable of ints
None
dtype cast to dtype
op_text(None) optional op with value during transpose.
'O = I'
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output )
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _TransposeOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ):
self.axes_order = axes_order
self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if op_text is None:
op_text = 'O = I'
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)}
I_TYPE I = I_GLOBAL_LOAD(gid);
O_TYPE O;
{op_text};
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O );
}}""")

Binary file not shown.

View file

@@ -0,0 +1,72 @@
import numpy as np
from ..AShape import AShape
from ..initializer import InitCoords2DArange
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .matmul import matmul
from .remap import remap
def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor:
"""
arguments
input_t Tensor(...,H,W)
affine_t Tensor(...,2,3)
affine matrix
example of identity affine matrix
[1,0,0],
[0,1,0]
...-head part of shapes will be broadcasted to each other
output_size(None)
tuple of 2 ints (HW)
of output size
if None , size will not be changed
"""
op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype, affine_t.shape, affine_t.dtype, output_size)
affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) )
coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init )
coords_t = coords_t.reshape(op.coords_reshape)
coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape)
output_t = remap(input_t, coords_t, dtype=dtype)
return output_t
class _WarpAffineOp():
def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if np.dtype(a_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of a_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
if a_shape.ndim < 2:
raise ValueError(f'a_shape.ndim must be >= 2 (...,2,3)')
if a_shape[-2] != 2 or a_shape[-1] != 3:
raise ValueError('Last a_shape dims must be == (...,2,3)')
IH,IW = i_shape[-2:]
if o_size is not None:
OH,OW = o_size
else:
OH,OW = IH,IW
self.coords_shape = AShape( (OH,OW,3) )
self.coords_affined_shape = AShape( (OH,OW,2) )
if a_shape.ndim > 2:
self.coords_shape = a_shape[:-2] + self.coords_shape
self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape
self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1)
self.coords_reshape = (-1,OH*OW,3)
self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)
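A NumPy sketch of the coordinate pipeline warp_affine assembles: a homogeneous (x, y, 1) grid, multiplied by the (transposed) 2x3 affine to produce per-pixel sampling coordinates that are then handed to remap. The grid layout mirroring InitCoords2DArange is an assumption here:

import numpy as np

def affine_coords_np(affine_2x3, OH, OW):
    # Build an (OH, OW, 3) grid of (x, y, 1) and apply the affine as an (OH*OW, 3) @ (3, 2) matmul,
    # mirroring the matmul + reshape steps above (sketch only).
    ys, xs = np.meshgrid(np.arange(OH, dtype=np.float32),
                         np.arange(OW, dtype=np.float32), indexing='ij')
    grid = np.stack([xs, ys, np.ones_like(xs)], axis=-1)                    # (OH, OW, 3)
    coords = grid.reshape(-1, 3) @ np.asarray(affine_2x3, np.float32).T     # (OH*OW, 2)
    return coords.reshape(OH, OW, 2)                                        # (x, y) coords for remap

ident = np.array([[1, 0, 0], [0, 1, 0]], np.float32)
print(affine_coords_np(ident, 2, 3)[..., 0])   # x coords: [[0. 1. 2.] [0. 1. 2.]]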