Mirror of https://github.com/iperov/DeepFaceLive (synced 2025-08-14 18:57:24 -07:00)

Commit 0058474da7 (parent 932edfe875): add xlib.avecl
56 changed files with 5569 additions and 0 deletions
xlib/avecl/_internal/op/__init__.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .binary_morph import binary_morph
from .cast import cast
from .concat import concat
from .depthwise_conv2D import depthwise_conv2D
from .gaussian_blur import gaussian_blur
from .matmul import matmul, matmulc
from .pad import pad
from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
                     reduce_sum, reduce_variance)
from .remap import remap
from .remap_np_affine import remap_np_affine
from .reshape import reshape
from .slice_ import slice_
from .slice_set import slice_set
from .stack import stack
from .tile import tile
from .transpose import transpose
from .warp_affine import warp_affine
xlib/avecl/_internal/op/any_wise.py (new file, 111 lines)
@@ -0,0 +1,111 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def any_wise(op_text : str,
             *args,
             dtype : np.dtype = None,
             output_t : Tensor = None) -> Tensor:
    """
    Operator for N-wise ops with N inputs.

    arguments

        op_text     example: O=(2*I0*I1)+I2

        *args       List[ Tensor | number ]

        dtype

        output_t    compute result into this Tensor.
                    Tensor may have a different shape, but must match the total size.
    """
    HArgs.check_zero_get_length(args)
    tensor_args = HArgs.filter_tensor(args, raise_on_empty=True)
    device = HArgs.check_get_same_device(tensor_args)

    shape_list, dtype_list, krn_args = HArgs.decompose(args)

    op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)

    if output_t is None:
        output_t = Tensor( op.o_shape, op.o_dtype, device=device )
    elif output_t.shape.size != op.o_shape.size:
        raise ValueError(f'output_t must have size {op.o_shape.size}')

    device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args)

    return output_t

class _AnyWiseOp:
    def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
        if len(shape_list) != len(dtype_list):
            raise ValueError('len(shape_list) != len(dtype_list)')

        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype(dtype_list)

        if len(shape_list) == 1:
            # element-wise
            i_shape, i_dtype = shape_list[0], dtype_list[0]
            self.o_shape = o_shape = i_shape

            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('IN', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
{{
    size_t gid = get_global_id(0);

    O_TYPE O = O_GLOBAL_LOAD(gid);
    IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
    {op_text};
    O_GLOBAL_STORE(gid, O);
}}
""")
        else:
            # multi-arg, with broadcasting
            self.info = info = BroadcastInfo([ shape if shape is not None else AShape((1,)) for shape in shape_list ])

            self.o_shape = o_shape = info.o_shape

            defs, arg_defs, impls = [], [], []
            for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
                t_name = f'I{i}'
                if t_shape is not None:
                    defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
                    arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
                    impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));" )
                else:
                    arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )

            defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)

            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{defs}
{HKernel.define_tensor('O', o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
{{
    size_t gid = get_global_id(0);
    {HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
    {impls}
    O_TYPE O;
    {op_text};
    O_GLOBAL_STORE(gid, O);
}}
""")

def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t)
def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t)
def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t)
def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t)
def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def square(a_t : Tensor) -> Tensor: return any_wise('O=I0*I0', a_t)
def sqrt(a_t : Tensor) -> Tensor: return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t)
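A minimal usage sketch of any_wise. This is a hedged example: the import paths and the assumption that Tensor.from_value works without an explicit device (it is called with device= elsewhere in this commit) are not confirmed by the diff itself.

import numpy as np
from xlib.avecl._internal.Tensor import Tensor
from xlib.avecl._internal.op.any_wise import any_wise, add

# op_text names the inputs I0..In and the output O, as in the docstring.
a = Tensor.from_value(np.full((2,3), 1.0, np.float32))
b = Tensor.from_value(np.full((2,3), 2.0, np.float32))
c = Tensor.from_value(np.full((2,3), 0.5, np.float32))

o = any_wise('O=(2*I0*I1)+I2', a, b, c)   # the docstring's example: 4.5 everywhere
s = add(a, b)                             # same machinery with fixed op_text 'O=I0+I1'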
xlib/avecl/_internal/op/binary_dilate_circle.py (new file, 91 lines)
@@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def binary_dilate_circle(input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
    """
    Binary dilate operator using a circular kernel of the given radius.

    input_t     Tensor (...,H,W)

    Per element of H,W: set 1 if any neighbour element inside the circle
    of the given radius is != 0, otherwise set 0.
    """
    op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)

    device = input_t.get_device()

    if radius <= 0 or iterations <= 0:
        return input_t.copy()
    else:
        for i in range(iterations):
            if i == 0:
                buf_in = input_t
            else:
                buf_in, buf_out = buf_out, buf_in
            if i <= 1:
                buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
            device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )

        return buf_out

class _BinaryDilateOp():
    def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        if i_shape.ndim < 2:
            raise ValueError('i_shape.ndim must be >= 2')

        KS = radius*2+1
        IH,IW = i_shape[-2:]

        ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')

        self.o_shape = o_shape = i_shape

        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}

#define PADL {ci.PADL}
#define PADT {ci.PADT}

#define RADIUS {radius}
#define KS {KS}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);
    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}

    {'#pragma unroll' if KS <= 16 else ''}
    for (int kh=0; kh<KS; ++kh)
    {'#pragma unroll' if KS <= 16 else ''}
    for (int kw=0; kw<KS; ++kw)
    {{
        if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
        {{
            int im2 = -PADT + kh + om2;
            int im1 = -PADL + kw + om1;

            I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
                           I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
                           : 0;

            if (i_val != (I_TYPE)0)
            {{
                O_GLOBAL_STORE(gid, (O_TYPE) 1);
                return;
            }}
        }}
    }}

    O_GLOBAL_STORE(gid, (O_TYPE) 0 );
}}
""")
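For reference, the kernel's per-pixel rule can be mirrored in plain NumPy. A slow, hedged sketch of one iteration, assuming Conv2DInfo's 'same' padding puts PADL = PADT = radius for this odd kernel at stride 1:

import numpy as np

def dilate_circle_ref(img : np.ndarray, radius : int) -> np.ndarray:
    # A pixel becomes 1 if any neighbour within the circular radius is
    # non-zero; positions outside the image read as 0, as in the kernel.
    H, W = img.shape
    out = np.zeros_like(img)
    for y in range(H):
        for x in range(W):
            for kh in range(radius*2 + 1):
                for kw in range(radius*2 + 1):
                    if np.hypot(kh - radius, kw - radius) <= radius:
                        iy, ix = y + kh - radius, x + kw - radius
                        if 0 <= iy < H and 0 <= ix < W and img[iy, ix] != 0:
                            out[y, x] = 1
    return out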
xlib/avecl/_internal/op/binary_erode_circle.py (new file, 91 lines)
@@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def binary_erode_circle(input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
    """
    Binary erode operator using a circular kernel of the given radius.

    input_t     Tensor (...,H,W)

    Per element of H,W: set 1 if all neighbour elements inside the circle
    of the given radius are != 0, otherwise set 0.
    """
    op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype)

    device = input_t.get_device()

    if radius <= 0 or iterations <= 0:
        return input_t.copy()
    else:
        for i in range(iterations):
            if i == 0:
                buf_in = input_t
            else:
                buf_in, buf_out = buf_out, buf_in
            if i <= 1:
                buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
            device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )

        return buf_out

class _BinaryErodeOp():
    def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        if i_shape.ndim < 2:
            raise ValueError('i_shape.ndim must be >= 2')

        KS = radius*2+1
        IH,IW = i_shape[-2:]

        ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')

        self.o_shape = o_shape = i_shape

        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}

#define PADL {ci.PADL}
#define PADT {ci.PADT}

#define RADIUS {radius}
#define KS {KS}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);
    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}

    {'#pragma unroll' if KS <= 16 else ''}
    for (int kh=0; kh<KS; ++kh)
    {'#pragma unroll' if KS <= 16 else ''}
    for (int kw=0; kw<KS; ++kw)
    {{
        if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
        {{
            int im2 = -PADT + kh + om2;
            int im1 = -PADL + kw + om1;

            I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
                           I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
                           : 0;

            if (i_val == (I_TYPE)0)
            {{
                O_GLOBAL_STORE(gid, (O_TYPE) 0 );
                return;
            }}
        }}
    }}

    O_GLOBAL_STORE(gid, (O_TYPE) 1 );
}}
""")
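The erode kernel is the dual of the dilate kernel above: it scans the same circular neighbourhood but clears the pixel as soon as any in-circle neighbour is zero, and out-of-bounds positions load as 0, so borders always erode. The matching hedged NumPy sketch:

def erode_circle_ref(img, radius):
    # Flip the test of dilate_circle_ref: any zero (or out-of-bounds)
    # neighbour inside the circle clears the output pixel.
    H, W = img.shape
    out = np.ones_like(img)
    for y in range(H):
        for x in range(W):
            for kh in range(radius*2 + 1):
                for kw in range(radius*2 + 1):
                    if np.hypot(kh - radius, kw - radius) <= radius:
                        iy, ix = y + kh - radius, x + kw - radius
                        inside = 0 <= iy < H and 0 <= ix < W
                        if not inside or img[iy, ix] == 0:
                            out[y, x] = 0
    return out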
xlib/avecl/_internal/op/binary_morph.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from ..Tensor import Tensor
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .gaussian_blur import gaussian_blur
from .pad import pad


def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor:
    """
    Apply optional binary erode/dilate and optional blur.

    input_t                 (...,H,W) tensor. Non-zero values are treated as 1.

    erode_dilate            int, amount of pixels: > 0 erodes, < 0 dilates

    blur                    float >= 0, amount of pixels to blur

    fade_to_border(False)   clip the image so that it fades smoothly
                            to the border with the specified blur amount
    """
    x = input_t

    H,W = input_t.shape[-2:]

    x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)

    if erode_dilate > 0:
        x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
    elif erode_dilate < 0:
        x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )

    if fade_to_border:
        # blur is a float; slice bounds must be ints
        h_clip_size = H + int(blur // 2)
        w_clip_size = W + int(blur // 2)
        x[...,:h_clip_size,:] = 0
        x[...,-h_clip_size:,:] = 0
        x[...,:,:w_clip_size] = 0
        x[...,:,-w_clip_size:] = 0

    if blur > 0:
        x = gaussian_blur(x, blur * 0.250, dtype=dtype)

    return x[...,H:-H,W:-W]
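Putting the pieces together, a hedged call sketch (same Tensor.from_value assumptions as in the any_wise example):

# Feather a binary mask: shrink it ~8 px, then soften the edge over ~16 px.
mask = Tensor.from_value((np.random.rand(128,128) > 0.5).astype(np.float32))
soft = binary_morph(mask, erode_dilate=8, blur=16.0, fade_to_border=True)
# soft keeps the (...,128,128) shape: the (H,W) padding added at the start
# is cropped away by the final x[...,H:-H,W:-W].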
xlib/avecl/_internal/op/cast.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from ..Tensor import Tensor

from .any_wise import any_wise

def cast(input_t : Tensor, dtype, output_t : Tensor = None) -> Tensor:
    """
    Cast operator.

    arguments
        input_t

        dtype

        output_t    compute result into this Tensor.
                    Tensor may have a different shape, but must match the total size.
    """
    return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)
xlib/avecl/_internal/op/concat.py (new file, 70 lines)
@@ -0,0 +1,70 @@
from ..backend import Kernel
from ..HArgs import HArgs
from ..HType import HType
from ..HKernel import HKernel
from ..info import ConcatInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor

def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
    """
    arguments

        tensor_list         Iterable

        axis                Int

        dtype               np.dtype

        output_t            compute result into this Tensor.
                            Tensor may have a different shape,
                            but must match the total size.
                            gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.
    """
    tensor_list = tuple(tensor_list)
    HArgs.check_zero_get_length(tensor_list)
    HArgs.check_all_tensors(tensor_list)

    device = HArgs.check_get_same_device(tensor_list)
    shape_list, dtype_list, _ = HArgs.decompose(tensor_list)

    op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output)

    if output_t is None:
        output_t = Tensor(op.info.o_shape, op.o_dtype, device=device)
    elif output_t.shape.size != op.info.o_shape.size:
        raise ValueError(f'output_t must have size {op.info.o_shape.size}')

    for forward_krn, t in zip(op.forward_krns, tensor_list):
        device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) )

    return output_t

class _ConcatOp:
    def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output):
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype(dtype_list)

        self.info = info = ConcatInfo(shape_list, axis)

        self.forward_krns = forward_krns = []

        for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)):
            forward_krn = Kernel(f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', shape, dtype)}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)}

    i{info.axis} += {info.axis_offsets[i]};

    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) );
}}
""")
            forward_krns.append(forward_krn)
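A hedged usage sketch: one kernel per input copies that tensor into its axis_offsets slot of the output (presumably ConcatInfo validates that the non-axis dims match; that class is not in this diff):

x = Tensor.from_value(np.zeros((2,3), np.float32))
y = Tensor.from_value(np.ones ((2,5), np.float32))
z = concat([x, y], axis=1)   # expected o_shape (2,8)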
xlib/avecl/_internal/op/depthwise_conv2D.py (new file, 107 lines)
@@ -0,0 +1,107 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo, Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def depthwise_conv2D(input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None):
    """
    Depthwise Conv2D operator.

    input_t         Tensor (...,H,W)

    kernel_t        Tensor (...,H,W)

    stride(1)       int

    dilation(1)     int

    padding(same)   'valid'             no padding
                    'same'              output size will be the same
                                        or divided by stride
                    int                 padding value for all sides
                    Iterable of 4 ints  paddings for left,top,right,bottom sides

    The ...-head parts of the shapes are broadcast against each other.
    """
    op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)

    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
    output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())

    return output_t

class _DepthwiseConv2DOp():
    def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        if i_shape.ndim < 2:
            raise ValueError('i_shape.ndim must be >= 2')

        if k_shape.ndim < 2:
            raise ValueError('k_shape.ndim must be >= 2')

        IH,IW = i_shape[-2:]
        KH,KW = k_shape[-2:]

        ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)

        if i_shape.ndim == 2 and k_shape.ndim == 2:
            # nothing to broadcast
            i_br_shape = i_shape
            k_br_shape = k_shape

            o_shape = AShape([ci.OH, ci.OW])
        else:
            op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])

            i_br_shape = op.br_shapes[0] + i_shape[-2:]
            k_br_shape = op.br_shapes[1] + k_shape[-2:]

            o_shape = op.o_shape + [ci.OH, ci.OW]

        self.o_shape = o_shape

        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('K', k_br_shape, k_dtype)}

#define PADL {ci.PADL}
#define PADT {ci.PADT}

#define STRIDE {stride}
#define DILATION {dilation}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
{{
    size_t gid = get_global_id(0);
    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}

    float v = 0.0;
    {'#pragma unroll' if KH <= 9 else ''}
    for (int km2=0; km2<Km2; ++km2)
    {{
        int im2 = -PADT + km2*DILATION + om2*STRIDE;
        if (im2 >= 0 & im2 < Im2)
        {'#pragma unroll' if KW <= 9 else ''}
        for (int km1=0; km1<Km1; ++km1)
        {{
            int im1 = -PADL + km1*DILATION + om1*STRIDE;
            if (im1 >= 0 & im1 < Im1)
                v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
                     *K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
        }}
    }}

    O_GLOBAL_STORE(gid, (O_TYPE) v);
}}
""")
xlib/avecl/_internal/op/gaussian_blur.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import numpy as np

from ..Tensor import Tensor
from .depthwise_conv2D import depthwise_conv2D


def gaussian_blur(input_t : Tensor, sigma, dtype=None) -> Tensor:
    """
    arguments

        input_t     Tensor (...,H,W)

        sigma       float
    """
    if sigma <= 0.0:
        return input_t.copy() #TODO

    device = input_t.get_device()

    key = (gaussian_blur, sigma)
    kernel_t = device.get_cached_data(key)
    if kernel_t is None:
        kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
        device.set_cached_data(key, kernel_t)

    output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
    return output_t

def _make_gaussian_kernel(sigma : float, dtype):
    kernel_size = max(3, int(2 * 2 * sigma))
    if kernel_size % 2 == 0:
        kernel_size += 1
    mean = np.floor(0.5 * kernel_size)
    kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)])
    np_kernel = np.outer(kernel_1d, kernel_1d)
    kernel = np_kernel / np.sum(np_kernel)
    return kernel.astype(dtype)
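The kernel construction is pure NumPy and can be checked standalone; the size rule gives roughly 4*sigma taps, forced odd, and the normalisation makes blurring mass-preserving:

import numpy as np

sigma = 2.0
ks = max(3, int(2 * 2 * sigma))       # ~4*sigma taps, at least 3
if ks % 2 == 0:
    ks += 1                           # force an odd size -> here 9
mean = np.floor(0.5 * ks)
k1d = np.exp(-(np.arange(ks) - mean) ** 2 / (2 * sigma ** 2))
k2d = np.outer(k1d, k1d)              # separable 2D Gaussian
k2d /= k2d.sum()
assert np.isclose(k2d.sum(), 1.0)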
xlib/avecl/_internal/op/matmul.py (new file, 158 lines)
@@ -0,0 +1,158 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def matmul(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
    """
    matmul operator in row-major format

    A(...,M,K) x
    B(...,K,N) =
      (...,M,N)

    arguments

        output_t            compute result into this Tensor.
                            Tensor may have a different shape,
                            but must match the total size.
                            gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.
    """
    return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output)


def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
    """
    matmul operator in col-major format

    A(...,K,M) x
    B(...,N,K) =
      (...,N,M)

    arguments

        output_t            compute result into this Tensor.
                            Tensor may have a different shape,
                            but must match the total size.
                            gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.
    """
    device = HArgs.check_get_same_device([a_t, b_t])

    op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output)

    if output_t is None:
        output_t = Tensor(op.o_shape, op.o_dtype, device=device)
    elif output_t.shape.size != op.o_shape.size:
        raise ValueError(f'output_t must have size {op.o_shape.size}')

    device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer() )

    return output_t


class _MatmulOp:
    def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output):
        a_dtype = np.dtype(a_dtype).type
        b_dtype = np.dtype(b_dtype).type

        if a_dtype != np.float32 or b_dtype != np.float32:
            raise ValueError('matmul works only with float32 tensors.')

        if a_shape.ndim != b_shape.ndim:
            raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}')

        ndim = a_shape.ndim
        if ndim < 2:
            raise ValueError('Tensors ndim must be at least 2.')

        K, M = a_shape[-2], a_shape[-1]
        N, B_COLS = b_shape[-2], b_shape[-1]

        if K != B_COLS:
            raise ValueError('A_ROWS != B_COLS')

        BATCH = a_shape[0:-2].size
        B_BATCH = b_shape[0:-2].size

        if BATCH != B_BATCH:
            raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}')

        if ndim == 2:
            self.o_shape = AShape( (N, M) )
        else:
            self.o_shape = AShape( a_shape[:-2]+(N, M) )
        self.o_dtype = np.float32

        self.M = M
        self.N = N
        self.K = K

        # Determine optimal tile widths
        for MW in [8,4,2,1]:
            if M % MW == 0:
                break
        for KW in [8,4,2,1]:
            if N % KW == 0 and K % KW == 0:
                break
        NW = KW

        self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f"""
#define K {K}
#define N {N}
#define MW {MW}     // M tile Width
#define NW {NW}     // N tile Width -- NW & KW should be the same !
#define KW {KW}     // K tile Width
#define MT {M//MW}  // MT is max for 'mt' (M tile count)
#define KT {K//KW}  // KT is max for 'kt' (K tile count)

#define floatMW { f'float{MW}' if MW != 1 else 'float'}
#define floatKW { f'float{KW}' if KW != 1 else 'float'}

__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
{{
    size_t mt = get_global_id(0);    //global M-tile id
    size_t nc = get_global_id(1);    //global N-tile id
    size_t batch = get_global_id(2);

    float AT[KW][MW]; // sub tiles
    float BT[NW][KW];
    float CT[NW][MW];

    #pragma unroll
    for (uint i=0; i<NW*MW; ++i) // zero CT tile
        ((float*) CT)[i] = 0.0;

    for (uint kt=0; kt<KT; ++kt) // iterate over K-dim tiles
    {{
        #pragma unroll
        for (uint k=0; k<KW; ++k) // every k-element inside K-dim tile
            *( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats

        #pragma unroll
        for (uint n=0; n<NW; ++n) // every n-element inside N-dim tile
            *( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats

        #pragma unroll
        for (uint k=0; k<KW; ++k)
        #pragma unroll
        for (uint n=0; n<NW; ++n) // sub tiles multiplication
        #pragma unroll
        for (uint m=0; m<MW; ++m)
            CT[n][m] += AT[k][m] * BT[n][k];
    }}

    #pragma unroll
    for (uint n=0; n<NW; ++n)
        O[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='} *( (floatMW*) CT[n]);
}}""")
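The row-major to col-major delegation in matmul is the standard transpose identity: a row-major (M,N) buffer holds the same bytes as a column-major (N,M) one, and C = A B implies C^T = B^T A^T, so swapping the operands reuses the single col-major GeMM kernel. A NumPy check of the identity:

import numpy as np
A = np.random.rand(4,3).astype(np.float32)   # (M,K)
B = np.random.rand(3,5).astype(np.float32)   # (K,N)
assert np.allclose((A @ B).T, B.T @ A.T)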
xlib/avecl/_internal/op/pad.py (new file, 71 lines)
@@ -0,0 +1,71 @@
from typing import List

import numpy as np

from ..HType import HType
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import PadInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def pad(input_t : Tensor, axes_paddings : List, mode : str = 'constant', constant_value=0, dtype : np.dtype = None, output_t : Tensor = None) -> Tensor:
    """
    arguments:

        axes_paddings   list of (l_pad, r_pad) per axis.

                        If [0] is ... (Ellipsis), the left-side paddings are
                        filled with (0,0) for the remaining axes;
                        if [-1] is ..., the same for the right side.

        dtype           cast to dtype

        output_t        compute result into this Tensor.
                        Tensor may have a different shape, but must match the total size.
    """
    op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )

    if output_t is None:
        output_t = Tensor(op.o_shape, op.o_dtype, device=input_t.get_device())
    elif output_t.shape.size != op.o_shape.size:
        raise ValueError(f'output_t must have size {op.o_shape.size}')

    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )

    return output_t


class _PadOp:
    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
        _allow_modes = ['constant']
        if mode not in _allow_modes:
            raise ValueError(f'Allowed pad modes: {_allow_modes}')

        if mode == 'constant':
            if not HType.is_scalar_type(constant_value):
                raise ValueError('constant_value must be scalar')

        info = PadInfo(i_shape, axes_paddings)

        self.o_shape = o_shape = info.o_shape
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}

    if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})' for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
        O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
    else
        O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
}}""")
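The Ellipsis shorthand in axes_paddings maps onto NumPy's pad as follows (hedged equivalence; binary_morph.py above relies on the leading-... form):

import numpy as np
x = np.ones((5,4,4), np.float32)
# pad(x, (..., (2,2), (3,3))) -- the ... expands to (0,0) for axis 0:
y = np.pad(x, ((0,0), (2,2), (3,3)), mode='constant', constant_values=0)
assert y.shape == (5, 8, 10)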
xlib/avecl/_internal/op/reduce.py (new file, 214 lines)
@@ -0,0 +1,214 @@
import math

import numpy as np

from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import ReductionInfo, TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .slice_ import slice_
from .transpose import transpose
from .any_wise import square, sqrt


def reduce_mean(input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
    """
    Reduce mean operator.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    return reduce_op('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)

def reduce_std(input_t, axes=None, keepdims=False):
    """
    Reduce std operator.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    return sqrt(reduce_variance(input_t, axes, keepdims))


def reduce_variance(input_t, axes=None, keepdims=False):
    """
    Reduce variance operator.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    mean = reduce_mean(input_t, axes, keepdims=True)
    return reduce_mean(square(input_t - mean), axes, keepdims)

def moments(input_t, axes=None, keepdims=False):
    """
    Returns (mean, variance) of input_t.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    mean = reduce_mean(input_t, axes, keepdims)
    mean_shape_keepdims = mean._op.info.o_shape_kd
    var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
    return mean, var

def reduce_min(input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
    """
    Reduce min operator.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    return reduce_op('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)

def reduce_max(input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
    """
    Reduce max operator.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    return reduce_op('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)

def reduce_sum(input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
    """
    Reduce sum operator.

        input_t         Tensor

        axes(None)      int
                        Iterable of ints.
                        None - all axes

        keepdims(False) keep reduced axes
    """
    return reduce_op('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)

def reduce_op(op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False):
    """
    arguments

        op_type             'sum' 'mean' 'min' 'max'

        output_t            compute result into this Tensor.
                            Tensor may have a different shape,
                            but must match the total size.
                            gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.
    """
    op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims)

    if output_t is None:
        output_t = Tensor( op.info.o_shape, input_t.dtype, device=input_t.get_device() )
    elif output_t.shape.size != op.info.o_shape.size:
        raise ValueError(f'output_t must have size {op.info.o_shape.size}')

    # Make an intermediate tensor
    input_t_inter = transpose(input_t, op.intermediate_transpose_axes)

    # Perform the multistage in-place operation on the intermediate tensor
    for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)):
        input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS),
                                              global_shape=shape)

    if op_type == 'mean':
        # divide values in ROWS by the number of COLS
        input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) )

    # Fetch the final tensor from zero indexes using the slices argument
    slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output)

    return output_t


class _ReduceOp:
    def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False):
        self.op_type = op_type
        self.info = info = ReductionInfo(i_shape, axes, keepdims)

        # Determine the transpose order for the intermediate tensor, where reduction axes are moved to the end
        self.intermediate_transpose_axes = info.o_axes + info.reduction_axes
        self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape

        # slices argument to fetch the processed tensor from zero indexes
        self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim

        # COLS are the reduction axes, ROWS are the remaining axes
        rows_ndim = info.o_axes.ndim
        self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size
        self.COLS = COLS = self.intermediate_shape[rows_ndim:].size

        # Number of stages needed to fold COLS down to one column
        n_stages = (COLS-1).bit_length()
        self.forward_krn_shapes           = [ (ROWS * math.ceil(COLS / (2**(stage+1)) ),) for stage in range(n_stages) ]
        self.forward_krn_stage_cols       = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
        self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage   ) ) for stage in range(n_stages) ]

        self.forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}

__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS)
{{
    size_t gid = get_global_id(0);

    size_t col = gid % STAGE_COLS;
    size_t row = gid / STAGE_COLS;
    size_t i_idx = row*COLS + col;

    size_t other_col = col + STAGE_COLS;
    if (other_col < STAGE_VALID_COLS)
    {{
        I_TYPE val_a = I_GLOBAL_LOAD(i_idx);
        I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col);

        {'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else
         'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else
         'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else ''
        }
        I_GLOBAL_STORE(i_idx, val_x);
    }}
}}
""")
        self.mean_div_forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS)
{{
    size_t row = get_global_id(0);
    I_GLOBAL_STORE(row*COLS, I_GLOBAL_LOAD(row*COLS) / COLS );
}}
""")
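The stage bookkeeping (forward_krn_shapes / stage_cols / stage_valid_cols) implements a pairwise fold: each stage adds the upper half of the still-valid columns onto the lower half. A hedged NumPy model of one row, checked against a plain sum:

import math
import numpy as np

def staged_sum(row : np.ndarray) -> float:
    # Mirrors the kernel: at stage s, ceil(COLS/2^(s+1)) slots each fold in
    # the element STAGE_COLS away, if it is still below STAGE_VALID_COLS.
    buf, valid = row.copy(), len(row)
    for stage in range((len(row) - 1).bit_length()):
        cols = math.ceil(len(row) / 2 ** (stage + 1))
        for c in range(cols):
            if c + cols < valid:
                buf[c] += buf[c + cols]
        valid = cols
    return buf[0]

r = np.random.rand(13).astype(np.float32)
assert np.isclose(staged_sum(r), r.sum(), rtol=1e-5)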
xlib/avecl/_internal/op/remap.py (new file, 103 lines)
@@ -0,0 +1,103 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor

def remap(input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor:
    """
    Remap input_t in its spatial axes using coords_t.

    arguments

        input_t     Tensor ( ...,IH,IW )

        coords_t    Tensor ( ...,OH,OW,D )
                    OH - output height
                    OW - output width
                    D = 2, the (x,y) coordinates

        dtype

    The ...-head parts of the shapes are broadcast against each other.
    """
    op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype)

    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )

    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer())

    return output_t


class _RemapOp():
    def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype):
        if np.dtype(i_dtype).type == np.bool_:
            raise ValueError('np.bool_ dtype of i_dtype is not supported.')
        if np.dtype(c_dtype).type == np.bool_:
            raise ValueError('np.bool_ dtype of c_dtype is not supported.')
        if i_shape.ndim < 2:
            raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
        if c_shape.ndim < 3:
            raise ValueError('Coords shape ndim must be >= 3 (...,H,W,D)')
        if c_shape[-1] != 2:
            raise ValueError('Last coords dim must be == 2 (x,y)')

        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        if i_shape.ndim == 2 and c_shape.ndim == 3:
            # nothing to broadcast
            i_br_shape = i_shape
            c_br_shape = c_shape

            o_shape = c_shape[-3:-1]
        else:
            op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ])

            i_br_shape = op.br_shapes[0] + i_shape[-2:]
            c_br_shape = op.br_shapes[1] + c_shape[-3:]

            o_shape = op.o_shape + c_shape[-3:-1]

        self.o_shape = o_shape

        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}

    C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)}));

    float cx01 = (float) c_value.x;
    float cy01 = (float) c_value.y;

    float cx0f = floor(cx01); int cx0 = (int)cx0f;
    float cy0f = floor(cy01); int cy0 = (int)cy0f;
    float cx1f = cx0f+1;      int cx1 = (int)cx1f;
    float cy1f = cy0f+1;      int cy1 = (int)cy1f;

    float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
    float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
    float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
    float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));

    p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
    p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
    p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
    p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);

    O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")
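The p00..p11 blend is plain bilinear interpolation with zero weight outside the image. A hedged scalar model of one sample:

import numpy as np

def sample_bilinear(img : np.ndarray, cx : float, cy : float) -> float:
    # Neighbours outside the image contribute 0, as the kernel's bound
    # masks (cy0 >= 0 & ... ) enforce.
    H, W = img.shape
    cx0, cy0 = int(np.floor(cx)), int(np.floor(cy))
    cx1, cy1 = cx0 + 1, cy0 + 1
    def at(y, x):
        return float(img[y, x]) if 0 <= y < H and 0 <= x < W else 0.0
    fx, fy = cx - cx0, cy - cy0
    return (at(cy0,cx0)*(1-fx)*(1-fy) + at(cy0,cx1)*fx*(1-fy) +
            at(cy1,cx0)*(1-fx)*fy     + at(cy1,cx1)*fx*fy)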
xlib/avecl/_internal/op/remap_np_affine.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..SCacheton import SCacheton
from ..Tensor import Tensor

def remap_np_affine(input_t : Tensor, affine_n : np.ndarray, inverse=False, output_size=None, dtype=None) -> Tensor:
    """
    Remap affine operator: applies a single numpy affine mat to all channels.

    arguments

        input_t     Tensor (...,H,W)

        affine_n    np.ndarray (2,3)

        dtype
    """
    if affine_n.shape != (2,3):
        raise ValueError('affine_n.shape must be (2,3)')

    op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)

    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )

    ((a, b, c),
     (d, e, f)) = affine_n
    if not inverse:
        # invert by default, to match cv2.warpAffine behaviour
        D = a*e - b*d
        D = 1.0 / D if D != 0.0 else 0.0
        a, b, c, d, e, f = (  e*D, -b*D, (b*f-e*c)*D,
                             -d*D,  a*D, (d*c-a*f)*D )

    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
                                    np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )

    return output_t


class _RemapAffineOp():
    def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
        if np.dtype(i_dtype).type == np.bool_:
            raise ValueError('np.bool_ dtype of i_dtype is not supported.')
        if i_shape.ndim < 2:
            raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')

        IH,IW = i_shape[-2:]
        if o_size is not None:
            OH,OW = o_size
        else:
            OH,OW = IH,IW

        o_shape = AShape( (OH,OW) )
        if i_shape.ndim > 2:
            o_shape = i_shape[:-2] + o_shape

        self.o_shape = o_shape
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
                   float a, float b, float c,
                   float d, float e, float f)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}

    float cx01 = om1*a + om2*b + c;
    float cy01 = om1*d + om2*e + f;

    float cx0f = floor(cx01); int cx0 = (int)cx0f;
    float cy0f = floor(cy01); int cy0 = (int)cy0f;
    float cx1f = cx0f+1;      int cx1 = (int)cx1f;
    float cy1f = cy0f+1;      int cy1 = (int)cy1f;

    float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
    float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
    float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
    float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));

    p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
    p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
    p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
    p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);

    O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")
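The closed-form inversion of the 2x3 affine above is exactly the inverse of the 3x3 homogeneous matrix with its last row dropped; a NumPy check:

import numpy as np
M = np.array([[ 1.2, 0.3,  5.0],
              [-0.1, 0.9, -2.0]], np.float32)
(a, b, c), (d, e, f) = M
D = a*e - b*d
D = 1.0 / D if D != 0.0 else 0.0
inv23 = np.array([[ e*D, -b*D, (b*f - e*c)*D],
                  [-d*D,  a*D, (d*c - a*f)*D]])
assert np.allclose(inv23, np.linalg.inv(np.vstack([M, [0,0,1]]))[:2], atol=1e-5)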
xlib/avecl/_internal/op/reshape.py (new file, 26 lines)
@@ -0,0 +1,26 @@
from typing import Iterable

from ..Tensor import Tensor
from ..SCacheton import SCacheton
from ..info import ReshapeInfo


def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
    """
    Reshape operator.

    arguments

        new_shape   Iterable of ints

        copy(True)  if True, produces a new Tensor;
                    otherwise produces a reference Tensor
                    that points to the same memory
    """
    info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )

    if copy:
        return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
    return input_t.as_shape( info.o_shape )
xlib/avecl/_internal/op/slice_.py (new file, 67 lines)
@@ -0,0 +1,67 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
    """
    arguments:

        input_t             input tensor
        slices              argument received from class.__getitem__(slices)

        output_t            compute result into this Tensor.
                            Tensor may have a different shape, but must match the total size.
                            gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.

    Remark.

    Slicing logic is not the same as numpy:
    for example, the numpy slice [2:0:1] produces an empty (zero-size) array,
    but slice_() selects index 2 instead, the same as val_t[2].
    """
    op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output )
    o_shape = op.slice_info.o_shape

    if output_t is None:
        if op.slice_info.just_reshaped:
            return input_t.reshape(o_shape)
        else:
            output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device())

    elif output_t.shape.size != o_shape.size:
        raise ValueError(f'output_t must have size {o_shape.size}')

    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )

    return output_t


class _SliceOp:
    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output):
        self.slice_info = slice_info = SliceInfo(i_shape, slices)

        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f"""
{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)}

    {chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes) ) }

    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) );
}}
""")
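The Remark's divergence from NumPy, concretely (the clamping behaviour is inferred from the Remark; SliceInfo itself is not part of this diff):

import numpy as np
v = np.arange(5)
print(v[2:0:1].shape)     # NumPy: (0,) -- an empty selection
# slice_(val_t, slice(2, 0, 1)) instead selects index 2, i.e. val_t[2]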
xlib/avecl/_internal/op/slice_set.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import numpy as np

from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo, SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def slice_set(input_t : Tensor, slices, value) -> Tensor:
    """
    Set value in-place inside the sliced region of input_t.

    arguments:

        input_t     input tensor
        slices      argument received from class.__getitem__(slices)
        value       scalar, or a value broadcastable to the sliced shape
    """
    if HType.is_scalar_type(value):
        v_shape  = None
        v_dtype  = None
        v_scalar = value
    else:
        if not isinstance(value, Tensor):
            value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device())
        v_shape  = value.shape
        v_dtype  = value.dtype
        v_scalar = None

    op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) )

    if v_scalar is not None:
        input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() )
    else:
        input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() )

    return input_t

class _SliceSetOp:
    def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices):
        slice_info = SliceInfo(i_shape, slices)

        if v_scalar is None:
            if v_shape.ndim > i_shape.ndim:
                raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}')

            # Check that v_shape can broadcast with slice_info.shape
            br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape])

            v_br_shape = br_info.br_shapes[1]

        self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', i_shape, i_dtype )}

{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME
                   {', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''})
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)}

    if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] +
                     [f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } )

        O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} );
}}
""")
xlib/avecl/_internal/op/stack.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import numpy as np
from typing import List

from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import StackInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor

def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False):
    """
    Stack operator.

    arguments:

        tensor_list         List of Tensors

        axis                Int

        output_t            compute result into this Tensor.
                            Tensor may have a different shape, but must match the total size.
                            gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.
    """
    HArgs.check_zero_get_length(tensor_list)
    HArgs.check_all_tensors(tensor_list)

    device = HArgs.check_get_same_device(tensor_list)

    shape_list, dtype_list, _ = HArgs.decompose(tensor_list)

    op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output)

    if output_t is None:
        output_t = Tensor(op.info.o_shape, op.o_dtype, device=device)
    elif output_t.shape.size != op.info.o_shape.size:
        raise ValueError(f'output_t must have size {op.info.o_shape.size}')

    for i, krn in enumerate(op.forward_krns):
        device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) )

    return output_t


class _StackOp:
    def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output):
        self.stack_count = stack_count = len(shape_list)

        i_shape = shape_list[0]
        if not all (s == i_shape for s in shape_list):
            raise ValueError('All shapes must be the same')

        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype(dtype_list)
        self.info = info = StackInfo(i_shape, axis, stack_count)

        self.forward_krns = forward_krns = []
        for i_dtype in dtype_list:
            forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)}

    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) );
}}
"""))
58
xlib/avecl/_internal/op/tile.py
Normal file
58
xlib/avecl/_internal/op/tile.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import numpy as np
from typing import List

from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TileInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False):
    """
    Tile operator.

    arguments

        input_t     Tensor

        tiles       Iterable of ints. Number of repetitions per axis.

        dtype

        output_t    compute result to this Tensor.
                    Tensor may have a different shape, but must match the total size.
                    gradfn will not be set.

        is_add_to_output    add result to output_t if output_t is set.
    """
    op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output)

    if output_t is None:
        output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device())
    elif output_t.shape.size != op.info.o_shape.size:
        raise ValueError(f'output_t must have size {op.info.o_shape.size}')

    input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer())

    return output_t


class _TileOp:
    def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output):
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        self.info = info = TileInfo(i_shape, tiles)

        self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f"""

{HKernel.define_tensor('I', i_shape, i_dtype)}
{HKernel.define_tensor('O', info.o_shape, o_dtype)}

__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);
    {HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)}

    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) );
}}
""")
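A semantics sketch for tile() (illustrative, not part of the commit; same hedged Tensor.from_value / .np assumptions as the stack() example above). tiles gives per-axis repetition counts, and the modular indexing (I_IDX_MOD) in the kernel suggests np.tile-like behavior for equal-rank tiles:

import numpy as np
from xlib import avecl as cl

t = cl.Tensor.from_value( np.arange(6, dtype=np.float32).reshape(2,3) )

r = cl.tile(t, (2,1))           # repeat 2x along axis 0 -> shape (4,3)

# expected to hold if tile mirrors np.tile semantics
assert np.allclose( r.np(), np.tile(t.np(), (2,1)) )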
68
xlib/avecl/_internal/op/transpose.py
Normal file
@@ -0,0 +1,68 @@
import numpy as np

from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor


def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor:
    """
    arguments:

        axes_order      Int, Iterable of ints, or None

        dtype           cast to dtype

        op_text(None)   optional elementwise op applied to the value during transpose,
                        default 'O = I'

        output_t        compute result to this Tensor.
                        Tensor may have a different shape, but must match the total size.
    """
    op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output )

    if output_t is None:
        output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
    elif output_t.shape.size != op.o_shape.size:
        raise ValueError(f'output_t must have size {op.o_shape.size}')

    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )

    return output_t


class _TransposeOp:
    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ):
        self.axes_order = axes_order
        self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape
        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype

        if op_text is None:
            op_text = 'O = I'

        self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
    size_t gid = get_global_id(0);

    {HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)}

    I_TYPE I = I_GLOBAL_LOAD(gid);
    O_TYPE O;

    {op_text};

    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O );
}}""")
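A short sketch of transpose() with the fused op_text hook (illustrative, not part of the commit; same hedged transfer assumptions as above). O and I are the kernel-side output/input value names, so op_text can transform each element while it is being permuted:

import numpy as np
from xlib import avecl as cl

t = cl.Tensor.from_value( np.arange(6, dtype=np.float32).reshape(2,3) )

p = cl.transpose(t, (1,0))                      # plain permutation: (2,3) -> (3,2)
d = cl.transpose(t, (1,0), op_text='O = I*2')   # doubled while transposing
assert np.allclose( d.np(), t.np().T * 2 )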
BIN
xlib/avecl/_internal/op/unused.zip
Normal file
Binary file not shown.
72
xlib/avecl/_internal/op/warp_affine.py
Normal file
@@ -0,0 +1,72 @@
import numpy as np

from ..AShape import AShape
from ..initializer import InitCoords2DArange
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .matmul import matmul
from .remap import remap


def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor:
    """
    arguments

        input_t     Tensor(...,H,W)

        affine_t    Tensor(...,2,3)
                    affine matrix

                    example of identity affine matrix:
                    [1,0,0],
                    [0,1,0]

                    ...-head parts of the shapes will be broadcast to each other

        output_size(None)   tuple of 2 ints (H,W) of output size.
                            If None, the size is unchanged.
    """
    op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype, affine_t.shape, affine_t.dtype, output_size)

    affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) )

    coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init )
    coords_t = coords_t.reshape(op.coords_reshape)
    coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape)

    output_t = remap(input_t, coords_t, dtype=dtype)
    return output_t


class _WarpAffineOp():
    def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size):
        if np.dtype(i_dtype).type == np.bool_:
            raise ValueError('np.bool_ dtype of i_dtype is not supported.')
        if np.dtype(a_dtype).type == np.bool_:
            raise ValueError('np.bool_ dtype of a_dtype is not supported.')
        if i_shape.ndim < 2:
            raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
        if a_shape.ndim < 2:
            raise ValueError('a_shape.ndim must be >= 2 (...,2,3)')
        if a_shape[-2] != 2 or a_shape[-1] != 3:
            raise ValueError('Last a_shape dims must be == (...,2,3)')

        IH,IW = i_shape[-2:]
        if o_size is not None:
            OH,OW = o_size
        else:
            OH,OW = IH,IW

        self.coords_shape = AShape( (OH,OW,3) )
        self.coords_affined_shape = AShape( (OH,OW,2) )

        if a_shape.ndim > 2:
            self.coords_shape = a_shape[:-2] + self.coords_shape
            self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape

        self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1)
        self.coords_reshape = (-1,OH*OW,3)
        self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)
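Reading the code above: an (OH,OW,3) homogeneous coordinate grid is built with InitCoords2DArange, multiplied by the transposed 3x2 affine matrix via matmul, and the input is then sampled through remap at the resulting (OH,OW,2) coordinates; in other words, the matrix maps output grid positions to input sampling positions. A usage sketch (illustrative, not part of the commit; the translation values and the transfer helpers are hedged assumptions):

import numpy as np
from xlib import avecl as cl

img = cl.Tensor.from_value( np.random.rand(64,64).astype(np.float32) )

mat = cl.Tensor.from_value( np.float32([ [1,0,5],       # identity rotation/scale part
                                         [0,1,3] ]) )   # plus a translation of the sampling grid

out = cl.warp_affine(img, mat, output_size=(64,64))
print( out.np().shape )    # (64, 64)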