add xlib.avecl

iperov 2021-09-30 18:21:30 +04:00
commit 0058474da7
56 changed files with 5569 additions and 0 deletions

View file

@@ -0,0 +1,21 @@
from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .binary_morph import binary_morph
from .cast import cast
from .concat import concat
from .depthwise_conv2D import depthwise_conv2D
from .gaussian_blur import gaussian_blur
from .matmul import matmul, matmulc
from .pad import pad
from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
reduce_sum, reduce_variance)
from .remap import remap
from .remap_np_affine import remap_np_affine
from .reshape import reshape
from .slice_ import slice_
from .slice_set import slice_set
from .stack import stack
from .tile import tile
from .transpose import transpose
from .warp_affine import warp_affine

View file

@@ -0,0 +1,111 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def any_wise(op_text : str,
*args,
dtype : np.dtype = None,
output_t:Tensor=None) -> Tensor:
"""
operator for N-wise ops with N inputs
arguments
op_text example: O=(2*I0*I1)+I2
*args List[ Tensor | number ]
dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
HArgs.check_zero_get_length(args)
tensor_args = HArgs.filter_tensor(args, raise_on_empty=True)
device = HArgs.check_get_same_device(tensor_args)
shape_list, dtype_list, krn_args = HArgs.decompose(args)
op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
if output_t is None:
output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args)
return output_t
class _AnyWiseOp:
def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
if len(shape_list) != len(dtype_list):
raise ValueError('len(shape_list) != len(dtype_list)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
if len(shape_list) == 1:
# element-wise.
i_shape, i_dtype = shape_list[0], dtype_list[0]
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('IN', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
{{
size_t gid = get_global_id(0);
O_TYPE O = O_GLOBAL_LOAD(gid);
IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
else:
# Multi arg.
self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
self.o_shape = o_shape = info.o_shape
defs, arg_defs, impls = [], [], []
for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
t_name = f'I{i}'
if t_shape is not None:
defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
else:
arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{defs}
{HKernel.define_tensor('O', o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
{impls}
O_TYPE O;
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t)
def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t)
def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t)
def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t)
def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
def square(a_t : Tensor) -> Tensor: return any_wise('O=I0*I0', a_t)
def sqrt(a_t : Tensor) -> Tensor: return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t)
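As an illustration of the op_text semantics (not the library code), here is what an expression such as 'O=(2*I0*I1)+I2' computes with NumPy broadcasting over a mix of tensor and scalar arguments:

import numpy as np

# Reference sketch of the any_wise semantics using NumPy broadcasting (illustration only).
I0 = np.arange(6, dtype=np.float32).reshape(2, 3)   # tensor argument
I1 = np.float32(0.5)                                # scalar argument
I2 = np.arange(3, dtype=np.float32)                 # broadcastable (3,) argument
O = (2 * I0 * I1) + I2                              # same formula as op_text 'O=(2*I0*I1)+I2'
print(O.shape)                                      # (2, 3) -- the broadcast output shape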

View file

@@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def binary_dilate_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
"""
Binary dilate operator using circle kernel with radius.
input_t Tensor (...,H,W)
For each element of H,W: set 1 if any neighbor element inside the circle of the given radius is != 0,
otherwise set 0.
"""
op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)
device = input_t.get_device()
if radius <= 0 or iterations <= 0:
return input_t.copy()
else:
for i in range(iterations):
if i == 0:
buf_in = input_t
else:
buf_in, buf_out = buf_out, buf_in
if i <= 1:
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
return buf_out
class _BinaryDilateOp():
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
KS = radius*2+1
IH,IW = i_shape[-2:]
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define RADIUS {radius}
#define KS {KS}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
{'#pragma unroll' if KS <= 16 else ''}
for (int kh=0; kh<KS; ++kh)
{'#pragma unroll' if KS <= 16 else ''}
for (int kw=0; kw<KS; ++kw)
{{
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
{{
int im2 = -PADT + kh + om2;
int im1 = -PADL + kw + om1;
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
: 0;
if (i_val != (I_TYPE)0)
{{
O_GLOBAL_STORE(gid, (O_TYPE) 1);
return;
}}
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
}}
""")

View file

@@ -0,0 +1,91 @@
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def binary_erode_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
"""
Binary erode operator using circle kernel with radius.
input_t Tensor (...,H,W)
For each element of H,W: set 1 if all neighbor elements inside the circle of the given radius are != 0,
otherwise set 0.
"""
op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype)
device = input_t.get_device()
if radius <= 0 or iterations <= 0:
return input_t.copy()
else:
for i in range(iterations):
if i == 0:
buf_in = input_t
else:
buf_in, buf_out = buf_out, buf_in
if i <= 1:
buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
return buf_out
class _BinaryErodeOp():
def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
KS = radius*2+1
IH,IW = i_shape[-2:]
ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
self.o_shape = o_shape = i_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define RADIUS {radius}
#define KS {KS}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
{'#pragma unroll' if KS <= 16 else ''}
for (int kh=0; kh<KS; ++kh)
{'#pragma unroll' if KS <= 16 else ''}
for (int kw=0; kw<KS; ++kw)
{{
if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
{{
int im2 = -PADT + kh + om2;
int im1 = -PADL + kw + om1;
I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
: 0;
if (i_val == (I_TYPE)0)
{{
O_GLOBAL_STORE(gid, (O_TYPE) 0 );
return;
}}
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) 1 );
}}
""")

View file

@@ -0,0 +1,43 @@
from ..Tensor import Tensor
from .binary_dilate_circle import binary_dilate_circle
from .binary_erode_circle import binary_erode_circle
from .gaussian_blur import gaussian_blur
from .pad import pad
def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor:
"""
Apply optional binary erode/dilate and optional blur.
input_t (...,H,W) tensor. Non zero values will be treated as 1.
erode_dilate int, amount of pixels to erode (>0) or dilate (<0)
blur float >= 0 amount of pixels to blur
fade_to_border(False) zero out a border band so the result
fades smoothly to the border with the specified blur amount
"""
x = input_t
H,W = input_t.shape[-2:]
x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)
if erode_dilate > 0:
x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
elif erode_dilate < 0:
x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )
if fade_to_border:
h_clip_size = H + blur // 2
w_clip_size = W + blur // 2
x[...,:h_clip_size,:] = 0
x[...,-h_clip_size:,:] = 0
x[...,:,:w_clip_size] = 0
x[...,:,-w_clip_size:] = 0
if blur > 0:
x = gaussian_blur(x, blur * 0.250, dtype=dtype)
return x[...,H:-H,W:-W]
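A rough NumPy/SciPy sketch of the same pipeline for a single 2D mask (the erode/dilate step is omitted; the point is the pad -> optional border clipping -> blur -> crop order, assuming SciPy's gaussian_filter as a stand-in for gaussian_blur):

import numpy as np
from scipy.ndimage import gaussian_filter   # stand-in for gaussian_blur in this sketch

def binary_morph_np(mask, blur, fade_to_border=False):
    # mask: 2D array; nonzero treated as 1. Reference sketch only, not the library code.
    H, W = mask.shape
    x = np.pad((mask != 0).astype(np.float32), ((H, H), (W, W)))   # pad by a full image size
    if fade_to_border:
        h_clip = int(H + blur // 2)
        w_clip = int(W + blur // 2)
        x[:h_clip, :] = 0;  x[-h_clip:, :] = 0
        x[:, :w_clip] = 0;  x[:, -w_clip:] = 0
    if blur > 0:
        x = gaussian_filter(x, blur * 0.250)
    return x[H:-H, W:-W]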

View file

@@ -0,0 +1,17 @@
from ..Tensor import Tensor
from .any_wise import any_wise
def cast(input_t : Tensor, dtype, output_t:Tensor=None) -> Tensor:
"""
cast operator
arguments
input_t
dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)

View file

@@ -0,0 +1,70 @@
from ..backend import Kernel
from ..HArgs import HArgs
from ..HType import HType
from ..HKernel import HKernel
from ..info import ConcatInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments
tensor_list Iterable
axis Int
dtype np.dtype
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
tensor_list = tuple(tensor_list)
HArgs.check_zero_get_length(tensor_list)
HArgs.check_all_tensors(tensor_list)
device = HArgs.check_get_same_device(tensor_list)
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
for forward_krn,t in zip(op.forward_krns,tensor_list):
device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) )
return output_t
class _ConcatOp:
def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = ConcatInfo(shape_list, axis)
self.forward_krns = forward_krns = []
for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)):
forward_krn = Kernel(f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', shape, dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)}
i{info.axis} += {info.axis_offsets[i]};
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) );
}}
""")
forward_krns.append(forward_krn)

View file

@@ -0,0 +1,107 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo, Conv2DInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def depthwise_conv2D (input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None):
"""
Depthwise Conv2D operator.
input_t Tensor (...,H,W)
kernel_t Tensor (...,H,W)
stride(1) int
dilation(1) int
padding(same) 'valid' no padding
'same' output size will be the same as the input
(or the input size divided by stride)
int padding value for all sides
Iterable of 4 ints
paddings for left,top,right,bottom sides
...-head part of shapes will be broadcasted to each other
"""
op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())
return output_t
class _DepthwiseConv2DOp():
def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim < 2:
raise ValueError(f'i_shape.ndim must be >= 2')
if k_shape.ndim < 2:
raise ValueError(f'k_shape.ndim must be >= 2')
IH,IW = i_shape[-2:]
KH,KW = k_shape[-2:]
ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)
if i_shape.ndim == 2 and k_shape.ndim == 2:
# nothing to broadcast
i_br_shape = i_shape
k_br_shape = k_shape
o_shape = AShape([ci.OH, ci.OW])
else:
op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])
i_br_shape = op.br_shapes[0] + i_shape[-2:]
k_br_shape = op.br_shapes[1] + k_shape[-2:]
o_shape = op.o_shape + [ci.OH, ci.OW]
self.o_shape = o_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('K', k_br_shape, k_dtype)}
#define PADL {ci.PADL}
#define PADT {ci.PADT}
#define STRIDE {stride}
#define DILATION {dilation}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
float v = 0.0;
{'#pragma unroll' if KH <= 9 else ''}
for (int km2=0; km2<Km2; ++km2)
{{
int im2 = -PADT + km2*DILATION + om2*STRIDE;
if (im2 >= 0 & im2 < Im2)
{'#pragma unroll' if KW <= 9 else ''}
for (int km1=0; km1<Km1; ++km1)
{{
int im1 = -PADL + km1*DILATION + om1*STRIDE;
if (im1 >= 0 & im1 < Im1)
v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
*K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
}}
}}
O_GLOBAL_STORE(gid, (O_TYPE) v);
}}
""")

View file

@@ -0,0 +1,39 @@
import numpy as np
from ..Tensor import Tensor
from .depthwise_conv2D import depthwise_conv2D
def gaussian_blur (input_t : Tensor, sigma, dtype=None) -> Tensor:
"""
arguments
input_t Tensor(...,H,W)
sigma float
"""
if sigma <= 0.0:
return input_t.copy() #TODO
device = input_t.get_device()
key = (gaussian_blur, sigma)
kernel_t = device.get_cached_data(key)
if kernel_t is None:
kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
device.set_cached_data(key, kernel_t)
output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
return output_t
def _make_gaussian_kernel(sigma : float, dtype):
kernel_size = max(3, int(2 * 2 * sigma))
if kernel_size % 2 == 0:
kernel_size += 1
mean = np.floor(0.5 * kernel_size)
kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)])
np_kernel = np.outer(kernel_1d, kernel_1d)
kernel = np_kernel / np.sum(np_kernel)
return kernel.astype(dtype)
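The kernel construction can be checked on its own; a standalone copy of the same math:

import numpy as np

def make_gaussian_kernel(sigma, dtype=np.float32):
    # Same construction as _make_gaussian_kernel above: odd size ~ 4*sigma, separable outer product.
    kernel_size = max(3, int(2 * 2 * sigma))
    if kernel_size % 2 == 0:
        kernel_size += 1
    mean = np.floor(0.5 * kernel_size)
    kernel_1d = np.array([np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2))
                          for x in range(kernel_size)])
    kernel = np.outer(kernel_1d, kernel_1d)
    return (kernel / kernel.sum()).astype(dtype)

k = make_gaussian_kernel(2.0)
print(k.shape, k.sum())   # (9, 9) and ~1.0 -- a normalized 2D Gaussian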

View file

@@ -0,0 +1,158 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def matmul(a_t : Tensor, b_t : Tensor, output_t: Tensor=None, is_add_to_output=False) -> Tensor:
"""
matmul operator in row-major format
A(...,M,K) x
B(...,K,N) =
(...,M,N)
arguments
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output)
def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
"""
matmul operator in col-major format
A(...,K,M) x
B(...,N,K) =
(...,N,M)
arguments
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
device = HArgs.check_get_same_device([a_t, b_t])
op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=device )
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer(), )
return output_t
class _MatmulOp:
def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output):
a_dtype = np.dtype(a_dtype).type
b_dtype = np.dtype(b_dtype).type
if a_dtype != np.float32 or b_dtype != np.float32:
raise ValueError('matmul works only with float32 tensors.')
if a_shape.ndim != b_shape.ndim:
raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}')
ndim = a_shape.ndim
if ndim < 2:
raise ValueError('Tensors ndim must be at least 2.')
K, M = a_shape[-2], a_shape[-1]
N, B_COLS = b_shape[-2], b_shape[-1]
if K != B_COLS:
raise ValueError('A_ROWS != B_COLS')
BATCH = a_shape[0:-2].size
B_BATCH = b_shape[0:-2].size
if BATCH != B_BATCH:
raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}')
if ndim == 2:
self.o_shape = AShape( (N, M) )
else:
self.o_shape = AShape( a_shape[:-2]+(N, M) )
self.o_dtype = np.float32
self.M = M
self.N = N
self.K = K
# Determining optimal tile widths
for MW in [8,4,2,1]:
if M % MW == 0:
break
for KW in [8,4,2,1]:
if N % KW == 0 and K % KW == 0:
break
NW = KW
self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f"""
#define K {K}
#define N {N}
#define MW {MW} // M tile Width
#define NW {NW} // N tile Width -- NW & KW should be the same !
#define KW {KW} // K tile Width
#define MT {M//MW} // MT is max for 'mt' (M tile count)
#define KT {K//KW} // KT is max for 'kt' (K tile count)
#define floatMW { f'float{MW}' if MW != 1 else 'float'}
#define floatKW { f'float{KW}' if KW != 1 else 'float'}
__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
{{
size_t mt = get_global_id(0); //global M-tile id
size_t nc = get_global_id(1); //global N-tile id
size_t batch = get_global_id(2);
float AT[KW][MW]; // sub tiles
float BT[NW][KW];
float CT[NW][MW];
#pragma unroll
for (uint i=0; i<NW*MW; ++i) // zero CT tile
((float*) CT)[i] = 0.0;
for (uint kt=0; kt<KT; ++kt) // iterate over K-dim tiles
{{
#pragma unroll
for (uint k=0; k<KW; ++k) // every k-element inside K-dim tile
*( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats
#pragma unroll
for (uint n=0; n<NW; ++n) // every n-element inside N-dim tile
*( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats
#pragma unroll
for (uint k=0; k<KW; ++k)
#pragma unroll
for (uint n=0; n<NW; ++n) // sub tiles multiplication
#pragma unroll
for (uint m=0; m<MW; ++m)
CT[n][m] += AT[k][m] * BT[n][k];
}}
#pragma unroll
for (uint n=0; n<NW; ++n)
O[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='}
*( (floatMW*) CT[n]);
}}""")

View file

@@ -0,0 +1,71 @@
from typing import List
import numpy as np
from ..HType import HType
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import PadInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def pad(input_t : Tensor, axes_paddings : List, mode : str = 'constant', constant_value=0, dtype : np.dtype = None, output_t : Tensor=None) -> Tensor:
"""
arguments:
axes_paddings list of (l_pad, r_pad),
if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for the remaining axes
if [-1] == ... , same for the right side
dtype cast to dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _PadOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
_allow_modes = ['constant']
if mode not in _allow_modes:
raise ValueError(f'Allowed pads modes: {_allow_modes}')
if mode == 'constant':
if not HType.is_scalar_type(constant_value):
raise ValueError('constant_value must be scalar')
info = PadInfo(i_shape, axes_paddings)
self.o_shape = o_shape = info.o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})' for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
else
O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
//O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX_MOD({ ','.join(f' I{i} + ( (o{i}-{l_pad})*( ((o{i}-{l_pad})/I{i}) % 2 == 0 ? 1: -1) ) % I{i} ' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings) ) }) ) );
}}""")
#print(self.forward_krn)
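The Ellipsis convention in axes_paddings, illustrated with NumPy (the expansion below mirrors the docstring's description; PadInfo in the library does the real work):

import numpy as np

def expand_paddings(axes_paddings, ndim):
    # Expand a spec like (..., (1,1), (2,2)) into one (before, after) pair per axis for np.pad.
    pads = list(axes_paddings)
    if pads and pads[0] is Ellipsis:
        pads = [(0, 0)] * (ndim - (len(pads) - 1)) + pads[1:]
    elif pads and pads[-1] is Ellipsis:
        pads = pads[:-1] + [(0, 0)] * (ndim - (len(pads) - 1))
    return pads

x = np.ones((2, 3, 4), dtype=np.float32)
print(np.pad(x, expand_paddings((..., (1, 1), (2, 2)), x.ndim)).shape)   # (2, 5, 8)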

View file

@@ -0,0 +1,214 @@
import math
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import ReductionInfo, TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .slice_ import slice_
from .transpose import transpose
from .any_wise import square, sqrt
def reduce_mean (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce mean operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_std(input_t, axes=None, keepdims=False):
"""
Reduce std operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return sqrt(reduce_variance(input_t, axes, keepdims))
def reduce_variance(input_t, axes=None, keepdims=False):
"""
Reduce variance operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims=True)
return reduce_mean(square(input_t - mean), axes, keepdims)
def moments(input_t, axes=None, keepdims=False):
"""
Returns (mean, variance) of input_t
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims)
mean_shape_keepdims = mean._op.info.o_shape_kd
var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
return mean, var
def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce min operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_max (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce max operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_sum (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
"""
Reduce sum operator.
input_t Tensor
axes(None) int
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
return reduce_op ('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
def reduce_op (op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False):
"""
arguments
op_type 'sum' 'mean' 'min' 'max'
output_t compute result to this Tensor.
The Tensor may have a different shape,
but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims)
if output_t is None:
output_t = Tensor ( op.info.o_shape, input_t.dtype, device=input_t.get_device() )
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
# Make an intermediate tensor
input_t_inter = transpose(input_t, op.intermediate_transpose_axes)
# Perform multistage inplace operation in intermediate tensor
for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)):
input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS),
global_shape=shape)
if op_type == 'mean':
# divide values in ROWS by number of COLS
input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) )
# Fetch final tensor from zero indexes using slices argument
slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output)
return output_t
class _ReduceOp:
def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False):
self.op_type = op_type
self.info = info = ReductionInfo(i_shape, axes, keepdims)
# Determine transpose order for intermediate tensor, where reduction axes will be at the end
self.intermediate_transpose_axes = info.o_axes + info.reduction_axes
self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape
# slices argument to fetch processed tensor from zero indexes
self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim
# COLS are reduction axes, ROWS are remaining axes
rows_ndim = info.o_axes.ndim
self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size
self.COLS = COLS = self.intermediate_shape[rows_ndim:].size
# Number of stages to operate COLS
n_stages = (COLS-1).bit_length()
self.forward_krn_shapes = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ]
self.forward_krn_stage_cols = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage ) ) for stage in range(n_stages) ]
self.forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS)
{{
size_t gid = get_global_id(0);
size_t col = gid % STAGE_COLS;
size_t row = gid / STAGE_COLS;
size_t i_idx = row*COLS + col;
size_t other_col = col + STAGE_COLS;
if (other_col < STAGE_VALID_COLS)
{{
I_TYPE val_a = I_GLOBAL_LOAD(i_idx);
I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col);
{'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else
'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else
'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else ''
}
I_GLOBAL_STORE(i_idx, val_x);
}}
}}
""")
self.mean_div_forward_krn = Kernel(f"""
{HKernel.define_tensor('I', (1,), i_dtype)}
__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS)
{{
size_t row = get_global_id(0);
I_GLOBAL_STORE(row*COLS, I_GLOBAL_LOAD(row*COLS) / COLS );
}}
""")

View file

@@ -0,0 +1,103 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import BroadcastInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def remap (input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor:
"""
remap input_t in spatial axes using coords_t
arguments
input_t Tensor( ...,IH,IW )
coords_t Tensor( ...,OH,OW,D )
OH - output height
OW - output width
D is (2)[x,y] coords
dtype
...-head part of shapes will be broadcasted to each other
"""
op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer())
return output_t
class _RemapOp():
def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if np.dtype(c_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of c_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
if c_shape.ndim < 3:
raise ValueError(f'Coords shape ndim must be >= 3(...,H,W,D)')
if c_shape[-1] != 2:
raise ValueError('Last coords dim must be == 2 (x,y)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if i_shape.ndim == 2 and c_shape.ndim == 3:
# nothing to broadcast
i_br_shape = i_shape
c_br_shape = c_shape
o_shape = c_shape[-3:-1]
else:
op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ])
i_br_shape = op.br_shapes[0] + i_shape[-2:]
c_br_shape = op.br_shapes[1] + c_shape[-3:]
o_shape = op.o_shape + c_shape[-3:-1]
self.o_shape = o_shape
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_br_shape, i_dtype)}
{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)}));
float cx01 = (float) c_value.x;
float cy01 = (float) c_value.y;
float cx0f = floor(cx01); int cx0 = (int)cx0f;
float cy0f = floor(cy01); int cy0 = (int)cy0f;
float cx1f = cx0f+1; int cx1 = (int)cx1f;
float cy1f = cy0f+1; int cy1 = (int)cy1f;
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")

View file

@@ -0,0 +1,96 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor:
"""
remap affine operator for all channels using single numpy affine mat
arguments
input_t Tensor (...,H,W)
affine_n np.array (2,3)
dtype
"""
if affine_n.shape != (2,3):
raise ValueError('affine_n.shape must be (2,3)')
op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)
output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
((a, b, c),
(d, e, f)) = affine_n
if not inverse:
# do inverse by default, match cv2.warpAffine behaviour
D = a*e - b*d
D = 1.0 / D if D != 0.0 else 0.0
a, b, c, d, e, f = ( e*D, -b*D, (b*f-e*c)*D ,
-d*D, a*D, (d*c-a*f)*D )
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )
return output_t
class _RemapAffineOp():
def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
IH,IW = i_shape[-2:]
if o_size is not None:
OH,OW = o_size
else:
OH,OW = IH,IW
o_shape = AShape( (OH,OW) )
if i_shape.ndim > 2:
o_shape = i_shape[:-2] + o_shape
self.o_shape = o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
float a, float b, float c,
float d, float e, float f)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
float cx01 = om1*a + om2*b + c;
float cy01 = om1*d + om2*e + f;
float cx0f = floor(cx01); int cx0 = (int)cx0f;
float cy0f = floor(cy01); int cy0 = (int)cy0f;
float cx1f = cx0f+1; int cx1 = (int)cx1f;
float cy1f = cy0f+1; int cy1 = (int)cy1f;
float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
}}
""")

View file

@@ -0,0 +1,26 @@
from typing import Iterable
from ..Tensor import Tensor
from ..SCacheton import SCacheton
from ..info import ReshapeInfo
def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
"""
reshape operator
arguments
new_shape Iterable of ints
copy(True) if True, produces new Tensor
otherwise result tensor points to the same memory
Produces reference Tensor with new shape.
"""
info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )
if copy:
return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
return input_t.as_shape( info.o_shape )

View file

@@ -0,0 +1,67 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments:
input_t input tensor
slices argument received from class.__getitem__(slices)
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
Remark.
Slicing logic is not the same as numpy:
For example the numpy slice [2:0:1] produces an empty array,
but this slice_ operator will select index 2, same as val_t[2].
"""
op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output )
o_shape = op.slice_info.o_shape
if output_t is None:
if op.slice_info.just_reshaped:
return input_t.reshape(o_shape)
else:
output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != o_shape.size:
raise ValueError(f'output_t must have size {o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _SliceOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output):
self.slice_info = slice_info = SliceInfo(i_shape, slices)
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f"""
{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)}
{chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes) ) }
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) );
}}
""")

View file

@@ -0,0 +1,73 @@
import numpy as np
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo, SliceInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def slice_set(input_t : Tensor, slices, value) -> Tensor:
"""
arguments:
input_t input tensor
slices argument received from class.__getitem__(slices)
value
Remark.
"""
if HType.is_scalar_type(value):
v_shape = None
v_dtype = None
v_scalar = value
elif not isinstance(value, Tensor):
value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device())
v_shape = value.shape
v_dtype = value.dtype
v_scalar = None
op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) )
if v_scalar is not None:
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() )
else:
input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() )
return input_t
class _SliceSetOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices):
slice_info = SliceInfo(i_shape, slices)
if v_scalar is None:
if v_shape.ndim > i_shape.ndim:
raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}')
# Check that v_shape can broadcast with slice_info.shape
br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape])
v_br_shape = br_info.br_shapes[1]
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', i_shape, i_dtype )}
{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME
{', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)}
if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] +
[f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } )
O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} );
}}
""")

View file

@@ -0,0 +1,74 @@
import numpy as np
from typing import List
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import StackInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False):
"""
Stack operator.
arguments:
tensor_list List of Tensors
axis Int
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
HArgs.check_zero_get_length(tensor_list)
HArgs.check_all_tensors(tensor_list)
device = HArgs.check_get_same_device(tensor_list)
shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
for i, krn in enumerate(op.forward_krns):
device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) )
return output_t
class _StackOp:
def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output):
self.stack_count = stack_count = len(shape_list)
i_shape = shape_list[0]
if not all (s == i_shape for s in shape_list):
raise ValueError('All shapes must be the same')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = StackInfo(i_shape, axis, stack_count)
self.forward_krns = forward_krns = []
for i_dtype in dtype_list:
forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', info.o_shape, o_dtype )}
{HKernel.define_tensor('I', i_shape, i_dtype )}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)}
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) );
}}
"""))

View file

@@ -0,0 +1,58 @@
import numpy as np
from typing import List
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TileInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False):
"""
Tile operator
arguments
tiles Iterable of ints
dtype
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
gradfn will not be set.
is_add_to_output add result to output_t if output_t is set.
"""
op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output)
if output_t is None:
output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.info.o_shape.size:
raise ValueError(f'output_t must have size {op.info.o_shape.size}')
input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer())
return output_t
class _TileOp:
def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output):
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
self.info = info = TileInfo(i_shape, tiles)
self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('I', i_shape, i_dtype)}
{HKernel.define_tensor('O', info.o_shape, o_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)}
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) );
}}
""")

View file

@@ -0,0 +1,68 @@
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HKernel import HKernel
from ..info import TransposeInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor:
"""
arguments:
axes_order Int
Iterable of ints
None
dtype cast to dtype
op_text(None) optional op with value during transpose.
'O = I'
output_t compute result to this Tensor.
The Tensor may have a different shape, but its total size must match.
"""
op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output )
if output_t is None:
output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
elif output_t.shape.size != op.o_shape.size:
raise ValueError(f'output_t must have size {op.o_shape.size}')
input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
return output_t
class _TransposeOp:
def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ):
self.axes_order = axes_order
self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
if op_text is None:
op_text = 'O = I'
self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('I', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)}
I_TYPE I = I_GLOBAL_LOAD(gid);
O_TYPE O;
{op_text};
{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O );
}}""")

Binary file not shown.

View file

@@ -0,0 +1,72 @@
import numpy as np
from ..AShape import AShape
from ..initializer import InitCoords2DArange
from ..SCacheton import SCacheton
from ..Tensor import Tensor
from .matmul import matmul
from .remap import remap
def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor:
"""
arguments
input_t Tensor(...,H,W)
affine_t Tensor(...,2,3)
affine matrix
example of identity affine matrix
[1,0,0],
[0,1,0]
...-head part of shapes will be broadcasted to each other
output_size(None)
tuple of 2 ints (HW)
of output size
if None , size will not be changed
"""
op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype, affine_t.shape, affine_t.dtype, output_size)
affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) )
coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init )
coords_t = coords_t.reshape(op.coords_reshape)
coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape)
output_t = remap(input_t, coords_t, dtype=dtype)
return output_t
class _WarpAffineOp():
def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size):
if np.dtype(i_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of i_dtype is not supported.')
if np.dtype(a_dtype).type == np.bool_:
raise ValueError('np.bool_ dtype of a_dtype is not supported.')
if i_shape.ndim < 2:
raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
if a_shape.ndim < 2:
raise ValueError(f'a_shape.ndim must be >= 2 (...,2,3)')
if a_shape[-2] != 2 or a_shape[-1] != 3:
raise ValueError('Last a_shape dims must be == (...,2,3)')
IH,IW = i_shape[-2:]
if o_size is not None:
OH,OW = o_size
else:
OH,OW = IH,IW
self.coords_shape = AShape( (OH,OW,3) )
self.coords_affined_shape = AShape( (OH,OW,2) )
if a_shape.ndim > 2:
self.coords_shape = a_shape[:-2] + self.coords_shape
self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape
self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1)
self.coords_reshape = (-1,OH*OW,3)
self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)
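A NumPy sketch of the coordinate pipeline warp_affine assembles: a homogeneous (x, y, 1) grid, multiplied by the (transposed) 2x3 affine to produce per-pixel sampling coordinates that are then handed to remap. The grid layout mirroring InitCoords2DArange is an assumption here:

import numpy as np

def affine_coords_np(affine_2x3, OH, OW):
    # Build an (OH, OW, 3) grid of (x, y, 1) and apply the affine as an (OH*OW, 3) @ (3, 2) matmul,
    # mirroring the matmul + reshape steps above (sketch only).
    ys, xs = np.meshgrid(np.arange(OH, dtype=np.float32),
                         np.arange(OW, dtype=np.float32), indexing='ij')
    grid = np.stack([xs, ys, np.ones_like(xs)], axis=-1)                    # (OH, OW, 3)
    coords = grid.reshape(-1, 3) @ np.asarray(affine_2x3, np.float32).T     # (OH*OW, 2)
    return coords.reshape(OH, OW, 2)                                        # (x, y) coords for remap

ident = np.array([[1, 0, 0], [0, 1, 0]], np.float32)
print(affine_coords_np(ident, 2, 3)[..., 0])   # x coords: [[0. 1. 2.] [0. 1. 2.]]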