add xlib.avecl

2025-08-19 13:09:58 -07:00 · 2021-09-30 18:21:30 +04:00 · 2021-09-30 18:21:30 +04:00 · 0058474da7
commit 0058474da7
parent 932edfe875
56 changed files with 5569 additions and 0 deletions
--- a/xlib/avecl/init.py
+++ b/xlib/avecl/init.py
@ -0,0 +1,31 @@
+"""
+AveCL ! Make OpenCL great again.
+
+Lightweight ndarray library using OpenCL 1.2 written in pure python.
+Applicable for high-performance general purpose n-dim array computations for every device that supports OpenCL 1.2.
+
+Works in python 3.5+. Dependencies: numpy.
+
+This lib uses relative import, thus you can place it in any subfolder.
+
+made by @iperov from scratch.
+"""
+
+from ._internal.AAxes import AAxes
+from ._internal.AShape import AShape
+from ._internal.backend import (Device, DeviceInfo, Kernel,
+                                get_available_devices_info, get_best_device,
+                                get_default_device, get_device,
+                                set_default_device)
+from ._internal.HArgs import HArgs
+from ._internal.HKernel import HKernel
+from ._internal.HTensor import HTensor
+from ._internal.HType import HType
+from ._internal.initializer import (InitCoords2DArange, Initializer,
+                                    InitRandomUniform)
+from ._internal.NCore import NCore
+from ._internal.NTest import NTest
+from ._internal.op import *
+from ._internal.SCacheton import SCacheton
+from ._internal.Tensor import Tensor
+from ._internal.TensorImpl import *
--- a/xlib/avecl/_internal/AAxes.py
+++ b/xlib/avecl/_internal/AAxes.py
@ -0,0 +1,169 @@
+from collections import Iterable
+
+class AAxes(Iterable):
+    __slots__ = ['axes','ndim','_inversed']
+
+    def __init__(self, axes, shape_ndim=None):
+        """
+        Constructs AAxes from user argument
+
+        arguments
+
+        axes        AAxes
+                    Int
+                    Iterable of ints
+                    None
+
+        shape_ndim(None)    provide shape_ndim if axes contain negative values
+
+        can raise an errors during the construction
+
+        AAxes supports:
+
+        A+B : concat A_axes with B_axes
+
+        A-B : removes B_axes from A_axes
+        """
+
+        if isinstance(axes, AAxes):
+            self.axes = axes.axes
+            self.ndim = axes.ndim
+            self._inversed = axes._inversed
+        elif axes is None:
+            self.axes = None
+            self.ndim = None
+            self._inversed = None
+        else:
+            if not isinstance(axes, Iterable):
+                axes = (axes,)
+
+            if isinstance(axes, Iterable):
+                valid_axes = []
+                for x in axes:
+                    if x is None:
+                        raise ValueError(f'Incorrent value {x} in axes {axes}')
+                    x = int(x)
+                    if x < 0:
+                        if shape_ndim is None:
+                            raise ValueError(f'Incorrent value {x} in axes {axes}, or provide shape_ndim')
+                        x = shape_ndim + x
+
+                    if x in valid_axes:
+                        raise ValueError(f'Axes must contain unique values.')
+                    valid_axes.append(x)
+
+                self.axes = tuple(valid_axes)
+                self.ndim = len(self.axes)
+                self._inversed = None
+
+    def is_none_axes(self):
+        """
+        returns True if AAxes is constructed with (None) argument, i.e. all-axes
+        """
+        return self.axes is None
+
+    def sorted(self) -> 'AAxes':
+        """
+        returns sorted AAxes
+        """
+        return AAxes(sorted(self.axes))
+
+    def swapped_axes(self, axis_a, axis_b) -> 'AAxes':
+        x = list(self.axes)
+        if axis_a < 0:
+            axis_a = len(x) + axis_a
+        if axis_b < 0:
+            axis_b = len(x) + axis_b
+
+        x[axis_b], x[axis_a] = x[axis_a], x[axis_b]
+
+        return AAxes( tuple(x) )
+
+    def inversed(self) -> 'AAxes':
+        """
+        Returns inversed axes order
+
+        Example:
+
+         for (0,2,3,1)  returns (0,3,1,2)
+        """
+        if self.is_none_axes():
+            raise Exception(f'none-axes does not support inversed(). Handle none-axes by calling .is_none_axes()')
+
+        if self._inversed is None:
+            x = { axis:i for i,axis in enumerate(self.axes) }
+            t = []
+            for i in range(self.ndim):
+                axis = x.get(i, None)
+                if axis is None:
+                    raise Exception(f'axes {self.axes} are inconsistent to do inverse order.')
+                t.append(axis)
+            self._inversed = AAxes(t)
+
+        return self._inversed
+
+
+    def __hash__(self): return self.axes.__hash__()
+    def __eq__(self, other):
+        if isinstance(other, AAxes):
+            return self.axes == other.axes
+        elif isinstance(other, Iterable):
+            return self.axes == tuple(other)
+        return False
+    def __iter__(self):
+        if self.is_none_axes():
+            raise Exception(f'none-axes does not support iteration. Handle none-axes by calling .is_none_axes()')
+        return self.axes.__iter__()
+
+    def __len__(self): return self.ndim
+    def __getitem__(self,key):
+        if self.is_none_axes():
+            raise Exception(f'none-axes does not support indexing. Handle none-axes by calling .is_none_axes()')
+
+        elif isinstance(key, slice):
+            return AAxes(self.axes[key])
+
+        return self.axes[key]
+
+    def __radd__(self, o):
+        if isinstance(o, Iterable):
+            return AAxes( tuple(o) + self.axes)
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes append')
+    def __add__(self, o):
+        if isinstance(o, Iterable):
+            return AAxes( self.axes + tuple(o) )
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes append')
+
+    def __rsub__(self, o):
+        if isinstance(o, Iterable):
+            new_axes = []
+            for axis in o:
+                if axis not in self.axes:
+                    new_axes.append(axis)
+
+            return AAxes(new_axes)
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes substraction')
+
+    def __sub__(self, o):
+        if isinstance(o, Iterable):
+            new_axes = []
+            o_axes = tuple(o)
+            for axis in self.axes:
+                if axis not in o_axes:
+                    new_axes.append(axis)
+
+            return AAxes(new_axes)
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AAxes substraction')
+
+    def __str__(self):
+        if self.is_none_axes():
+            return '(None)'
+        return str(self.axes)
+
+    def __repr__(self): return 'AAxes' + self.__str__()
+
+__all__ = ['AAxes']
--- a/xlib/avecl/_internal/AShape.py
+++ b/xlib/avecl/_internal/AShape.py
@ -0,0 +1,103 @@
+from collections import Iterable
+from .AAxes import AAxes
+
+class AShape(Iterable):
+    __slots__ = ['shape','size','ndim']
+
+    def __init__(self, shape):
+        """
+        Constructs valid shape from user argument
+
+        arguments
+
+        shape      AShape
+                    Iterable
+
+        can raise ValueError during the construction
+        """
+
+        if isinstance(shape, AShape):
+            self.shape = shape.shape
+            self.size = shape.size
+            self.ndim = shape.ndim
+        else:
+            if isinstance(shape, (int,float) ):
+                shape = (int(shape),)
+
+            if isinstance(shape, Iterable):
+                size = 1
+                valid_shape = []
+                for x in shape:
+                    if x is None:
+                        raise ValueError(f'Incorrent value {x} in shape {shape}')
+                    x = int(x)
+                    if x < 1:
+                        raise ValueError(f'Incorrent value {x} in shape {shape}')
+                    valid_shape.append(x)
+                    size *= x # Faster than np.prod()
+
+                self.shape = tuple(valid_shape)
+                self.ndim = len(self.shape)
+                if self.ndim == 0:
+                    # Force (1,) shape for scalar shape
+                    self.ndim = 1
+                    self.shape = (1,)
+                self.size = size
+            else:
+                raise ValueError('Invalid type to create AShape')
+
+    def axes_arange(self) -> AAxes:
+        """
+        Returns tuple of axes arange.
+
+         Example (0,1,2) for ndim 3
+        """
+        return AAxes(range(self.ndim))
+
+    def transpose_by_axes(self, axes) -> 'AShape':
+        """
+        Same as AShape[axes]
+
+        Returns AShape transposed by axes.
+
+         axes       AAxes
+                    Iterable(list,tuple,set,generator)
+        """
+        return AShape(self.shape[axis] for axis in AAxes(axes) )
+
+    def __hash__(self): return self.shape.__hash__()
+    def __eq__(self, other):
+        if isinstance(other, AShape):
+            return self.shape == other.shape
+        elif isinstance(other, Iterable):
+            return self.shape == tuple(other)
+        return False
+    def __iter__(self): return self.shape.__iter__()
+    def __len__(self): return len(self.shape)
+    def __getitem__(self,key):
+        if isinstance(key, Iterable):
+            if isinstance(key, AAxes):
+                if key.is_none_axes():
+                    return self
+            return self.transpose_by_axes(key)
+        elif isinstance(key, slice):
+            return AShape(self.shape[key])
+
+        return self.shape[key]
+
+    def __radd__(self, o):
+        if isinstance(o, Iterable):
+            return AShape( tuple(o) + self.shape)
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AShape append')
+    def __add__(self, o):
+        if isinstance(o, Iterable):
+            return AShape( self.shape + tuple(o) )
+        else:
+            raise ValueError(f'unable to use type {o.__class__} in AShape append')
+
+
+    def __str__(self):  return str(self.shape)
+    def __repr__(self): return 'AShape' + self.__str__()
+
+__all__ = ['AShape']
--- a/xlib/avecl/_internal/HArgs.py
+++ b/xlib/avecl/_internal/HArgs.py
@ -0,0 +1,106 @@
+from typing import List
+
+import numpy as np
+
+from .backend import Device
+from .HTensor import HTensor
+from .HType import HType
+from .Tensor import Tensor
+from .AShape import AShape
+
+class HArgs:
+    """
+    Helper functions for list of arguments
+    """
+
+    @staticmethod
+    def decompose(args):
+        """
+        decompose list of args of Tensor and supported numeric values
+
+         returns ( shape_list,   #  if scalar value -> shape is None
+                   dtype_list,   #
+                   kernel_args_list   #
+                 )
+        """
+        shape_list = []
+        dtype_list = []
+        kernel_args_list = []
+        for arg in args:
+
+            if isinstance(arg, Tensor):
+                shape_list.append(arg.shape)
+                dtype_list.append(arg.dtype)
+                kernel_args_list.append(arg.get_buffer())
+            else:
+
+                if isinstance(arg, int):
+                    dtype, arg = np.int32, np.int32(arg)
+                elif isinstance(arg, float):
+                    dtype, arg = np.float32, np.float32(arg)
+                elif HType.is_obj_of_np_scalar_type(arg):
+                    dtype = arg.__class__
+                else:
+                    raise ValueError(f'Unsupported type of arg: {arg.__class__} Use Tensor or number type.')
+
+                shape_list.append(None)
+                dtype_list.append(dtype)
+                kernel_args_list.append(arg)
+
+        return tuple(shape_list), tuple(dtype_list), tuple(kernel_args_list)
+
+    @staticmethod
+    def get_shapes(args : List[Tensor]) -> List[AShape]:
+        """
+        """
+        return tuple(t.shape for t in args)
+
+    @staticmethod
+    def check_zero_get_length(args) -> int:
+        """
+        raises an error if len(args) == 0, otherwise returns len
+        """
+        args_len = len(args)
+        if len(args) == 0:
+            raise ValueError('args must be specified')
+        return args_len
+
+    @staticmethod
+    def check_get_same_device(args : List[Tensor]) -> Device:
+        """
+        check all device of tensors are the same and return the device
+        """
+        result = HTensor.all_same_device(args)
+        if not result:
+            raise ValueError('all Tensors must have the same device')
+        return args[0].get_device()
+
+    @staticmethod
+    def check_all_tensors(args : List[Tensor]):
+        """
+        """
+        if not all (isinstance(tensor, Tensor) for tensor in args):
+            raise ValueError('All values must have type of Tensor')
+
+    @staticmethod
+    def check_get_same_shape(args : List[Tensor]) -> AShape:
+        """
+        check all shapes of tensors are the same and return the shape
+        """
+        shape = args[0].shape
+        if not all (t.shape == shape for t in args):
+            raise ValueError('All tensors must have the same shape')
+        return shape
+
+
+    @staticmethod
+    def filter_tensor(args, raise_on_empty : bool):
+        """
+        get only tensors from the list
+        """
+        tensor_args = [arg for arg in args if isinstance(arg, Tensor) ]
+        if raise_on_empty and len(tensor_args) == 0:
+            raise ValueError('At least one arg must be a Tensor')
+        return tensor_args
+
+__all__ = ['HArgs']
--- a/xlib/avecl/_internal/HKernel.py
+++ b/xlib/avecl/_internal/HKernel.py
@ -0,0 +1,496 @@
+import numpy as np
+
+class HKernel:
+    """
+    Helper functions for Kernels
+    """
+
+    _np_dtype_to_cl = { np.bool_   : 'bool',
+                        np.int8    : 'char',
+                        np.uint8   : 'uchar',
+                        np.int16   : 'short',
+                        np.uint16  : 'ushort',
+                        np.int32   : 'int',
+                        np.uint32  : 'uint',
+                        np.int64   : 'long',
+                        np.uint64  : 'ulong',
+                        np.float16 : 'half',
+                        np.float32 : 'float',
+                        np.float64 : 'double'
+                      }
+
+
+
+    @staticmethod
+    def np_dtype_to_cl(dtype : np.dtype):
+        """
+        returns string opencl type  from numpy dtype
+
+        example np.float32  -> 'float'
+                np.uint8    -> 'unsigned char'
+        """
+        return HKernel._np_dtype_to_cl[np.dtype(dtype).type]
+
+    @staticmethod
+    def define_scalar_func_arg(name, dtype : np.dtype):
+        """
+        """
+        return f'{HKernel._np_dtype_to_cl[np.dtype(dtype).type]} {name}'
+
+    @staticmethod
+    def define_tensor_type(name, dtype : np.dtype):
+        """
+        Returns a definitions for operations with tensor
+
+        example for 'O', np.float16:
+
+        #define O_PTR_NAME p_O
+        #define O_PTR_TYPE half
+        #define O_PTR_TYPE2 half2
+        #define O_PTR_TYPE3 half3
+        #define O_PTR_TYPE4 half4
+        #define O_PTR_TYPE8 half8
+        #define O_PTR_TYPE16 half16
+        #define O_TYPE float
+        #define O_TYPE2 float2
+        #define O_TYPE3 float3
+        #define O_TYPE4 float4
+        #define O_TYPE8 float8
+        #define O_TYPE16 float16
+        #define O_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_TO_FLOATX(x) ((float)x)
+        """
+        name_upper = name.upper()
+
+        dtype = np.dtype(dtype).type
+
+        out = [f'#define {name.upper()}_PTR_NAME p_{name.upper()}']
+
+        if dtype == np.float16:
+            out += [f'#define {name_upper}_PTR_TYPE half']
+            out += [f'#define {name_upper}_PTR_TYPE2 half2']
+            out += [f'#define {name_upper}_PTR_TYPE3 half3']
+            out += [f'#define {name_upper}_PTR_TYPE4 half4']
+            out += [f'#define {name_upper}_PTR_TYPE8 half8']
+            out += [f'#define {name_upper}_PTR_TYPE16 half16']
+            out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(np.float32)}']
+            out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(np.float32)}2']
+            out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(np.float32)}3']
+            out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(np.float32)}4']
+            out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(np.float32)}8']
+            out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(np.float32)}16']
+
+
+            out += [f'#define {name_upper}_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+
+            out += [f'#define {name_upper}_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+            out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&{name_upper}_PTR_NAME[(offset)]) )']
+
+        else:
+            out += [f'#define {name_upper}_PTR_TYPE {HKernel.np_dtype_to_cl(dtype)}']
+            out += [f'#define {name_upper}_PTR_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2']
+            out += [f'#define {name_upper}_PTR_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3']
+            out += [f'#define {name_upper}_PTR_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4']
+            out += [f'#define {name_upper}_PTR_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8']
+            out += [f'#define {name_upper}_PTR_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
+            out += [f'#define {name_upper}_TYPE {HKernel.np_dtype_to_cl(dtype)}']
+            out += [f'#define {name_upper}_TYPE2 {HKernel.np_dtype_to_cl(dtype)}2']
+            out += [f'#define {name_upper}_TYPE3 {HKernel.np_dtype_to_cl(dtype)}3']
+            out += [f'#define {name_upper}_TYPE4 {HKernel.np_dtype_to_cl(dtype)}4']
+            out += [f'#define {name_upper}_TYPE8 {HKernel.np_dtype_to_cl(dtype)}8']
+            out += [f'#define {name_upper}_TYPE16 {HKernel.np_dtype_to_cl(dtype)}16']
+
+            out += [f'#define {name_upper}_GLOBAL_LOAD(offset)   {name_upper}_PTR_NAME[(offset)]']
+            out += [f'#define {name_upper}_GLOBAL_LOAD2(offset)  {name_upper}_PTR_NAME[(offset)]']
+            out += [f'#define {name_upper}_GLOBAL_LOAD3(offset)  {name_upper}_PTR_NAME[(offset)]']
+            out += [f'#define {name_upper}_GLOBAL_LOAD4(offset)  {name_upper}_PTR_NAME[(offset)]']
+            out += [f'#define {name_upper}_GLOBAL_LOAD8(offset)  {name_upper}_PTR_NAME[(offset)]']
+            out += [f'#define {name_upper}_GLOBAL_LOAD16(offset) {name_upper}_PTR_NAME[(offset)]']
+
+            out += [f'#define {name_upper}_GLOBAL_STORE(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
+            out += [f'#define {name_upper}_GLOBAL_STORE2(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
+            out += [f'#define {name_upper}_GLOBAL_STORE3(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
+            out += [f'#define {name_upper}_GLOBAL_STORE4(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
+            out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
+            out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
+
+        if dtype in [np.float32, np.float64]:
+            out += [f'#define {name_upper}_TO_FLOATX(x) x']
+        elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
+            out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
+        elif dtype in [np.int64,np.uint64]:
+            out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)']
+        return '\n'.join(out)
+
+    @staticmethod
+    def define_tensor_shape(name, shape, axes_symbols=None):
+        """
+        Returns a definitions for operations with tensor
+
+        example for 'O', (7,3),
+
+        #define O0 7
+        #define O1 3
+        #define Om1 3
+        #define Om2 7
+        #define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
+        #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
+        """
+        shape = tuple(shape)
+        ndim = len(shape)
+        name_upper = name.upper()
+        name_lower = name.lower()
+
+        if axes_symbols is None:
+            axes_symbols = "".join([str(i) for i in range(ndim)])
+        axes_symbols = axes_symbols.upper()
+
+        out = []
+        for i in range(ndim):
+            out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}']
+
+        for i in range(1,ndim+1):
+            out += [f'#define {name_upper}m{i} {shape[-i]}']
+
+        line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) '
+
+        for i in range(ndim):
+            if i == 0:
+                line += f'( (size_t)({name_lower}{i}) )'
+            else:
+                line += f'( {name_lower}{i} )'
+
+            for j in range(i+1,ndim):
+                line += f'*{shape[j]} '
+            if i != ndim-1:
+                line += '+'
+
+        out += [line]
+
+        line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) '
+
+        for i in range(ndim):
+            if i == 0:
+                line += f'( (size_t)({name_lower}{i}) % {shape[i]} )'
+            else:
+                line += f'( ({name_lower}{i}) % {shape[i]} )'
+
+            for j in range(i+1,ndim):
+                line += f'*{shape[j]} '
+            if i != ndim-1:
+                line += '+'
+
+        out += [line,'']
+
+        return '\n'.join(out)
+
+    @staticmethod
+    def define_tensor(name, shape, dtype : np.dtype, axes_symbols=None):
+        """
+        Returns a definitions for operations with tensor
+
+        arguments
+
+            name     text
+
+            shape           Iterable
+
+            dtype           np.dtype
+
+            axes_symbols(None)  string of symbols.
+                                None -> numeric symbols will be used
+
+        example for 'O', (2,4), np.float16
+
+        #define O_PTR_NAME p_O
+        #define O_PTR_TYPE half
+        #define O_PTR_TYPE2 half2
+        #define O_PTR_TYPE3 half3
+        #define O_PTR_TYPE4 half4
+        #define O_PTR_TYPE8 half8
+        #define O_PTR_TYPE16 half16
+        #define O_TYPE float
+        #define O_TYPE2 float2
+        #define O_TYPE3 float3
+        #define O_TYPE4 float4
+        #define O_TYPE8 float8
+        #define O_TYPE16 float16
+        #define O_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_TO_FLOATX(x) ((float)x)
+        #define O0 2
+        #define O1 4
+        #define Om1 4
+        #define Om2 2
+        #define O_IDX(o0,o1) ( (size_t)(o0) )*4 +( o1 )
+        #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 2 )*4 +( (o1) % 4 )
+        """
+        return'\n'.join([ HKernel.define_tensor_type(name, dtype),
+                          HKernel.define_tensor_shape(name, shape, axes_symbols)
+                        ])
+
+    @staticmethod
+    def define_axes_sizes(axis_letter, axes_sizes):
+        """
+        Returns definitions of axes sizes
+
+        example for 'O', (4,512,512)
+        #define O0 4
+        #define O1 512
+        #define O2 512
+        """
+        out = ""
+        axes_sizes = tuple(axes_sizes)
+        ndim = len(axes_sizes)
+        for i in range(ndim):
+            out += f'#define {axis_letter.upper()}{i} {axes_sizes[i]}\n'
+
+        return out
+
+    @staticmethod
+    def decompose_idx_to_axes_idxs(var_name, tensor_name, ndim):
+        """
+        decompose a size_t variable to axes indexes.
+        Keeps original variable untouched.
+
+        Example for 'gid','O',2
+
+        size_t gid_original = gid;
+        size_t o1 = gid % O1; gid /= O1;
+        #define om1 o1
+        size_t o0 = gid % O0;
+        #define om2 o0
+        gid = gid_original;
+        """
+        name_lower = tensor_name.lower()
+        name_upper = tensor_name.upper()
+
+        out = [f'size_t {var_name}_original = {var_name};']
+
+        for i in range(ndim-1,-1,-1):
+            line = f'size_t {name_lower}{i} = {var_name} % {name_upper}{i};'
+            if i > 0:
+                line += f' {var_name} /= {name_upper}{i};'
+            out += [line]
+            out += [f'#define {name_lower}m{ndim-i} {name_lower}{i}']
+
+        out += [f'{var_name} = {var_name}_original;']
+        return '\n'.join(out)
+
+    @staticmethod
+    def axes_order_enum(tensor_name, axes_order):
+        """
+        returns axis enumeration with given order
+
+        Example
+         ('I', (1,2,0)) returns 'i1,i2,i0'
+         ('I', 'HW') return 'ih,iw'
+        """
+        if isinstance(axes_order, str):
+            axes_order = axes_order.lower()
+        else:
+            axes_order = tuple(axes_order)
+
+        name_lower = tensor_name.lower()
+
+        return ','.join( [ f'{name_lower}{axes_order[axis]}' for axis in range(len(axes_order)) ])
+
+    @staticmethod
+    def axes_seq_enum(tensor_name, ndim, new_axis=None, zero_axes=None, suffix=None):
+        """
+        returns axis sequental enumeration with given ndim
+
+        Example
+
+         ('I', 4) returns 'i0,i1,i2,i3'
+
+         ('I', 4, new_axis=('name',1) ) returns 'i0,name,i1,i2,i3'
+
+         ('I', 3, zero_axes=(1,) ) returns 'i0,0,i2'
+
+         ('I', 2, suffix='ih,iw' ) returns 'i0,i1,ih,iw'
+        """
+        name_lower = tensor_name.lower()
+        if zero_axes is not None:
+            axes = [ '0' if axis in zero_axes else f'{name_lower}{axis}' for axis in range(ndim) ]
+        else:
+            axes = [ f'{name_lower}{axis}' for axis in range(ndim) ]
+
+        if suffix is None:
+            suffix = []
+        else:
+            suffix = [suffix]
+
+        if new_axis is not None:
+            name, axis = new_axis
+            return','.join(axes[:axis] + [name] + axes[axis:] + suffix)
+        else:
+            return ','.join(axes+ suffix)
+
+    @staticmethod
+    def include_constants_pi():
+        """
+        defines PI constants
+
+         PI_F
+         PI_2_F
+         PI_4_F
+        """
+        return f"""
+#define  PI_F          3.14159274101257f
+#define  PI_2_F        1.57079637050629f
+#define  PI_4_F        0.78539818525314f
+"""
+
+    @staticmethod
+    def include_hash():
+        """
+        returns hash functions:
+
+         uint  hash_uint_uint(uint v)
+         ulong hash_ulong_from_ulong(ulong x)
+         float hash_float_from_uint(uint x)
+         double hash_double_from_ulong(ulong x)
+        """
+
+        return f"""
+
+#define UIF (1.0 / (float)(0xffffffffU))
+
+//from Chris Wellons https://nullprogram.com/blog/2018/07/31/ https://www.shadertoy.com/view/WttXWX
+uint hash_uint_from_uint(uint x)
+{{
+    x ^= x >> 17;
+    x *= 0xed5ad4bbU;
+    x ^= x >> 11;
+    x *= 0xac4c1b51U;
+    x ^= x >> 15;
+    x *= 0x31848babU;
+    x ^= x >> 14;
+    return x;
+}}
+
+ulong hash_ulong_from_ulong(ulong x)
+{{
+    x ^= x >> 32;
+    x *= 0xd6e8feb86659fd93U;
+    x ^= x >> 32;
+    x *= 0xd6e8feb86659fd93U;
+    x ^= x >> 32;
+    return x;
+}}
+
+float hash_float_from_uint(uint x)
+{{
+    return hash_uint_from_uint(x) / (float)(0xffffffffU);
+}}
+
+double hash_double_from_ulong(ulong x)
+{{
+    return (double)hash_ulong_from_ulong(x) / (double)(0xffffffffffffffffU);
+}}
+
+/*****************************
+UNUSED CODE
+
+//---------- PCG hashes from https://www.shadertoy.com/view/XlGcRh
+uint hash_uint_uint(uint v)
+{{
+    uint state = v * 747796405u + 2891336453u;
+    uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
+    return (word >> 22u) ^ word;
+}}
+
+uint2 hash_uint2_uint2 (uint2 v)
+{{
+    v = v * 1664525u + 1013904223u;
+    v.x += v.y * 1664525u;
+    v.y += v.x * 1664525u;
+    v ^= v>>16u;
+    v.x += v.y * 1664525u;
+    v.y += v.x * 1664525u;
+    v ^= v>>16u;
+    return v;
+}}
+
+uint3 hash_uint3_uint3(uint3 v)
+{{
+    v = v * 1664525u + 1013904223u;
+    v.x += v.y*v.z;
+    v.y += v.z*v.x;
+    v.z += v.x*v.y;
+    v ^= v >> 16u;
+    v.x += v.y*v.z;
+    v.y += v.z*v.x;
+    v.z += v.x*v.y;
+    return v;
+}}
+
+float hash_float_uint(uint v)
+{{
+	return (float)( hash_uint_uint(v) ) * UIF;
+}}
+
+float2 hash_float2_uint (uint v)
+{{
+    uint2 q = hash_uint2_uint2( (uint2)(v, 1) );
+    return (float2)(q.x, q.y) * UIF;
+}}
+
+float3 hash_float3_uint (uint v)
+{{
+    uint3 q = hash_uint3_uint3( (uint3)(v, 1, 1) );
+    return (float3)(q.x, q.y, q.z) * UIF;
+}}
+
+//---------- Classic hashes used in shaders
+
+float hash_float_float(float p)
+{{
+
+    float x = sin(p*12.9898)*43758.5453;
+    return x - floor(x);
+}}
+
+float hash_float_float2(float2 p)
+{{
+    float x = sin( dot(p, (float2)(12.9898, 78.233)) )*43758.5453;
+    return x - floor(x);
+}}
+
+****************************/
+
+
+"""
+
+__all__ = ['HKernel']
--- a/xlib/avecl/_internal/HTensor.py
+++ b/xlib/avecl/_internal/HTensor.py
@ -0,0 +1,22 @@
+from typing import List, Union
+
+from .Tensor import *
+
+
+class HTensor:
+    """
+    Helper functions for Tensor
+    """
+
+    @staticmethod
+    def all_same_device( tensor_or_list : Union[Tensor, List[Tensor] ] ) -> bool:
+        """
+        check if all tensors in a list use the same device
+        """
+        if not isinstance(tensor_or_list, (list,tuple)):
+            tensor_or_list = (tensor_or_list,)
+
+        device = tensor_or_list[0].get_device()
+        return all( device == tensor.get_device() for tensor in tensor_or_list )
+
+__all__ = ['HTensor']
--- a/xlib/avecl/_internal/HType.py
+++ b/xlib/avecl/_internal/HType.py
@ -0,0 +1,82 @@
+import operator
+from typing import Iterable, List
+import numpy as np
+
+scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
+                np.float16, np.float32, np.float64, np.bool_]
+
+np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
+                    np.float16, np.float32, np.float64, np.bool_]
+
+_np_dtype_to_cl = {
+    np.bool_   : 'bool',
+    np.uint8   : 'uchar',
+    np.int8    : 'char',
+    np.uint16  : 'ushort',
+    np.int16   : 'short',
+    np.uint32  : 'uint',
+    np.int32   : 'int',
+    np.uint64  : 'ulong',
+    np.int64   : 'long',
+    np.float16 : 'half',
+    np.float32 : 'float',
+    np.float64 : 'double',
+}
+
+_np_dtype_weight = {
+    np.bool_   : 1,
+    np.uint8   : 2,
+    np.int8    : 3,
+    np.uint16  : 4,
+    np.int16   : 5,
+    np.uint32  : 6,
+    np.int32   : 7,
+    np.uint64  : 8,
+    np.int64   : 9,
+    np.float16 : 10,
+    np.float32 : 11,
+    np.float64 : 12,
+}
+
+class HType:
+    """
+    Helper functions for types.
+    """
+
+    def is_scalar_type(value):
+        return value.__class__ in scalar_types
+
+    def get_np_scalar_types() -> List:
+        return np_scalar_types
+
+    def is_obj_of_np_scalar_type(obj) -> bool:
+        return obj.__class__ in np_scalar_types
+
+    def np_dtype_to_cl(dtype : np.dtype) -> str:
+        return _np_dtype_to_cl[ np.dtype(dtype).type ]
+
+    def get_most_weighted_dtype( dtype_list ):
+        dtype_list = [ np.dtype(dtype) for dtype in dtype_list ]
+
+        dtype_list = [ (_np_dtype_weight[dtype.type], dtype) for dtype in dtype_list ]
+        dtype_list = sorted(dtype_list, key=operator.itemgetter(0), reverse=True)
+        return dtype_list[0][1]
+
+    def hashable_slices(slices):
+        """
+        Convert list of slice to hashable arg.
+        """
+        if not isinstance(slices, Iterable):
+            slices = (slices,)
+        normalized_slices = []
+        for x in slices:
+            if isinstance(x, slice):
+                normalized_slices.append( (x.start, x.stop, x.step) )
+            elif x is Ellipsis or x is None:
+                normalized_slices.append(x)
+            else:
+                normalized_slices.append(int(x))
+        return tuple(normalized_slices)
+
+
+__all__ = ['HType']
--- a/xlib/avecl/_internal/NCore.py
+++ b/xlib/avecl/_internal/NCore.py
@ -0,0 +1,24 @@
+from .SCacheton import SCacheton
+from .Tensor import Tensor
+from .backend import *
+
+class NCore:
+    """
+    core functions
+    """
+
+    @staticmethod
+    def cleanup():
+        """
+        try to cleanup all resources consumed by TensorCL.
+
+        can raise Exception
+        """
+        SCacheton.cleanup()
+
+        if Tensor._object_count != 0:
+            raise Exception(f'Unable to cleanup while {Tensor._object_count} Tensor objects exist.')
+
+        cleanup_devices()
+
+__all__ = ['NCore']
--- a/xlib/avecl/_internal/NTest.py
+++ b/xlib/avecl/_internal/NTest.py
@ -0,0 +1,754 @@
+import traceback
+
+import numpy as np
+
+from .HType import HType
+from .NCore import NCore
+from .backend import get_device, get_default_device, set_default_device
+from .Tensor import Tensor
+from . import op
+from .initializer import InitRandomUniform, InitCoords2DArange
+from .info import Conv2DInfo
+
+class NTest():
+
+    def test_all():
+        NCore.cleanup()
+
+        prev_device = get_default_device()
+
+        device = get_device(0)
+        print(f'Using {device.get_description()}')
+
+        set_default_device(device)
+
+        test_funcs = [
+                        InitRandomUniform_test,
+                        InitCoords2DArange_test,
+                        cast_test,
+                        transpose_test,
+                        pad_test,
+                        concat_test,
+                        tile_test,
+                        stack_test,
+                        slice_test,
+                        slice_set_test,
+                        reduce_test,
+                        matmul_test,
+                        any_wise_op_test,
+                        depthwise_conv2d_test,
+                        remap_np_affine_test,
+                        remap_test,
+                        warp_affine_test,
+                        gaussian_blur_test,
+                        binary_erode_circle_test,
+                        binary_dilate_circle_test,
+                        binary_morph_test,
+                    ]
+
+        for test_func in test_funcs:
+            print(f'{test_func.__name__}()')
+            test_func()
+            device.cleanup()
+        device.print_stat()
+
+        NCore.cleanup()
+
+        set_default_device(prev_device)
+
+        print('Done.')
+
+def _all_close(x,y, atol=1, btol=1):
+    return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol )
+
+def cast_test():
+    for in_dtype in HType.get_np_scalar_types():
+        for out_dtype in HType.get_np_scalar_types():
+            shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) )
+
+            print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)}  ... ', end='')
+
+            val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype)
+            cast_n = val_n.astype(out_dtype)
+            val_t = Tensor.from_value(val_n)
+            cast_t = op.cast(val_t, out_dtype)
+
+            if not _all_close(cast_t.np(), cast_n):
+                raise Exception(f'data is not equal')
+
+            print('pass')
+
+def binary_morph_test():
+    for shape_len in range(2,4):
+        for dtype in HType.get_np_scalar_types():
+            shape = np.random.randint( 1, 64, size=(shape_len,) )
+            erode_dilate = np.random.randint( -16, 16 )
+            blur = np.random.rand()*16 - 8
+
+            input_n = np.random.randint( 2, size=shape ).astype(dtype)
+            input_t = Tensor.from_value(input_n)
+
+            print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='')
+
+            op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True)
+
+            print('pass')
+
+def binary_erode_circle_test():
+    for shape_len in range(2,4):
+        for dtype in HType.get_np_scalar_types():
+
+            shape = np.random.randint( 1, 64, size=(shape_len,) )
+            radius = np.random.randint( 1, 16 )
+            iterations = np.random.randint( 1, 4 )
+
+            input_n = np.random.randint( 2, size=shape ).astype(dtype)
+            input_t = Tensor.from_value(input_n)
+
+            print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
+
+            op.binary_erode_circle(input_t, radius=radius, iterations=iterations)
+
+            print('pass')
+
+def binary_dilate_circle_test():
+    for shape_len in range(2,4):
+        for dtype in HType.get_np_scalar_types():
+
+            shape = np.random.randint( 1, 64, size=(shape_len,) )
+            radius = np.random.randint( 1, 16 )
+            iterations = np.random.randint( 1, 4 )
+
+            input_n = np.random.randint( 2, size=shape ).astype(dtype)
+            input_t = Tensor.from_value(input_n)
+
+            print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
+
+            op.binary_dilate_circle(input_t, radius=radius, iterations=iterations)
+
+            print('pass')
+
+
+def gaussian_blur_test():
+    for shape_len in range(2,5):
+        for dtype in [np.float16, np.float32, np.float64]:
+
+            shape = np.random.randint( 1, 64, size=(shape_len,) )
+            sigma = np.random.rand() * 10
+            print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='')
+
+            val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
+            val_t = Tensor.from_value(val_n)
+
+            op.gaussian_blur(val_t, sigma)
+
+            print('pass')
+
+def pad_test():
+    for iteration in range(1):
+      for shape_len in range(5,1,-1):
+        for mode in ['constant']:
+          for dtype in HType.get_np_scalar_types():
+            while True:
+                shape = np.random.randint( 1, 8, size=(shape_len,) )
+
+                paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) )
+
+                print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='')
+
+                val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
+                pad_n = np.pad(val_n, paddings, mode=mode)
+
+                val_t = Tensor.from_value(val_n)
+                pad_t = op.pad(val_t, paddings, mode=mode)
+
+                print(f'{pad_n.shape} == {pad_t.shape} ... ', end='')
+
+                if pad_n.shape != pad_t.shape:
+                    raise Exception(f'shape is not equal')
+
+                if not _all_close(pad_t.np(), pad_n):
+                    raise Exception(f'data is not equal')
+
+                print('pass')
+                break
+
+def slice_set_test():
+    for iteration in [0,1]:
+      for shape_len in range(5,1,-1):
+          for dtype in HType.get_np_scalar_types():
+            while True:
+                shape = np.random.randint( 1, 8, size=(shape_len,) )
+
+                if iteration == 0:
+                    slices = [ slice(None,None,None), ] * shape_len
+                    axis = np.random.randint(shape_len)
+                    shape[axis] = 1
+                    slices[axis] = 0
+                else:
+                    slices = []
+                    for i in range (shape_len):
+                        axis_size = shape[i]
+
+                        b = np.random.randint(axis_size)
+                        e = np.random.randint(axis_size)
+                        if b == e:
+                            slices.append(b)
+                        else:
+                            if b < e:
+                                s = 1
+                                if b == 0:
+                                    b = None
+                                if e == axis_size-1:
+                                    e = None
+                            else:
+                                s = -1
+                                if b == axis_size-1:
+                                    b = None
+                                if e == 0:
+                                    e = None
+                            slices.append ( slice(b,e,s) )
+
+                    if np.random.randint(2) == 0:
+                        axis = np.random.randint(shape_len)
+                        slices[axis] = Ellipsis
+
+                shape = tuple(shape)
+                slices = tuple(slices)
+
+                print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
+
+                val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
+                val_t = Tensor.from_value(val_n)
+
+                sliced_n = val_n[slices]
+
+                v = [0] if sliced_n.ndim > 0 else 0
+
+                val_n[slices] = v
+                val_t[slices] = v
+
+                if not _all_close(val_t.np(), val_n):
+                    raise Exception(f'data is not equal')
+
+                print('pass')
+                break
+
+def depthwise_conv2d_test():
+    def _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=1, DILATION=1, padding='same', dtype=np.float32):
+        N, IC, IH, IW = input_n.shape
+        KI, KH, KW = kernel_n.shape
+
+        ci = Conv2DInfo(IH, IW, KH, KW, STRIDE, DILATION, padding)
+
+        PADL, PADT = ci.PADL, ci.PADT
+
+        OC, OH, OW = IC, ci.OH, ci.OW
+
+        O_IK_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(OH*OW) }
+        K_IO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(KH*KW) }
+        I_KO_idxs = { idx:[ [ [], [] ], [ [], [] ] ] for idx in range(IH*IW) }
+
+        for ow in range(OW):
+            for oh in range(OH):
+                O_idx = oh*OW + ow
+                for kh in range(KH):
+                    for kw in range(KW):
+                        iw = -PADL + kw*DILATION + ow*STRIDE
+                        ih = -PADT + kh*DILATION + oh*STRIDE
+                        if (iw >= 0) & (ih >= 0) & (iw < IW) & (ih < IH):
+
+                            O_IK_idxs[O_idx][0][0].append (ih)
+                            O_IK_idxs[O_idx][0][1].append (iw)
+                            O_IK_idxs[O_idx][1][0].append (kh)
+                            O_IK_idxs[O_idx][1][1].append (kw)
+
+                            K_idx = kh*KW + kw
+                            K_IO_idxs[K_idx][0][0].append (ih)
+                            K_IO_idxs[K_idx][0][1].append (iw)
+                            K_IO_idxs[K_idx][1][0].append (oh)
+                            K_IO_idxs[K_idx][1][1].append (ow)
+
+                            I_idx = ih*IW + iw
+                            I_KO_idxs[I_idx][0][0].append (kh)
+                            I_KO_idxs[I_idx][0][1].append (kw)
+                            I_KO_idxs[I_idx][1][0].append (oh)
+                            I_KO_idxs[I_idx][1][1].append (ow)
+
+        output_shape = (N, OC, OH, OW)
+        output = np.empty( output_shape, dtype)
+
+        for n in range(N):
+            for oc in range(OC):
+                for oh in range(OH):
+                    for ow in range(OW):
+                        O_idx = oh*OW + ow
+                        I_idxs = O_IK_idxs[O_idx][0]
+                        K_idxs = O_IK_idxs[O_idx][1]
+
+                        v = (  input_n[ n,oc][..., I_idxs[0], I_idxs[1]] *
+                            kernel_n [oc][..., K_idxs[0], K_idxs[1]] ).sum()
+
+                        output[n,oc,oh,ow] = v
+        return output
+
+    for padding in ['same','valid',2]:
+        for dilation in [1,2]:
+          for stride in [1,2]:
+            for ks in [1,3]:
+              for n in [1,4]:
+                for ic in [1,4]:
+                    for ih,iw in zip(*[[4,16]]*2):
+                        if padding == 'valid' and iw < ks:
+                            continue
+                        for dtype in [np.int16, np.float16, np.float32]:
+                            input_shape  = (n, ic, ih, iw)
+                            kernel_shape = (ic, ks, ks)
+
+                            print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='')
+
+                            input_n  = np.random.randint( 64, size=input_shape ).astype(dtype)
+                            kernel_n = np.ones(shape=kernel_shape ).astype(dtype)
+
+                            input_t  = Tensor.from_value(input_n)
+                            kernel_t = Tensor.from_value(kernel_n)
+
+                            conved_t = op.depthwise_conv2D(input_t, kernel_t, stride=stride, dilation=dilation, padding=padding)
+                            conved_n = _numpy_depthwise_conv2d(input_n, kernel_n, STRIDE=stride, DILATION=dilation, padding=padding, dtype=dtype)
+
+                            if conved_n.shape != conved_t.shape:
+                                raise Exception(f'shape is not equal')
+
+                            if not all ( np.ndarray.flatten( conved_t.np() == conved_n) ):
+                                raise Exception(f'data is not equal')
+
+                            print('pass')
+
+
+
+def warp_affine_test():
+    for dtype in HType.get_np_scalar_types():
+        if dtype == np.bool_:
+            continue
+        H = np.random.randint(8, 64)
+        W = np.random.randint(8, 64)
+
+        print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+
+        input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
+
+        affine_t = Tensor.from_value ( [[1,0,0],
+                                        [0,1,0]], dtype)
+
+        result_t = op.warp_affine(input_t, affine_t)
+
+        if not _all_close(input_t.np(), result_t.np() ):
+            raise Exception(f'data is not equal')
+
+        print('pass')
+
+
+def remap_np_affine_test():
+    for dtype in HType.get_np_scalar_types():
+        if dtype == np.bool_:
+            continue
+        H = np.random.randint(8, 64)
+        W = np.random.randint(8, 64)
+
+        print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+
+        input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
+
+        affine_n = np.array ( [[1,0,0],
+                               [0,1,0]], dtype)
+
+        result_t = op.remap_np_affine(input_t, affine_n)
+
+        if not _all_close(input_t.np(), result_t.np() ):
+            raise Exception(f'data is not equal')
+
+        print('pass')
+
+
+def remap_test():
+    for dtype in HType.get_np_scalar_types():
+        if dtype == np.bool_:
+            continue
+        H = np.random.randint(8, 64)
+        W = np.random.randint(8, 64)
+
+        print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+
+        input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
+
+        coords_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) )
+
+        result_t = op.remap(input_t, coords_t)
+
+        if not _all_close(input_t.np(), result_t.np() ):
+            raise Exception(f'data is not equal')
+
+        print('pass')
+
+def tile_test():
+    for _ in range(3):
+      for shape_len in range(3, 5):
+        for dtype in HType.get_np_scalar_types():
+            shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
+            tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1)
+
+            print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='')
+
+            val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
+            tiled_n = np.tile(val_n, tiles)
+
+            val_t = Tensor.from_value(val_n)
+            tiled_t = op.tile(val_t, tiles)
+
+            print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='')
+
+            if tiled_n.shape != tiled_t.shape:
+                raise Exception(f'shape is not equal')
+
+            if not _all_close(tiled_t.np(), tiled_n):
+                raise Exception(f'data is not equal')
+
+            print('pass')
+
+def stack_test():
+    for _ in range(3):
+        for shape_len in range(1, 4):
+            for dtype in HType.get_np_scalar_types():
+                shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
+                axis = np.random.randint(shape_len+1)
+                stack_count = np.random.randint(4)+1
+
+                print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='')
+
+                vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ]
+                stack_n = np.stack(vals_n, axis)
+
+                vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ]
+                stack_t = op.stack(vals_t, axis)
+
+                print(f'{stack_n.shape} == {stack_t.shape} ... ', end='')
+
+                if stack_n.shape != stack_t.shape:
+                    raise Exception('shape is not equal')
+
+                if not _all_close(stack_t.np(), stack_n):
+                    raise Exception(f'data is not equal')
+
+                print('pass')
+
+def reduce_test():
+    for op_type in ['sum', 'mean', 'min', 'max']:
+      for dtype in HType.get_np_scalar_types():
+        if dtype != np.bool_:
+            for shape_len in range(2, 5):
+                shape = np.random.randint( 8, size=(shape_len,) )+1
+
+                reduction_axes = np.array([*range(shape_len)])
+                np.random.shuffle(reduction_axes)
+
+                # Cut random amount of reduction_axes
+                reduction_axes = tuple(reduction_axes [:np.random.randint(shape_len+1)])
+                if len(reduction_axes) == 0:
+                    reduction_axes = None
+
+                keepdims = np.random.randint(2) == 0
+
+                print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='')
+
+                if dtype in [np.float16, np.float32, np.float64]:
+                    value_n = np.random.uniform(size=shape).astype(dtype)
+                else:
+                    value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype )
+
+                value_t = Tensor.from_value(value_n)
+
+                if op_type == 'sum':
+                    reducted_n = value_n.sum(reduction_axes, keepdims=keepdims)
+                    reducted_t = value_t.sum(reduction_axes, keepdims=keepdims)
+                elif op_type == 'mean':
+                    reducted_n = value_n.mean(reduction_axes, keepdims=keepdims)
+                    reducted_t = value_t.mean(reduction_axes, keepdims=keepdims)
+                elif op_type == 'max':
+                    reducted_n = value_n.max(reduction_axes, keepdims=keepdims)
+                    reducted_t = value_t.max(reduction_axes, keepdims=keepdims)
+                elif op_type == 'min':
+                    reducted_n = value_n.min(reduction_axes, keepdims=keepdims)
+                    reducted_t = value_t.min(reduction_axes, keepdims=keepdims)
+
+                print(f'{reducted_n.shape} == {reducted_t.shape} ... ')
+
+                if not _all_close(reducted_t.np(), reducted_n):
+                    raise Exception(f'data is not equal')
+
+                print('pass')
+
+
+def InitRandomUniform_test():
+    for dtype in HType.get_np_scalar_types():
+        for shape_len in range(1, 5):
+            shape = np.random.randint( 8, size=(shape_len,) )+1
+
+            print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='')
+
+            Tensor(shape, dtype, initializer=InitRandomUniform()).np()
+
+            print('pass')
+
+def InitCoords2DArange_test():
+    for dtype in HType.get_np_scalar_types():
+        for shape_len in range(2, 5):
+            shape = np.random.randint( 1, 60, size=(shape_len,) ).tolist()
+            shape = shape + ([2] if np.random.randint(2) == 0 else [3])
+            h_start = np.random.randint(80)
+            h_stop = h_start + np.random.randint(80)
+            w_start = np.random.randint(80)
+            w_stop = w_start + np.random.randint(80)
+
+            print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='')
+
+            Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np()
+
+            print('pass')
+
+def concat_test():
+    for shape_len in range(2, 5):
+        for dtype in HType.get_np_scalar_types():
+            shape = (np.random.randint( 8, size=(shape_len,) )+1).tolist()
+            axis = np.random.randint(shape_len)
+            count = np.random.randint(4)+1
+
+            shapes = tuple( tuple( dim if i != axis else np.random.randint(8)+1
+                                    for i,dim in enumerate(shape) )
+                            for shape in ([shape] * count) )
+
+            print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='')
+
+            V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ]
+            O_n = np.concatenate(V_n, axis)
+
+            print(f'{O_n.shape} == ', end='')
+
+            V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ]
+            O_t = op.concat(V_t, axis)
+
+            print(f'{O_t.shape} ... ', end='')
+
+            if O_n.shape != O_t.shape:
+                raise Exception('shape is not equal')
+
+            if not all ( np.ndarray.flatten( O_t.np() == O_n ) ):
+                raise Exception(f'data is not equal')
+
+            print('pass')
+
+def matmul_test():
+    for _ in range(100):
+        for dtype in [np.float32]:
+            BATCH = np.random.randint(8)+1
+            M = np.random.randint(8)+1
+            N = np.random.randint(32768)+1
+            K = np.random.randint(32768)+1
+
+            while K*N > ( 8000000 // BATCH ):
+                K = max(1, K // 2)
+                N = max(1, N // 2)
+
+            if np.random.randint(2) == 0:
+                size = [2,4,8,16][np.random.randint(4)]
+                M = max(1, M // size) * size
+                N = max(1, N // size) * size
+                K = max(1, K // size) * size
+
+            if BATCH == 1:
+                A_shape = (M, K)
+                B_shape = (K, N)
+            else:
+                A_shape = (BATCH, M, K)
+                B_shape = (BATCH, K, N)
+
+            print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='')
+
+            A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype)
+            B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype)
+
+            O_n = np.matmul(A_n, B_n)
+
+            print(f'{O_n.shape} == ', end='')
+
+            A_t = Tensor.from_value(A_n)
+            B_t = Tensor.from_value(B_n)
+            O_t = op.matmul(A_t, B_t)
+            print(f'{O_t.shape} ... ', end='')
+
+            if O_n.shape != O_t.shape:
+                raise Exception('shape is not equal')
+            if not _all_close (O_t.np(), O_n):
+                raise Exception(f'data is not equal')
+
+            print('pass')
+
+def slice_test():
+    for iteration in [0,1]:
+      for shape_len in range(5,1,-1):
+          for dtype in HType.get_np_scalar_types():
+            while True:
+                shape = np.random.randint( 1, 8, size=(shape_len,) )
+
+                if iteration == 0:
+                    slices = [ slice(None,None,None), ] * shape_len
+                    axis = np.random.randint(shape_len)
+                    shape[axis] = 1
+                    slices[axis] = 0
+                else:
+                    slices = []
+                    for i in range (shape_len):
+                        axis_size = shape[i]
+                        b = np.random.randint(axis_size)
+                        e = np.random.randint(axis_size)
+                        if b == e:
+                            slices.append(b)
+                        else:
+                            if b < e:
+                                s = 1
+                                if b == 0:
+                                    b = None
+                                if e == axis_size-1:
+                                    e = None
+                            else:
+                                s = -1
+                                if b == axis_size-1:
+                                    b = None
+                                if e == 0:
+                                    e = None
+                            slices.append ( slice(b,e,s) )
+
+                    if np.random.randint(2) == 0:
+                        axis = np.random.randint(shape_len)
+                        slices[axis] = Ellipsis
+
+                shape = tuple(shape)
+                slices = tuple(slices)
+
+                print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
+
+                val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
+
+                sliced_n = val_n[slices]
+
+                print(f'{sliced_n.shape} ... ', end='')
+
+                sliced_t = Tensor.from_value(val_n)[slices]
+
+                print(f'{sliced_t.shape} ... ', end='')
+
+                if 0 in sliced_n.shape:
+                    # some cases like 0:1:-1 will produce zero shape and invalid array on numpy
+                    # but nn.slice has no such behaviour, thus we have to generate new slice again
+                    print('pass (bad case)')
+                    continue
+
+                if np.prod(sliced_n.shape) != sliced_t.shape.size:
+                    raise Exception(f'shape is not equal')
+
+                if not _all_close(sliced_t.np(), sliced_n):
+                    raise Exception(f'data is not equal')
+
+                print('pass')
+                break
+
+
+def transpose_test():
+    for dtype in HType.get_np_scalar_types():
+        for shape_len in range(2, 5):
+            shape = np.random.randint( 8, size=(shape_len,) )+1
+            axes_order = np.array([*range(shape_len)])
+            np.random.shuffle(axes_order)
+
+            print(f'transpose: {shape} {axes_order} ... ', end='')
+
+            val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
+            transposed_n = np.transpose(val_n, axes_order)
+
+            print(f'{transposed_n.shape} ... ', end='')
+
+            val_t = Tensor.from_value(val_n)
+            transposed_t = op.transpose (val_t, axes_order )
+
+            print(f'{transposed_t.shape} ... ', end='')
+
+            if transposed_n.shape != transposed_t.shape:
+                raise Exception('shape is not equal')
+            if not all ( np.ndarray.flatten( transposed_t.np() == transposed_n ) ):
+                raise Exception(f'data is not equal {shape} {axes_order}')
+
+            print('pass')
+
+
+def any_wise_op_test():
+    for op_type in ['square', '+', '-', '*', '/', 'min', 'max']:
+        for dtype in HType.get_np_scalar_types():
+          if dtype != np.bool_:
+
+            shape_gen = range(1, 5)
+            for shape_len in shape_gen:
+                a_shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
+
+                if np.random.randint(2) == 0:
+                    b_shape = tuple(a_shape[np.random.randint(len(a_shape)):])
+                    b_shape = (1,) if len(b_shape) == 0 else b_shape
+                else:
+                    b_shape = list(a_shape)
+                    b_shape[ np.random.randint(len(b_shape)) ] = 1
+                    b_shape = tuple(b_shape)
+
+                shapes = [a_shape, b_shape]
+                if np.random.randint(2) == 0:
+                    shapes = shapes[::-1]
+                a_shape, b_shape = shapes
+
+                print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='')
+
+                a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
+                b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)
+                a_t = Tensor.from_value(a_n)
+                b_t = Tensor.from_value(b_n)
+
+                if op_type == '+':
+                    r_t = a_t + b_t
+                elif op_type == '-':
+                    r_t = a_t - b_t
+                elif op_type == '*':
+                    r_t = a_t * b_t
+                elif op_type == '/':
+                    r_t = a_t / b_t
+                elif op_type == 'min':
+                    r_t = op.min_(a_t, b_t)
+                elif op_type == 'max':
+                    r_t = op.max_(a_t, b_t)
+                elif op_type == 'square':
+                    r_t = op.square(a_t)
+
+                if op_type in ['+','-','*','/']:
+                    r_n = eval(f'a_n {op_type} b_n')
+                    r_n = r_n.astype(dtype)
+                elif op_type == 'min':
+                    r_n = np.minimum(a_n, b_n)
+                elif op_type == 'max':
+                    r_n = np.maximum(a_n, b_n)
+                elif op_type == 'square':
+                    r_n = np.square(a_n)
+
+                if r_n.shape != r_t.shape:
+                    raise Exception(f'shapes are not equal')
+
+                if not _all_close(r_t.np(), r_n):
+                    raise Exception(f'data is not equal')
+
+                print('pass')
+
+
--- a/xlib/avecl/_internal/SCacheton.py
+++ b/xlib/avecl/_internal/SCacheton.py
@ -0,0 +1,55 @@
+class SCacheton:
+    """
+    Static class for caching classes and vars by hashable arguments
+    """
+    cachetons = {}
+    cachevars = {}
+
+    @staticmethod
+    def get(cls, *args, **kwargs):
+        """
+        Get class cached by args/kwargs
+        If it does not exist, creates new with *args,**kwargs
+        All cached data will be freed with cleanup()
+
+        You must not to store Tensor in SCacheton, use per-device cache vars
+        """
+        cls_multitons = SCacheton.cachetons.get(cls, None)
+        if cls_multitons is None:
+            cls_multitons = SCacheton.cachetons[cls] = {}
+
+        key = (args, tuple(kwargs.items()) )
+
+        data = cls_multitons.get(key, None)
+        if data is None:
+            data = cls_multitons[key] = cls(*args, **kwargs)
+
+        return data
+
+    @staticmethod
+    def set_var(var_name, value):
+        """
+        Set data cached by var_name
+        All cached data will be freed with cleanup()
+
+        You must not to store Tensor in SCacheton, use per-device cache vars
+        """
+        SCacheton.cachevars[var_name] = value
+
+    @staticmethod
+    def get_var(var_name):
+        """
+        Get data cached by var_name
+        All cached data will be freed with cleanup()
+
+        You must not to store Tensor in SCacheton, use per-device cache vars
+        """
+        return SCacheton.cachevars.get(var_name, None)
+
+    @staticmethod
+    def cleanup():
+        """
+        Free all cached objects
+        """
+        SCacheton.cachetons = {}
+        SCacheton.cachevars = {}
--- a/xlib/avecl/_internal/Tensor.py
+++ b/xlib/avecl/_internal/Tensor.py
@ -0,0 +1,124 @@
+import weakref
+from typing import Iterable, Union
+
+import numpy as np
+
+from .AShape import *
+from .backend import *
+from .HType import *
+
+
+class Tensor:
+    ### CONSTRUCTORS
+
+    def __init__(self, shape : Union[AShape, Iterable],
+                       dtype : np.dtype,
+                       device : Union[None, int, Device, DeviceInfo] = None,
+                       initializer : 'Initializer' = None,
+                       ):
+        Tensor._object_count += 1
+
+        self._shape = shape = AShape(shape)
+        self._dtype = dtype = np.dtype(dtype)
+        self._device = device = get_device(device)
+        if device is None:
+            raise Exception('No device.')
+
+        self._seq_id = Tensor._seq_id = Tensor._seq_id + 1
+        self._buffer = Buffer(device, size=shape.size*dtype.itemsize,
+                              on_initialize= lambda initializer=initializer, wself=weakref.ref(self): initializer.initialize_tensor( wself() ) if initializer is not None else None )
+
+    @staticmethod
+    def from_value(value, dtype=None, device:Union[None,int,Device,DeviceInfo]=None) -> 'Tensor':
+        if isinstance(value, Tensor):
+            device = value.get_device()
+        elif not isinstance(value, np.ndarray):
+            if HType.is_scalar_type(value):
+                if dtype is None:
+                    raise ValueError('dtype must be specified for single value')
+                value = np.array([value], dtype=dtype)
+            elif isinstance(value, Iterable):
+                if dtype is None:
+                    raise ValueError('dtype must be specified for Iterable of values')
+                value = np.array(value, dtype=dtype)
+            else:
+                raise ValueError(f'Unsupported value type {value.__class__}')
+
+        return Tensor(shape=value.shape, dtype=value.dtype, device=device).set(value)
+
+    ### PROPERTIES
+
+    @property
+    def shape(self): return self._shape
+    @property
+    def ndim(self): return len(self._shape)
+    @property
+    def dtype(self) -> np.dtype: return self._dtype
+    @property
+    def name(self) -> str: return f'#{self.get_seq_id()}'
+
+    ### GETTERS/SETTERS
+
+    def get_buffer(self) -> Buffer: return self._buffer
+    def get_device(self) -> Device: return self._device
+    def get_seq_id(self) -> int: return self._seq_id
+
+    def set(self, value : Union['Tensor',np.ndarray,int,float]) -> 'Tensor':
+        if isinstance(value, Tensor):
+            if self.shape.size != value.shape.size:
+                raise ValueError('Unable to set the data from other tensor: shape.size is not the same.')
+            self.get_buffer().set(value.get_buffer())
+        else:
+            if isinstance(value, np.ndarray):
+                value = value.astype(self.dtype)
+            else:
+                if HType.is_scalar_type(value):
+                    value = np.array([value], dtype=self.dtype)
+                elif isinstance(value, Iterable):
+                    value = np.array(value, dtype=self.dtype)
+                else:
+                    raise ValueError(f'Unsupported value type {value.__class__}')
+
+            self.get_buffer().set(value)
+        return self
+
+    def np(self):
+        """Returns numpy value of a Tensor"""
+        return self.get_buffer().np(self.shape, self.dtype)
+
+
+    ### OPERATORS
+
+    def __add__(self, value) -> 'Tensor': ...
+    def __radd__(self, value) -> 'Tensor': ...
+    def __sub__(self, value) -> 'Tensor': ...
+    def __rsub__(self, value) -> 'Tensor': ...
+    def __mul__(self, value) -> 'Tensor': ...
+    def __rmul__(self, value) -> 'Tensor': ...
+    def __truediv__(self, value) -> 'Tensor': ...
+    def __rtruediv__(self, value) -> 'Tensor': ...
+    def __neg__(self) -> 'Tensor': ...
+    def __getitem__(self, slices) -> 'Tensor': ...
+    def __setitem__(self, slices, value): ...
+
+    def as_shape(self, shape) -> 'Tensor': ...
+    def copy(self) -> 'Tensor': ...
+    def max(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def mean(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def min(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def reshape(self, new_shape) -> 'Tensor': ...
+    def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
+
+    @property
+    def T(self): return self.transpose( tuple(range(self.shape.ndim-1,-1,-1)) )
+
+    ### INTERNAL METHODS
+
+    def __del__(self):
+        Tensor._object_count -= 1
+
+    _object_count = 0
+    _seq_id = 0
+
+__all__ = ['Tensor']
--- a/xlib/avecl/_internal/TensorImpl.py
+++ b/xlib/avecl/_internal/TensorImpl.py
@ -0,0 +1,92 @@
+from .op import *
+from .AShape import *
+from .backend import *
+from .Tensor import Tensor
+
+def Tensor__str__(self : Tensor): return f"T {self.name} {self.shape} {self.dtype.name}"
+Tensor.__str__ = Tensor__str__
+
+def Tensor__repr__(self : Tensor):
+    s  = self.__str__() + '\n'
+    s += str(self.np()) + '\n'
+    s += self.__str__()
+    return s
+Tensor.__repr__ = Tensor__repr__
+
+def Tensor__add__(self : Tensor, value) -> Tensor:
+    return add(self, value)
+Tensor.__add__ = Tensor__add__
+
+def Tensor__radd__(self : Tensor, value) -> Tensor:
+    return add(value, self)
+Tensor.__radd__ = Tensor__radd__
+
+def Tensor__sub__(self : Tensor, value) -> Tensor:
+    return sub(self, value)
+Tensor.__sub__ = Tensor__sub__
+
+def Tensor__rsub__(self : Tensor, value) -> Tensor:
+    return sub(value, self)
+Tensor.__rsub__ = Tensor__rsub__
+
+def Tensor__mul__(self : Tensor, value) -> Tensor:
+    if self == value:
+        return square(self)
+    return mul(self, value)
+Tensor.__mul__ = Tensor__mul__
+
+def Tensor__rmul__(self : Tensor, value) -> Tensor:
+    if self == value:
+        return square(self)
+    return mul(value, self)
+Tensor.__rmul__ = Tensor__rmul__
+
+def Tensor__truediv__(self : Tensor, value) -> Tensor:
+    return div(self, value)
+Tensor.__truediv__ = Tensor__truediv__
+
+def Tensor__rtruediv__(self : Tensor, value) -> Tensor:
+    return div(value, self)
+Tensor.__rtruediv__ = Tensor__rtruediv__
+
+def Tensor___neg__(self : Tensor):
+    raise NotImplementedError()
+Tensor.___neg__ = Tensor___neg__
+
+Tensor.__getitem__ = slice_
+Tensor.__setitem__ = slice_set
+
+def Tensor_as_shape(self : Tensor, shape) -> Tensor:
+    return TensorRef(self, shape)
+Tensor.as_shape = Tensor_as_shape
+
+def Tensor_copy(self : Tensor) -> Tensor:
+    return Tensor.from_value(self)
+Tensor.copy = Tensor_copy
+
+Tensor.max = reduce_max
+Tensor.mean = reduce_mean
+Tensor.min = reduce_min
+Tensor.reshape = reshape
+Tensor.sum = reduce_sum
+Tensor.transpose = transpose
+
+class TensorRef(Tensor):
+    """
+    TensorRef used to interpret existing Tensor with different shape.
+    use Tensor.as_ref() method
+    """
+
+    def __init__(self, t : Tensor, shape):
+        shape = AShape(shape)
+        if t.shape.size != shape.size:
+            raise ValueError(f'Cannot interpet shape {t.shape} as ref shape {shape}')
+        super().__init__(shape, t.dtype, device=t.get_device())
+        self._t = t
+
+    # Forward methods to original tensor
+    def get_seq_id(self) -> int:              return self._t.get_seq_id()
+    def get_buffer(self) -> Buffer: return self._t.get_buffer()
+    def get_device(self) -> Device: return self._t.get_device()
+
+__all__ = []
--- a/xlib/avecl/_internal/backend/Buffer.py
+++ b/xlib/avecl/_internal/backend/Buffer.py
@ -0,0 +1,109 @@
+from typing import Iterable, Union
+
+import numpy as np
+
+from . import OpenCL as CL
+
+class Buffer:
+    __slots__ = ['_device','_cl_mem','_size','_on_initialize']
+
+    def __init__(self, device : 'Device', size : int, on_initialize = None):
+        """
+        represents physical buffer associated with physical device
+
+         device     Device
+
+         size       int
+        """
+
+        Buffer._object_count += 1
+        self._device = device
+        self._size = size
+        self._cl_mem = None
+        self._on_initialize = on_initialize
+
+    def __del__(self):
+        #print('Buffer.__del__')
+        Buffer._object_count -= 1
+        self.free_cl_mem()
+
+    def get_device(self) -> 'Device': return self._device
+    def get_size(self) -> int: return self._size
+
+    def has_cl_mem(self) -> bool: return self._cl_mem is not None
+    def get_cl_mem(self) -> CL.cl_mem:
+        if self._cl_mem is None:
+            self._cl_mem = self._device._cl_mem_pool_alloc(self._size)
+            if self._on_initialize is not None:
+                self._on_initialize()
+
+        return self._cl_mem
+
+    def free_cl_mem(self):
+        if self._cl_mem is not None:
+            self._device._cl_mem_pool_free(self._cl_mem)
+            self._cl_mem = None
+
+    def set(self, value : Union['Buffer', np.ndarray]):
+        """
+        Parameters
+
+            value   Buffer    copy data from other Buffer.
+
+                    np.ndarray  copies values from ndarray
+                                to Buffer's memory
+
+        """
+        if isinstance(value, Buffer):
+            if self != value:
+                if self._size != value._size:
+                    raise Exception(f'Unable to copy from Buffer with {value._size} size to buffer with {self._size} size.')
+
+                if self._device == value._device:
+                    CL.clEnqueueCopyBuffer(self._device._get_ctx_q(), value.get_cl_mem(), self.get_cl_mem(), 0, 0, self._size, 0, None, None)
+                else:
+                    # Transfer between devices will cause low performance
+                    raise NotImplementedError()
+        else:
+            if not isinstance(value, np.ndarray):
+                raise ValueError (f'Invalid type {value.__class__}. Must be np.ndarray.')
+
+            if value.nbytes != self._size:
+                raise ValueError(f'Value size {value.nbytes} does not match Buffer size {self._size}.')
+
+            if not value.flags.contiguous:
+                value = value.reshape(-1)
+            if not value.flags.contiguous:
+                raise ValueError ("Unable to write from non-contiguous np array.")
+
+            ev = CL.cl_event()
+
+            clr = CL.clEnqueueWriteBuffer(self._device._get_ctx_q(), self.get_cl_mem(), False, 0, value.nbytes, value.ctypes.data_as(CL.c_void_p), 0, None, ev)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clEnqueueWriteBuffer error: {clr}')
+
+            CL.clWaitForEvents(1, ( CL.cl_event * 1 )(ev) )
+            CL.clReleaseEvent(ev)
+
+    def np(self, shape : Iterable, dtype : np.dtype):
+        """
+        Returns data of buffer as np.ndarray with specified shape and dtype
+        """
+        out_np_value = np.empty (shape, dtype)
+
+        if out_np_value.nbytes != self._size:
+            raise ValueError(f'Unable to represent Buffer with size {self._size} as shape {shape} with dtype {dtype}')
+
+        clr = CL.clEnqueueReadBuffer(self._device._get_ctx_q(), self.get_cl_mem(), True, 0, self._size, out_np_value.ctypes.data, 0, None, None)
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clEnqueueReadBuffer error: {clr}')
+
+        return out_np_value
+
+    def __str__(self):
+        return f'Buffer [{self._size} bytes][{f"{self._cl_mem.value}" if self._cl_mem is not None else "unallocated"}] on {str(self._device)}'
+
+    def __repr__(self):
+        return self.__str__()
+
+    _object_count = 0
--- a/xlib/avecl/_internal/backend/Device.py
+++ b/xlib/avecl/_internal/backend/Device.py
@ -0,0 +1,522 @@
+from typing import List, Union
+
+import numpy as np
+
+from . import OpenCL as CL
+from .Buffer import Buffer
+from .DeviceInfo import DeviceInfo
+from .Kernel import Kernel
+
+_np_dtype_to_cl = { np.uint8:   CL.cl_uchar,
+                    np.int8:    CL.cl_char,
+                    np.uint16:  CL.cl_ushort,
+                    np.int16:   CL.cl_short,
+                    np.uint32:  CL.cl_uint,
+                    np.int32:   CL.cl_int,
+                    np.uint64:  CL.cl_ulong,
+                    np.int64:   CL.cl_long,
+                    np.float16: CL.cl_half,
+                    np.float32: CL.cl_float,
+                    np.float64: CL.cl_double }
+
+_opencl_device_ids = None
+_default_device = None
+_devices = {}
+
+class Device:
+    """
+    Represents physical TensorCL device
+    """
+
+    def __init__(self, device_info : DeviceInfo, **kwargs):
+        if kwargs.get('_check', None) is None:
+            raise Exception('You should not to create Device from constructor. Use get_device()')
+
+        self._cached_data = {}      # cached data (per device) by key
+        self._pooled_buffers = {}   # Pool of cached device buffers.
+        self._compiled_kernels = {} # compiled kernels by key
+        self._ctx_q = None          # CL command queue
+        self._ctx = None            # CL context
+
+        self._total_memory_allocated = 0
+        self._total_buffers_allocated = 0
+        self._total_memory_pooled = 0
+        self._total_buffers_pooled = 0
+
+        self._device_info = device_info
+        self._device_id = _get_opencl_device_ids()[device_info.get_index()]
+
+    def __del__(self):
+        self.cleanup()
+
+    def __eq__(self, other):
+        if self is not None and other is not None and isinstance(self, Device) and isinstance(other, Device):
+            return self._device_id.value == other._device_id.value
+        return False
+
+    def __hash__(self):
+        return self._device_id.value
+
+    def _get_ctx(self) -> CL.cl_context:
+        # Create OpenCL context on demand
+        if self._ctx is None:
+            clr = CL.CLRESULT()
+            ctx = CL.clCreateContext( None, 1, (CL.cl_device_id * 1)( self._device_id ), None, None, clr)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception('Unable to create OpenCL context.')
+            self._ctx = ctx
+        return self._ctx
+
+    def _get_ctx_q(self) -> CL.cl_command_queue:
+        # Create CommandQueue on demand
+        if self._ctx_q is None:
+            clr = CL.CLRESULT()
+            ctx_q = CL.clCreateCommandQueue(self._get_ctx(), self._device_id, CL.cl_command_queue_properties(0), clr)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception('Unable to create OpenCL CommandQueue.')
+            self._ctx_q = ctx_q
+        return self._ctx_q
+
+    def get_description(self) -> str:
+        return f"{self._device_info.get_name()} [{(self._device_info.get_total_memory() / 1024**3) :.3}Gb]"
+
+    def __str__(self):
+        return self.get_description()
+
+    def __repr__(self):
+        return f'{self.__class__.__name__} object: ' + self.__str__()
+
+    def set_cached_data(self, key, value):
+        """
+
+        All cached data will be freed with cleanup()
+        """
+        self._cached_data[key] = value
+
+    def get_cached_data(self, key):
+        return self._cached_data.get(key, None)
+
+    def get_total_allocated_memory(self):
+        return self._total_memory_allocated
+
+    def get_max_malloc_size(self) -> int:
+        size = CL.cl_ulong()
+        clr = CL.clGetDeviceInfo(self._device_id, CL.CL_DEVICE_MAX_MEM_ALLOC_SIZE, CL.sizeof(size), CL.byref(size), None)
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clGetDeviceInfo error: {clr}')
+        return size.value
+
+    def _compile_kernel(self, key, kernel_text) -> CL.cl_kernel:
+        """
+        compile or get cached kernel
+        """
+
+        compiled_krn, prog = self._compiled_kernels.get(key, (None, None) )
+
+        if compiled_krn is None:
+            clr = CL.CLRESULT()
+            prog = CL.clCreateProgramWithSource(self._get_ctx(), 1, CL.c_char_p(kernel_text.encode()), None, clr )
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clCreateProgramWithSource error {clr}, with kernel_text:\n{kernel_text}')
+
+            clr = CL.clBuildProgram(prog, 1, (CL.cl_device_id*1)(self._device_id), CL.c_char_p('-cl-std=CL1.2 -cl-single-precision-constant'.encode()), None, None  )
+            if clr != CL.CLERROR.SUCCESS:
+                build_log_size = CL.c_size_t()
+                clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, 0, None, CL.byref(build_log_size) )
+                if clr != CL.CLERROR.SUCCESS:
+                    raise Exception(f'clGetProgramBuildInfo,error: {clr}')
+
+                build_log = CL.create_string_buffer(build_log_size.value)
+                clr = CL.clGetProgramBuildInfo(prog, self._device_id, CL.CL_PROGRAM_BUILD_LOG, build_log_size.value, build_log, None )
+                if clr != CL.CLERROR.SUCCESS:
+                    raise Exception(f'clGetProgramBuildInfo error: {clr}')
+
+                build_log = str(build_log.value, 'utf-8')
+                raise Exception(f'clBuildProgram error:\n\n{build_log}')
+
+            num_kernels = CL.cl_uint()
+            clr = CL.clCreateKernelsInProgram(prog, 0, None, CL.byref(num_kernels))
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clCreateKernelsInProgram error: {clr}')
+
+            if num_kernels.value != 1:
+                raise Exception(f'Kernel must contain only one __kernel:\n\n{kernel_text}')
+
+            kernels = (CL.cl_kernel * num_kernels.value)()
+            clr = CL.clCreateKernelsInProgram(prog, num_kernels.value, kernels, None)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clCreateKernelsInProgram error: {clr}')
+
+            compiled_krn = kernels[0]
+            self._compiled_kernels[key] = (compiled_krn, prog)
+
+        return compiled_krn
+
+    def _cl_mem_alloc(self, size) -> CL.cl_mem:
+        clr = CL.CLRESULT()
+        mem = CL.clCreateBuffer(self._get_ctx(), CL.CL_MEM_READ_WRITE, size, None, clr)
+        if clr == CL.CLERROR.SUCCESS:
+            # Fill one byte to check memory availability
+            ev = CL.cl_event()
+            clr = CL.clEnqueueFillBuffer (self._get_ctx_q(), mem, (CL.c_char * 1)(), 1, 0, 1, 0, None, ev )
+            if clr == CL.CLERROR.SUCCESS:
+                CL.clReleaseEvent(ev)
+                self._total_memory_allocated += size
+                self._total_buffers_allocated += 1
+                return mem
+        return None
+
+    def _cl_mem_free(self, mem : CL.cl_mem):
+        size = CL.c_size_t()
+        clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clGetMemObjectInfo error: {clr}')
+        size = size.value
+        self._total_memory_allocated -= size
+        self._total_buffers_allocated -= 1
+        clr = CL.clReleaseMemObject(mem)
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clReleaseMemObject error: {clr}')
+
+    def _cl_mem_pool_alloc(self, size):
+        """
+        allocate or get cl_mem from pool
+        """
+        pool = self._pooled_buffers
+
+        # First try to get pooled buffer
+        ar = pool.get(size, None)
+        if ar is not None and len(ar) != 0:
+            mem = ar.pop(-1)
+            self._total_memory_pooled -= size
+            self._total_buffers_pooled -= 1
+        else:
+            # No pooled buffer, try to allocate new
+            while True:
+                mem = self._cl_mem_alloc(size)
+                if mem is None:
+                    # MemoryError. Finding largest pooled buffer to release
+                    buf_to_release = None
+                    for size_key in sorted(list(pool.keys()), reverse=True):
+                        ar = pool[size_key]
+                        if len(ar) != 0:
+                            buf_to_release = ar.pop(-1)
+                            break
+
+                    if buf_to_release is not None:
+                        # Release pooled buffer and try to allocate again
+                        self._cl_mem_free(buf_to_release)
+                        continue
+
+                    raise Exception(f'Unable to allocate {size // 1024**2}Mb on {str(self)}')
+
+
+                break
+
+        return mem
+
+    def _cl_mem_pool_free(self, mem : CL.cl_mem):
+        """
+        Put cl_mem to pool for reuse in future.
+        """
+        size = CL.c_size_t()
+        clr = CL.clGetMemObjectInfo(mem, CL.CL_MEM_SIZE, CL.sizeof(size), CL.byref(size), None )
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clGetMemObjectInfo error: {clr}')
+        size = size.value
+
+        pool = self._pooled_buffers
+        ar = pool.get(size, None)
+        if ar is None:
+            ar = pool[size] = []
+        ar.append(mem)
+
+        self._total_memory_pooled += size
+        self._total_buffers_pooled += 1
+
+    def print_stat(self):
+        s = f'''
+Total memory allocated:  {self._total_memory_allocated}
+Total buffers allocated: {self._total_buffers_allocated}
+Total memory pooled:     {self._total_memory_pooled}
+Total buffers pooled:    {self._total_buffers_pooled}
+N of compiled kernels:   {len(self._compiled_kernels)}
+N of cacheddata:         {len(self._cached_data)}
+'''
+        print(s)
+
+    def run_kernel(self, kernel : Kernel, *args, global_shape=None, local_shape=None, global_shape_offsets=None, wait=False):
+        """
+        Run kernel on Device
+
+        Arguments
+
+            *args           arguments will be passed to OpenCL kernel
+                            allowed types:
+
+                            Buffer
+                            np single value
+
+            global_shape(None)  tuple of ints, up to 3 dims
+                                amount of parallel kernel executions.
+                                in OpenCL kernel,
+                                id can be obtained via get_global_id(dim)
+
+            local_shape(None)   tuple of ints, up to 3 dims
+                                specifies local groups of every dim of global_shape.
+                                in OpenCL kernel,
+                                id can be obtained via get_local_id(dim)
+
+            global_shape_offsets(None)  tuple of ints
+                                        offsets for global_shape
+
+            wait(False)     wait execution to complete
+        """
+        ckernel = self._compile_kernel(kernel, kernel.get_kernel_text())
+
+        if global_shape is None:
+            global_shape = kernel.get_global_shape()
+        if global_shape is None:
+            raise ValueError('global_shape must be defined.')
+
+        work_dim = len(global_shape)
+        global_shape_ar = (CL.c_size_t*work_dim)()
+        for i,v in enumerate(global_shape):
+            global_shape_ar[i] = v
+
+        local_shape_ar = None
+        if local_shape is None:
+            local_shape = kernel.get_local_shape()
+        if local_shape is not None:
+            if len(local_shape) != work_dim:
+                raise ValueError('len of local_shape must match global_shape')
+
+            local_shape_ar = (CL.c_size_t*work_dim)()
+            for i,v in enumerate(local_shape):
+                local_shape_ar[i] = v
+
+
+        global_shape_offsets_ar = None
+        if global_shape_offsets is not None:
+            if len(global_shape_offsets) != work_dim:
+                raise ValueError('len of global_shape_offsets must match global_shape')
+
+            global_shape_offsets_ar = (CL.c_size_t*work_dim)()
+            for i,v in enumerate(local_shape):
+                global_shape_offsets_ar[i] = v
+
+        for i, arg in enumerate(args):
+
+            if isinstance(arg, Buffer):
+                arg = arg.get_cl_mem()
+            else:
+                cl_type = _np_dtype_to_cl.get(arg.__class__, None)
+                if cl_type is None:
+                    raise ValueError(f'Cannot convert type {arg.__class__} to OpenCL type.')
+                arg = cl_type(arg)
+
+            clr = CL.clSetKernelArg(ckernel, i, CL.sizeof(arg), CL.byref(arg))
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clSetKernelArg error: {clr}')
+
+        ev = CL.cl_event() if wait else None
+
+        clr = CL.clEnqueueNDRangeKernel(self._get_ctx_q(), ckernel, work_dim, global_shape_offsets_ar, global_shape_ar, local_shape_ar, 0, None, ev)
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clEnqueueNDRangeKernel error: {clr}')
+
+        if wait:
+            CL.clWaitForEvents(1, (CL.cl_event*1)(ev) )
+            CL.clReleaseEvent(ev)
+
+    def wait(self):
+        """
+        Wait to finish all queued operations on this Device
+        """
+        clr = CL.clFinish(self._get_ctx_q())
+        if clr != CL.CLERROR.SUCCESS:
+            raise Exception(f'clFinish error: {clr}')
+
+    def cleanup(self):
+        """
+        Frees all resources from this Device.
+        """
+        self._cached_data = {}
+
+        pool = self._pooled_buffers
+        for size_key in pool.keys():
+            for mem in pool[size_key]:
+                self._cl_mem_free(mem)
+        self._pooled_buffers = {}
+        self._total_memory_pooled = 0
+        self._total_buffers_pooled = 0
+
+        if self._total_memory_allocated != 0:
+            raise Exception('Unable to cleanup CLDevice, while not all Buffers are deallocated.')
+
+        for kernel, prog in self._compiled_kernels.values():
+            clr = CL.clReleaseKernel(kernel)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clReleaseKernel error: {clr}')
+
+            clr = CL.clReleaseProgram(prog)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clReleaseProgram error: {clr}')
+        self._compiled_kernels = {}
+
+        if self._ctx_q is not None:
+            clr = CL.clReleaseCommandQueue(self._ctx_q)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clReleaseCommandQueue error: {clr}')
+            self._ctx_q = None
+
+        if self._ctx is not None:
+            clr = CL.clReleaseContext(self._ctx)
+            if clr != CL.CLERROR.SUCCESS:
+                raise Exception(f'clReleaseContext error: {clr}')
+            self._ctx = None
+
+def _get_opencl_device_ids() -> List[CL.cl_device_id]:
+    global _opencl_device_ids
+    if _opencl_device_ids is None:
+        _opencl_device_ids = []
+        device_types = CL.CL_DEVICE_TYPE_CPU | CL.CL_DEVICE_TYPE_ACCELERATOR | CL.CL_DEVICE_TYPE_GPU
+
+        while True:
+            num_platforms = CL.cl_uint()
+            if CL.clGetPlatformIDs(0, None, num_platforms) != CL.CLERROR.SUCCESS or \
+                num_platforms.value == 0:
+                break
+
+            platforms = (CL.cl_platform_id * num_platforms.value) ()
+            if CL.clGetPlatformIDs(num_platforms.value, platforms, None) != CL.CLERROR.SUCCESS:
+                break
+
+            for i_platform in range(num_platforms.value):
+                platform = platforms[i_platform]
+                num_devices = CL.cl_uint(0)
+                if CL.clGetDeviceIDs(platform, device_types, 0, None, num_devices) != CL.CLERROR.SUCCESS or \
+                    num_devices.value == 0:
+                    continue
+
+                device_ids = (CL.cl_device_id * num_devices.value)()
+                if CL.clGetDeviceIDs(platform, device_types, num_devices.value, device_ids, None) != CL.CLERROR.SUCCESS:
+                    continue
+
+                for i in range(num_devices.value):
+                    device_id = device_ids[i]
+                    # Check OpenCL version.
+                    if device_id is not None:
+                        device_version_size = CL.c_size_t()
+                        if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, 0, None, device_version_size) == CL.CLERROR.SUCCESS:
+                            device_version = CL.create_string_buffer(device_version_size.value)
+                            if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VERSION, device_version_size.value, device_version, None) == CL.CLERROR.SUCCESS:
+                                device_version = str(device_version.value, 'ascii')
+
+                                major, minor = device_version.split(' ')[1].split('.')
+                                opencl_version = int(major)*10+int(minor)
+                                if opencl_version >= 12:
+                                    _opencl_device_ids.append(device_id)
+            break
+    return _opencl_device_ids
+
+def get_available_devices_info() -> List[DeviceInfo]:
+    """
+    returns a list of available picklable DeviceInfo's
+    """
+    devices = []
+    for device_index, device_id in enumerate(_get_opencl_device_ids()):
+        device_name = 'undefined'
+        device_total_memory = 0
+
+        name_size = CL.c_size_t()
+        if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, 0, None, name_size) == CL.CLERROR.SUCCESS:
+            name_value = CL.create_string_buffer(name_size.value)
+            if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_NAME, name_size.value, name_value, None) == CL.CLERROR.SUCCESS:
+                device_name = str(name_value.value, 'ascii')
+
+        global_mem_size = CL.cl_ulong()
+        if CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_GLOBAL_MEM_SIZE, CL.sizeof(global_mem_size), CL.byref(global_mem_size), None) == CL.CLERROR.SUCCESS:
+            device_total_memory = global_mem_size.value
+
+        vendor_id = CL.cl_uint()
+        CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_VENDOR_ID, CL.sizeof(vendor_id), CL.byref(vendor_id), None)
+        vendor_id = vendor_id.value
+
+        max_compute_units = CL.cl_uint()
+        CL.clGetDeviceInfo(device_id, CL.CL_DEVICE_MAX_COMPUTE_UNITS, CL.sizeof(max_compute_units), CL.byref(max_compute_units), None)
+        max_compute_units = max_compute_units.value
+
+        performance_level = max_compute_units
+
+        if vendor_id == 0x8086: # Intel device
+            performance_level -= 1000
+
+        devices.append( DeviceInfo(index=device_index, name=device_name, total_memory=device_total_memory, performance_level=performance_level ) )
+
+    return devices
+
+def get_default_device() -> Union[Device, None]:
+    global _default_device
+    if _default_device is None:
+        _default_device = get_device(0)
+    return _default_device
+
+def set_default_device(device : Device):
+    if not isinstance(device, Device):
+        raise ValueError('device must be an instance of Device')
+
+    global _default_device
+    _default_device = device
+
+def get_device(arg : Union[None, int, Device, DeviceInfo]) -> Union[Device, None]:
+    """
+    get physical TensorCL device.
+
+        arg     None        - get best device
+                int         - by index
+                DeviceInfo  - by device info
+                Device      - returns the same
+    """
+    global _devices
+
+    if arg is None:
+        return get_best_device()
+    elif isinstance(arg, int):
+        devices_info = get_available_devices_info()
+        if arg < len(devices_info):
+            arg = devices_info[arg]
+        else:
+            return None
+    elif isinstance(arg, Device):
+        return arg
+    elif not isinstance(arg, DeviceInfo):
+        raise ValueError(f'Unknown type of arg {arg.__class__}')
+
+    device = _devices.get(arg, None)
+    if device is None:
+        device = _devices[arg] = Device(arg, _check=1)
+
+    return device
+
+def get_best_device() -> Union[Device, None]:
+    """
+    returns best device from available.
+    """
+    perf_level = -999999
+    result = None
+    for device_info in get_available_devices_info():
+        dev_perf_level = device_info.get_performance_level()
+        if perf_level < dev_perf_level:
+            perf_level = dev_perf_level
+            result = device_info
+    if result is not None:
+        result = get_device(result)
+    return result
+
+def cleanup_devices():
+    global _devices
+
+    for device in list(_devices.values()):
+        device.cleanup()
+    _devices = {}
--- a/xlib/avecl/_internal/backend/DeviceInfo.py
+++ b/xlib/avecl/_internal/backend/DeviceInfo.py
@ -0,0 +1,44 @@
+class DeviceInfo:
+    """
+    Represents picklable OpenCL device info
+    """
+
+    def __init__(self, index : int = None, name : str = None, total_memory : int = None, performance_level : int = None):
+        self._index = index
+        self._name = name
+        self._total_memory = total_memory
+        self._performance_level = performance_level
+        
+    def __getstate__(self): 
+        return self.__dict__.copy()
+
+    def __setstate__(self, d):
+        self.__init__()
+        self.__dict__.update(d)
+        
+    def get_index(self) -> int:
+        return self._index
+
+    def get_name(self) -> str:
+        return self._name
+
+    def get_total_memory(self) -> int:
+        return self._total_memory
+        
+    def get_performance_level(self) -> int:
+        return self._performance_level
+        
+    def __eq__(self, other):
+        if self is not None and other is not None and isinstance(self, DeviceInfo) and isinstance(other, DeviceInfo):
+            return self._index == other._index
+        return False
+
+    def __hash__(self):
+        return self._index
+
+    def __str__(self):
+        return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]"
+
+    def __repr__(self):
+        return f'{self.__class__.__name__} object: ' + self.__str__()
+
--- a/xlib/avecl/_internal/backend/Kernel.py
+++ b/xlib/avecl/_internal/backend/Kernel.py
@ -0,0 +1,26 @@
+class Kernel:
+    """
+    TensorCL kernel.
+
+    It does not allocate any resources, thus can be used as static variable within class.
+
+    arguments
+
+        kernel_text    OpenCL text of kernel. Must contain only one __kernel
+
+        global_shape    default global_shape for .run()
+
+        local_shape     default local_shape for .run()
+    """
+    def __init__(self, kernel_text, global_shape=None, local_shape=None):
+        self._kernel_text = kernel_text
+        self._global_shape = global_shape
+        self._local_shape = local_shape
+
+    def get_kernel_text(self) -> str: return self._kernel_text
+    def get_global_shape(self): return self._global_shape
+    def get_local_shape(self): return self._local_shape
+
+    def __str__(self):  return f'Kernel: \n{self._kernel_text}'
+    def __repr__(self): return self.__str__()
+
--- a/xlib/avecl/_internal/backend/OpenCL/OpenCL.py
+++ b/xlib/avecl/_internal/backend/OpenCL/OpenCL.py
@ -0,0 +1,278 @@
+"""
+Minimal OpenCL 1.2 low level ctypes API.
+"""
+import ctypes
+from ctypes import POINTER, create_string_buffer, sizeof, c_char_p, c_char, c_size_t, c_void_p, byref
+from ctypes.util import find_library
+from enum import IntEnum
+
+dlls_by_name = {}
+def dll_import(dll_name):
+    dll = dlls_by_name.get(dll_name, None)
+    if dll is None:
+        try:
+            dll = ctypes.cdll.LoadLibrary(find_library(dll_name))
+        except:
+            pass
+        if dll is None:
+            raise RuntimeError(f'Unable to load {dll_name} library.')
+        dlls_by_name[dll_name] = dll
+
+    def decorator(func):
+        dll_func = getattr(dll, func.__name__)
+        anno = list(func.__annotations__.values())
+        dll_func.argtypes = anno[:-1]
+        dll_func.restype = anno[-1]
+        def wrapper(*args):
+            return dll_func(*args)
+        return wrapper
+    return decorator
+
+class cl_char(ctypes.c_int8): pass
+class cl_uchar(ctypes.c_uint8): pass
+class cl_short(ctypes.c_int16): pass
+class cl_ushort(ctypes.c_uint16): pass
+class cl_int(ctypes.c_int32): pass
+class cl_uint(ctypes.c_uint32): pass
+class cl_long(ctypes.c_int64): pass
+class cl_ulong(ctypes.c_uint64): pass
+class cl_half(ctypes.c_uint16): pass
+class cl_float(ctypes.c_float): pass
+class cl_double(ctypes.c_double): pass
+class cl_bool(cl_uint): pass
+class cl_bitfield(cl_ulong):
+    def __or__(self, other):
+        assert isinstance(other, self.__class__)
+        return self.__class__(self.value | other.value)
+    def __and__(self, other):
+        assert isinstance(other, self.__class__)
+        return self.__class__(self.value & other.value)
+    def __xor__(self, other):
+        assert isinstance(other, self.__class__)
+        return self.__class__(self.value ^ other.value)
+    def __not__(self):
+        return self.__class__(~self.value)
+    def __contains__(self, other):
+        assert isinstance(other, self.__class__)
+        return (self.value & other.value) == other.value
+    def __hash__(self):
+        return self.value.__hash__()
+    def __eq__(self, other):
+        if not isinstance(other, self.__class__):
+            return False
+        else:
+            return self.value == other.value
+    def __ne__(self, other):
+        return not(self == other)
+    def __repr__(self):
+        return f'cl_bitfield: {self.value}'
+
+class CLERROR(IntEnum):
+    SUCCESS =                                  0
+    DEVICE_NOT_FOUND =                         -1
+    DEVICE_NOT_AVAILABLE =                     -2
+    COMPILER_NOT_AVAILABLE =                   -3
+    MEM_OBJECT_ALLOCATION_FAILURE =            -4
+    OUT_OF_RESOURCES =                         -5
+    OUT_OF_HOST_MEMORY =                       -6
+    PROFILING_INFO_NOT_AVAILABLE =             -7
+    MEM_COPY_OVERLAP =                         -8
+    IMAGE_FORMAT_MISMATCH =                    -9
+    IMAGE_FORMAT_NOT_SUPPORTED =               -10
+    BUILD_PROGRAM_FAILURE =                    -11
+    MAP_FAILURE =                              -12
+    MISALIGNED_SUB_BUFFER_OFFSET =             -13
+    EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST = -14
+    INVALID_VALUE =                            -30
+    INVALID_DEVICE_TYPE =                      -31
+    INVALID_PLATFORM =                         -32
+    INVALID_DEVICE =                           -33
+    INVALID_CONTEXT =                          -34
+    INVALID_QUEUE_PROPERTIES =                 -35
+    INVALID_COMMAND_QUEUE =                    -36
+    INVALID_HOST_PTR =                         -37
+    INVALID_MEM_OBJECT =                       -38
+    INVALID_IMAGE_FORMAT_DESCRIPTOR =          -39
+    INVALID_IMAGE_SIZE =                       -40
+    INVALID_SAMPLER =                          -41
+    INVALID_BINARY =                           -42
+    INVALID_BUILD_OPTIONS =                    -43
+    INVALID_PROGRAM =                          -44
+    INVALID_PROGRAM_EXECUTABLE =               -45
+    INVALID_KERNEL_NAME =                      -46
+    INVALID_KERNEL_DEFINITION =                -47
+    INVALID_KERNEL =                           -48
+    INVALID_ARG_INDEX =                        -49
+    INVALID_ARG_VALUE =                        -50
+    INVALID_ARG_SIZE =                         -51
+    INVALID_KERNEL_ARGS =                      -52
+    INVALID_WORK_DIMENSION =                   -53
+    INVALID_WORK_GROUP_SIZE =                  -54
+    INVALID_WORK_ITEM_SIZE =                   -55
+    INVALID_GLOBAL_OFFSET =                    -56
+    INVALID_EVENT_WAIT_LIST =                  -57
+    INVALID_EVENT =                            -58
+    INVALID_OPERATION =                        -59
+    INVALID_GL_OBJECT =                        -60
+    INVALID_BUFFER_SIZE =                      -61
+    INVALID_MIP_LEVEL =                        -62
+    INVALID_GLOBAL_WORK_SIZE =                 -63
+    INVALID_PROPERTY =                         -64
+    INVALID_GL_SHAREGROUP_REFERENCE_KHR =      -1000
+    PLATFORM_NOT_FOUND_KHR =                   -1001
+
+class CLRESULT(cl_int):
+    def __eq__(self, other):
+        if isinstance(other, int):
+            return self.value == other
+        elif isinstance(other, self.__class__):
+            return self.value == other.value
+        else:
+            return False
+    def __ne__(self, other):
+        return not(self == other)
+    def __hash__(self):
+        return self.value.__hash__()
+    def __str__(self):
+        try:
+            return f'CLRESULT ({str(CLERROR(self.value))})'
+        except:
+            return f'CLRESULT ({self.value})'
+    def __repr__(self):
+        return self.__str__()
+
+class cl_platform_id(c_void_p): ...
+class cl_platform_info(cl_uint): ...
+class cl_device_id(c_void_p): ...
+class cl_device_type(cl_bitfield): ...
+class cl_device_info(cl_uint): ...
+class cl_context(c_void_p): ...
+class cl_context_properties(c_void_p): ...
+class cl_command_queue(c_void_p): ...
+class cl_command_queue_properties(cl_bitfield): ...
+class cl_event(c_void_p): ...
+class cl_mem(c_void_p): ...
+class cl_mem_info(cl_uint): ...
+class cl_mem_flags(cl_bitfield): ...
+class cl_program(c_void_p): ...
+class cl_program_build_info(cl_uint): ...
+class cl_kernel(c_void_p): ...
+
+# https://github.com/KhronosGroup/OpenCL-Headers/blob/master/CL/cl.h
+CL_PLATFORM_PROFILE    = cl_platform_info(0x0900)
+CL_PLATFORM_VERSION    = cl_platform_info(0x0901)
+CL_PLATFORM_NAME       = cl_platform_info(0x0902)
+CL_PLATFORM_VENDOR     = cl_platform_info(0x0903)
+CL_PLATFORM_EXTENSIONS = cl_platform_info(0x0904)
+
+CL_DEVICE_TYPE_DEFAULT     = cl_device_type( (1 << 0) )
+CL_DEVICE_TYPE_CPU         = cl_device_type( (1 << 1) )
+CL_DEVICE_TYPE_GPU         = cl_device_type( (1 << 2) )
+CL_DEVICE_TYPE_ACCELERATOR = cl_device_type( (1 << 3) )
+CL_DEVICE_TYPE_ALL         = cl_device_type( 0xFFFFFFFF )
+
+CL_DEVICE_TYPE                = cl_device_info (0x1000)
+CL_DEVICE_VENDOR_ID           = cl_device_info (0x1001)
+CL_DEVICE_MAX_COMPUTE_UNITS   = cl_device_info (0x1002)
+CL_DEVICE_GLOBAL_MEM_SIZE     = cl_device_info (0x101F)
+CL_DEVICE_NAME                = cl_device_info (0x102B)
+CL_DEVICE_VERSION             = cl_device_info (0x102F)
+CL_DEVICE_MAX_MEM_ALLOC_SIZE  = cl_device_info (0x1010)
+CL_DEVICE_MAX_WORK_GROUP_SIZE = cl_device_info (0x1004)
+CL_DRIVER_VERSION             = cl_device_info (0x102D)
+CL_DEVICE_EXTENSIONS          = cl_device_info (0x1030)
+
+# cl_mem_flags
+CL_MEM_READ_WRITE     = cl_mem_flags( (1 << 0) )
+CL_MEM_WRITE_ONLY     = cl_mem_flags( (1 << 1) )
+CL_MEM_READ_ONLY      = cl_mem_flags( (1 << 2) )
+CL_MEM_USE_HOST_PTR   = cl_mem_flags( (1 << 3) )
+CL_MEM_ALLOC_HOST_PTR = cl_mem_flags( (1 << 4) )
+CL_MEM_COPY_HOST_PTR  = cl_mem_flags( (1 << 5) )
+
+# cl_mem_info
+CL_MEM_SIZE = cl_mem_info(0x1102)
+
+# cl_program_build_info
+CL_PROGRAM_BUILD_STATUS  = cl_program_build_info(0x1181)
+CL_PROGRAM_BUILD_OPTIONS = cl_program_build_info(0x1182)
+CL_PROGRAM_BUILD_LOG     = cl_program_build_info(0x1183)
+
+
+@dll_import('OpenCL')
+def clGetPlatformIDs (num_entries : cl_uint, platforms : POINTER(cl_platform_id), num_platforms : POINTER(cl_uint) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clGetPlatformInfo (platform : cl_platform_id, param_name : cl_platform_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clGetDeviceIDs (platform : cl_platform_id, device_type : cl_device_type, num_entries : cl_uint, devices : POINTER(cl_device_id), num_devices : POINTER(cl_uint)) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clGetDeviceInfo(device : cl_device_id, param_name : cl_device_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t)) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clCreateContext(properties : cl_context_properties, num_devices : cl_uint,  devices : POINTER(cl_device_id), pfn_notify : c_void_p, user_data : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_context: ...
+
+@dll_import('OpenCL')
+def clReleaseContext(context : cl_context) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clCreateCommandQueue(context : cl_context, device : cl_device_id, properties : cl_command_queue_properties, errcode_ret : POINTER(CLRESULT) ) -> cl_command_queue: ...
+
+@dll_import('OpenCL')
+def clReleaseCommandQueue(command_queue : cl_command_queue) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clFinish(command_queue : cl_command_queue) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clWaitForEvents(num_events : cl_uint, event_list : POINTER(cl_event) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clReleaseEvent(event : cl_event) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clCreateBuffer(context : cl_context, flags : cl_mem_flags, size : c_size_t, host_ptr : c_void_p, errcode_ret : POINTER(CLRESULT) ) -> cl_mem: ...
+
+@dll_import('OpenCL')
+def clGetMemObjectInfo(memobj : cl_mem, param_name : cl_mem_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clReleaseMemObject(memobj : cl_mem) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clEnqueueReadBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_read : cl_bool, offset : c_size_t, cb : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clEnqueueWriteBuffer (command_queue : cl_command_queue, buffer : cl_mem, blocking_write : cl_bool, offset : c_size_t, size : c_size_t, ptr : c_void_p, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clEnqueueCopyBuffer (command_queue : cl_command_queue, src_buffer : cl_mem, dst_buffer : cl_mem, src_offset : c_size_t, dst_offset : c_size_t, cb : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : cl_event) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clEnqueueFillBuffer (command_queue : cl_command_queue, buffer : cl_mem, pattern : c_void_p, pattern_size : c_size_t, offset : c_size_t, size : c_size_t, num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clCreateProgramWithSource (context : cl_context, count : cl_uint, strings : POINTER(c_char_p), lengths : POINTER(c_size_t), errcode_ret : POINTER(CLRESULT) ) -> cl_program: ...
+
+@dll_import('OpenCL')
+def clReleaseProgram (program : cl_program) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clBuildProgram (program : cl_program, num_devices : cl_uint, device_list : POINTER(cl_device_id), options : c_char_p, pfn_notify : c_void_p, user_data : c_void_p) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def  clGetProgramBuildInfo (program : cl_program, device : cl_device_id, param_name : cl_program_build_info, param_value_size : c_size_t, param_value : c_void_p, param_value_size_ret : POINTER(c_size_t) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clCreateKernelsInProgram (program : cl_program, num_kernels : cl_uint, kernels : POINTER(cl_kernel), num_kernels_ret : POINTER(cl_uint) ) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clReleaseKernel (program : cl_kernel) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clSetKernelArg (kernel : cl_kernel, arg_index : cl_uint, arg_size : c_size_t, arg_value : c_void_p) -> CLRESULT: ...
+
+@dll_import('OpenCL')
+def clEnqueueNDRangeKernel (command_queue : cl_command_queue, kernel : cl_kernel, work_dim : cl_uint, global_work_offset : POINTER(c_size_t), global_work_size : POINTER(c_size_t), local_work_size : POINTER(c_size_t), num_events_in_wait_list : cl_uint, event_wait_list : POINTER(cl_event), event : POINTER(cl_event) ) -> CLRESULT: ...
--- a/xlib/avecl/_internal/backend/OpenCL/init.py
+++ b/xlib/avecl/_internal/backend/OpenCL/init.py
@ -0,0 +1,37 @@
+"""
+Minimal OpenCL 1.2 low level ctypes API.
+"""
+from .OpenCL import (CL_DEVICE_EXTENSIONS, CL_DEVICE_GLOBAL_MEM_SIZE,
+                     CL_DEVICE_MAX_COMPUTE_UNITS, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                     CL_DEVICE_MAX_WORK_GROUP_SIZE, CL_DEVICE_NAME,
+                     CL_DEVICE_TYPE, CL_DEVICE_TYPE_ACCELERATOR,
+                     CL_DEVICE_TYPE_ALL, CL_DEVICE_TYPE_CPU,
+                     CL_DEVICE_TYPE_DEFAULT, CL_DEVICE_TYPE_GPU,
+                     CL_DEVICE_VENDOR_ID, CL_DEVICE_VERSION, CL_DRIVER_VERSION,
+                     CL_MEM_ALLOC_HOST_PTR, CL_MEM_COPY_HOST_PTR,
+                     CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_SIZE,
+                     CL_MEM_USE_HOST_PTR, CL_MEM_WRITE_ONLY,
+                     CL_PLATFORM_EXTENSIONS, CL_PLATFORM_NAME,
+                     CL_PLATFORM_PROFILE, CL_PLATFORM_VENDOR,
+                     CL_PLATFORM_VERSION, CL_PROGRAM_BUILD_LOG,
+                     CL_PROGRAM_BUILD_OPTIONS, CL_PROGRAM_BUILD_STATUS,
+                     CLERROR, CLRESULT, byref, c_char, c_char_p, c_size_t,
+                     c_void_p, cl_bitfield, cl_bool, cl_char, cl_command_queue,
+                     cl_command_queue_properties, cl_context,
+                     cl_context_properties, cl_device_id, cl_device_info,
+                     cl_device_type, cl_double, cl_event, cl_float, cl_half,
+                     cl_int, cl_kernel, cl_long, cl_mem, cl_mem_info,
+                     cl_platform_id, cl_platform_info, cl_program,
+                     cl_program_build_info, cl_short, cl_uchar, cl_uint,
+                     cl_ulong, cl_ushort, clBuildProgram, clCreateBuffer,
+                     clCreateCommandQueue, clCreateContext,
+                     clCreateKernelsInProgram, clCreateProgramWithSource,
+                     clEnqueueCopyBuffer, clEnqueueFillBuffer,
+                     clEnqueueNDRangeKernel, clEnqueueReadBuffer,
+                     clEnqueueWriteBuffer, clFinish, clGetDeviceIDs,
+                     clGetDeviceInfo, clGetMemObjectInfo, clGetPlatformIDs,
+                     clGetPlatformInfo, clGetProgramBuildInfo,
+                     clReleaseCommandQueue, clReleaseContext, clReleaseEvent,
+                     clReleaseKernel, clReleaseMemObject, clReleaseProgram,
+                     clSetKernelArg, clWaitForEvents, create_string_buffer,
+                     ctypes, sizeof)
--- a/xlib/avecl/_internal/backend/init.py
+++ b/xlib/avecl/_internal/backend/init.py
@ -0,0 +1,6 @@
+from .Buffer import Buffer
+from .Device import (Device, cleanup_devices, get_available_devices_info,
+                     get_best_device, get_default_device, get_device,
+                     set_default_device)
+from .DeviceInfo import DeviceInfo
+from .Kernel import Kernel
--- a/xlib/avecl/_internal/info/BroadcastInfo.py
+++ b/xlib/avecl/_internal/info/BroadcastInfo.py
@ -0,0 +1,74 @@
+from typing import List
+
+import numpy as np
+
+from ..AShape import AShape
+from ..AAxes import AAxes
+
+class BroadcastInfo:
+    """
+    Broadcast info for multi shapes.
+
+    arguments
+
+        shapes     List[AShape]
+
+    errors during the construction:
+
+        ValueError
+
+    Example:
+     shapes   = (   3,),
+                (3, 1,)
+                           #Example | comment
+
+     o_shape               #(3, 3)    broadcasted together o_shape
+
+     br_shapes             #(1, 3)    shape[0] broadcasted to o_shape ndim
+                           #(3, 1)    shape[1] broadcasted to o_shape ndim
+
+     shapes_tiles          #(3, 1)    tiles to make o_shape from br_shapes[0]
+                           #(1, 3)    tiles to make o_shape from br_shapes[1]
+
+     shapes_reduction_axes #(0,)      reduction_axes to make shapes[0] from o_shape
+                           #(1,)      reduction_axes to make shapes[1] from o_shape
+    """
+
+    __slots__ = ['o_shape', 'br_shapes', 'shapes_tiles', 'shapes_reduction_axes']
+
+    def __init__(self, shapes : List[AShape]):
+
+        highest_rank = sorted([shape.ndim for shape in shapes])[-1]
+
+        # Broadcasted all shapes to highest ndim with (1,)-s in from left side
+        br_shapes = [ (1,)*(highest_rank-shape.ndim) + shape for shape in shapes ]
+
+        # Determine o_shape from all shapes
+        o_shape = AShape( np.max( [ [ br_shape.shape[axis] for br_shape in br_shapes] for axis in range(highest_rank) ], axis=1  ) )
+
+        shapes_tiles = []
+        shapes_reduction_axes = []
+        for br_shape in br_shapes:
+            shape_tile = []
+            shape_reduction_axes = []
+            for axis, (x,y) in enumerate(zip(br_shape.shape, o_shape.shape)):
+                if x != y:
+                    if x == 1 and y != 1:
+                        shape_tile.append(y)
+                        shape_reduction_axes.append(axis)
+                    elif x != 1 and y == 1:
+                        shape_tile.append(1)
+                    else:
+                        raise ValueError(f'operands could not be broadcast together with shapes {br_shape} {o_shape}')
+                else:
+                    shape_tile.append(1)
+
+            shapes_tiles.append(shape_tile)
+            shapes_reduction_axes.append( AAxes(shape_reduction_axes) )
+
+
+        self.o_shape : AShape = AShape(o_shape)
+        self.br_shapes : List[AShape] = br_shapes
+        self.shapes_tiles : List[List] = shapes_tiles
+        self.shapes_reduction_axes : List [AAxes] = shapes_reduction_axes
+
--- a/xlib/avecl/_internal/info/ConcatInfo.py
+++ b/xlib/avecl/_internal/info/ConcatInfo.py
@ -0,0 +1,62 @@
+from ..AShape import AShape
+
+class ConcatInfo:
+    __slots__ = ['o_shape', 'axis', 'axis_sizes',  'axis_offsets']
+
+    def __init__(self, shapes, axis):
+        """
+            Concat info
+
+            arguments
+
+                shapes      Iterable of shapes
+
+            errors during the construction:
+
+                ValueError
+
+            result
+
+                .o_shape   AShape
+
+                .axis           Int     fixed axis argument
+
+                .axis_sizes     List[Int]   axis sizes for every shape in shapes
+
+                .axis_offsets   List[Int]   axis offset in o_shape
+                                            for every shape in shapes
+            """
+
+        shapes = tuple(shapes)
+
+        if len(shapes) == 0:
+            raise ValueError('shapes is empty')
+
+
+        shape = shapes[0]
+
+        if axis < 0:
+            axis = shape.ndim + axis
+        if axis < 0 or axis >= shape.ndim:
+            raise ValueError(f'Wrong axis {axis}')
+
+        fixed_shapes = [ tuple(a for i,a in enumerate(shape) if i != axis) for shape in shapes ]
+        req_shape = fixed_shapes[0]
+        if not all(shape == req_shape for shape in fixed_shapes[1:]):
+            raise ValueError(f'All shapes must match shape {tuple(a if i != axis else "*" for i,a in enumerate(shape))}')
+
+        axis_sizes = [ shape[axis] for shape in shapes ]
+
+        axis_offset = 0
+        axis_offsets = []
+        for axis_size in axis_sizes:
+            axis_offsets.append(axis_offset)
+            axis_offset += axis_size
+
+        self.o_shape = AShape( tuple(shape)[0:axis] + (sum(axis_sizes),) + tuple(shape)[axis+1:] )
+        self.axis = axis
+        self.axis_sizes = axis_sizes
+        self.axis_offsets = tuple(axis_offsets)
+
+
+
--- a/xlib/avecl/_internal/info/Conv2DInfo.py
+++ b/xlib/avecl/_internal/info/Conv2DInfo.py
@ -0,0 +1,78 @@
+from collections import Iterable
+import math
+
+class Conv2DInfo:
+    """
+    Conv2DInfo
+
+    arguments
+
+     H, W        int            axes sizes
+     KH, KW      int            kernel sizes
+     stride      int
+     dilation    int
+
+     padding     'valid'        no padding
+                 'same'         output size will be the same
+                                or divided by stride
+                 int            padding value for all sides
+
+                 Iterable of 4 ints
+                                paddings for left,top,right,bottom sides
+
+    errors during the construction:
+
+        ValueError
+
+    result:
+
+        .PADL .PADR  paddings for W axis
+        .PADT .PADB  paddings for H axis
+
+        .OH .OW      result axes
+
+        .OH_T .OW_T  result transposed axes.
+                    it is None if padding != 'valid','same'
+    """
+
+    __slots__ = ['PADL', 'PADR', 'PADT', 'PADB', 'OH', 'OW', 'OH_T', 'OW_T']
+
+    def __init__(self, H, W, KH, KW, stride, dilation, padding):
+        # Effective kernel sizes with dilation
+        EKH = (KH-1)*dilation + 1
+        EKW = (KW-1)*dilation + 1
+
+        # Determine pad size of sides
+        OH_T = OW_T = None
+        if padding == 'valid':
+            PADL = PADT = PADR = PADB = 0
+            OH_T =  H * stride + max(EKH - stride, 0)
+            OW_T =  W * stride + max(EKW - stride, 0)
+        elif padding == 'same':
+            PADL = int(math.floor((EKW - 1)/2))
+            PADT = int(math.floor((EKH - 1)/2))
+            PADR = int(math.ceil((EKW - 1)/2))
+            PADB = int(math.ceil((EKH - 1)/2))
+            OH_T = H * stride
+            OW_T = W * stride
+        elif isinstance(padding, int):
+            PADL = PADT = PADR = PADB = padding
+        elif isinstance(padding, Iterable):
+            padding = tuple(int(x) for x in padding)
+            if len(padding) != 4:
+                raise ValueError("Invalid paddings list length.")
+            PADL, PADT, PADR, PADB = padding
+        else:
+            raise ValueError("Invalid padding value.")
+
+        self.PADL = PADL
+        self.PADT = PADT
+        self.PADR = PADR
+        self.PADB = PADB
+
+        self.OH = max(1, int((H + PADT + PADB - EKH) / stride + 1) )
+        self.OW = max(1, int((W + PADL + PADR - EKW) / stride + 1) )
+        self.OH_T = OH_T
+        self.OW_T = OW_T
+
+
--- a/xlib/avecl/_internal/info/PadInfo.py
+++ b/xlib/avecl/_internal/info/PadInfo.py
@ -0,0 +1,56 @@
+from typing import List
+
+import numpy as np
+
+from ..AShape import AShape
+
+
+class PadInfo:
+    """
+    Pad info.
+
+    arguments
+
+        shape   AShape
+
+        axes_paddings   list of (l_pad, r_pad)
+
+                        if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes
+                        if [-1] == ... , same for ride-side
+
+    errors during the construction:
+
+        ValueError
+
+    result:
+
+        .o_shape   AShape
+
+    """
+
+    __slots__ = ['o_shape','axes_paddings']
+
+    def __init__(self, shape, axes_paddings : List):
+
+        if Ellipsis in axes_paddings:
+            if sum(1 if x == Ellipsis else 0 for x in axes_paddings) > 1:
+                raise ValueError('only 1 ...(ellipsis) allowed in axes_paddings')
+            if axes_paddings[0] == Ellipsis:
+                axes_paddings = ((0,0),)*(shape.ndim-(len(axes_paddings)-1))+ axes_paddings[1:]
+            elif axes_paddings[-1] == Ellipsis:
+                axes_paddings = axes_paddings[:-1] + ((0,0),)*(shape.ndim-(len(axes_paddings)-1))
+            else:
+                raise ValueError('...(ellipsis) must be at the begin or the end of axes_paddings')
+
+        if len(axes_paddings) != shape.ndim:
+            raise ValueError(f'axes_paddings should match shape.ndim {shape.ndim}')
+
+
+        self.axes_paddings = axes_paddings
+        o_shape = []
+
+        for axis, (axis_size, (l_pad, r_pad)) in enumerate(zip(shape, axes_paddings)):
+            new_axis_size = l_pad + axis_size + r_pad
+            o_shape.append(new_axis_size)
+
+        self.o_shape = AShape(o_shape)
--- a/xlib/avecl/_internal/info/ReductionInfo.py
+++ b/xlib/avecl/_internal/info/ReductionInfo.py
@ -0,0 +1,50 @@
+from ..AShape import AShape
+
+class ReductionInfo:
+    """
+    Reduction info
+
+    arguments
+
+        shape       AShape
+
+        axes        AAxes
+
+        keepdims    bool
+
+    can raise ValueError, TypeError during the construction
+    """
+
+    __slots__ = [
+        'reduction_axes',   # sorted reduction AAxes
+        'o_axes',           # remain AAxes after reduction
+        'o_shape',          # result AShape of reduction
+        'o_shape_kd',       # result AShape of reduction with keepdims
+    ]
+
+    def __init__(self, shape, axes, keepdims):
+        shape_axes = shape.axes_arange()
+
+        if axes.is_none_axes():
+            axes = shape_axes
+
+        # Check correctness of axes
+        for axis in axes:
+            if axis not in shape_axes:
+                raise ValueError(f'Wrong axis {axis} not in {shape_axes}')
+
+        self.reduction_axes = reduction_axes = axes.sorted()
+
+        # Output axes. Remove axes from shape_axes
+        self.o_axes = o_axes = shape_axes - axes
+
+        if o_axes.is_none_axes():
+            o_shape = AShape( (1,) )
+        else:
+            o_shape = shape[o_axes]
+
+        self.o_shape = o_shape
+        self.o_shape_kd = AShape( 1 if axis in reduction_axes else shape[axis] for axis in range(shape.ndim))
+
+        if keepdims:
+            self.o_shape = self.o_shape_kd
--- a/xlib/avecl/_internal/info/ReshapeInfo.py
+++ b/xlib/avecl/_internal/info/ReshapeInfo.py
@ -0,0 +1,44 @@
+from ..AShape import AShape
+
+class ReshapeInfo:
+    """
+    Reshape info.
+    can raise ValueError,TypeError during the construction
+
+    arguments
+
+        shape           AShape
+
+        target_shape    Iterable of ints
+                        can be any len and contains only one '-1'
+    Example
+
+     shape        (2, 512, 8, 8, 64)
+     target_shape (2, 512, -1)
+     o_shape      (2, 512, 4096)
+    """
+
+    __slots__ = ['o_shape']
+
+    def __init__(self, shape, target_shape):
+        o_shape = []
+
+        remain_size = shape.size
+
+        unk_axis = None
+        for t_size in target_shape:
+            t_size = int(t_size)
+            if t_size != -1:
+                mod = remain_size % t_size
+                if mod != 0:
+                    raise ValueError(f'Cannot reshape {shape} to {target_shape}.')
+                remain_size /= t_size
+            else:
+                if unk_axis is not None:
+                    raise ValueError('Can specify only one unknown dimension.')
+                unk_axis = len(o_shape)
+            o_shape.append( t_size )
+
+        if unk_axis is not None:
+            o_shape[unk_axis] = int(remain_size)
+        self.o_shape = AShape(o_shape)
--- a/xlib/avecl/_internal/info/SliceInfo.py
+++ b/xlib/avecl/_internal/info/SliceInfo.py
@ -0,0 +1,148 @@
+import math
+
+import numpy as np
+
+from ..AShape import AShape
+
+
+class SliceInfo:
+    __slots__ = ['o_shape', 'o_shape_kd', 'just_reshaped','axes_bes','axes_abs_bes']
+
+    def __init__(self, shape : AShape, slices):
+        """
+        Slice info.
+        can raise ValueError,TypeError during the construction
+
+        arguments
+
+            slices
+
+        Example
+
+
+        o_shape    result shape after slice
+
+        axes_bes        list of (begin,step,end) per axis
+                        if s==0, then single axis element is fetched from position b
+                        s can be negative.
+        """
+        # Validate slices argument for given shape.
+        new_slices = []
+        before_ellipsis = None
+
+        for s in slices:
+            if s is Ellipsis:
+                before_ellipsis = new_slices
+                new_slices = []
+                continue
+            elif s is not None and not isinstance(s, (int,tuple) ):
+                raise ValueError(f'unknown slice argument {s} of type {s.__class__}')
+
+            new_slices.append(s)
+
+        if before_ellipsis is not None:
+            # Process Ellipsis separator
+            new_slices_n_axes = sum([ 1 for x in new_slices if x != None])
+            before_ellipsis_n_axes = sum([ 1 for x in before_ellipsis if x != None])
+
+            # Expand slices by filling intermediate (None,None,None) for each remaining axis
+            new_slices = before_ellipsis + \
+                         [(None,None,None)]*max(0, shape.ndim-before_ellipsis_n_axes-new_slices_n_axes) + \
+                         new_slices
+
+        new_slices_n_axes = sum([ 1 for x in new_slices if x != None])
+        if new_slices_n_axes > shape.ndim:
+            raise ValueError('slices arguments more than shape axes')
+        elif new_slices_n_axes < shape.ndim:
+            # Fill remaining axes
+            new_slices += [(None,None,None)]*( shape.ndim - new_slices_n_axes )
+
+        slices = tuple(new_slices)
+
+        # Compute shapes
+        output_is_reshaped = True  # Flag determines that output_tensor
+                                   # can be just reshaped without any computation
+        o_shape = []          # output tensor shape
+        o_shape_kd = [] # output shape used in kernel, must match input shape
+        axes_bes = []
+        axes_abs_bes = []
+
+        i_axis = 0
+
+        # Process slices arguments
+        for v in slices:
+            if v is None:
+                # None is new axis
+                # We can add unlimited number of (1,) axes at any place of shape
+                o_shape.append(1)
+                continue
+
+            i_axis_size = shape[i_axis]
+            i_axis += 1
+
+            if isinstance(v, int):
+                if v < 0:
+                    v += i_axis_size
+                if v < 0 or v >= i_axis_size:
+                    raise ValueError(f'index {v} is out of bounds for axis {i_axis} with size {i_axis_size}')
+                b,e,s = v,v,0
+            else:
+                b,e,s = v
+                if s == 0:
+                    raise ValueError(f'slice step cannot be zero')
+
+            # Fix begin, end, step values
+            if s is None:
+                s = 1
+            if b is None:
+                b = 0 if s >= 0 else i_axis_size-1
+            if e is None:
+                e = i_axis_size if s >= 0 else -1
+            elif e < 0:
+                e += i_axis_size
+            if b < 0:
+                b += i_axis_size
+
+            if s >= 0:
+                b = np.clip(b, 0, i_axis_size)
+                e = np.clip(e, 0, i_axis_size)
+                if b > e:
+                    raise ValueError('for positive step, begin cannot be > end.')
+
+                abs_b, abs_e, abs_s = b,e,s
+            else:
+                b = np.clip(b, 0, i_axis_size-1)
+                e = np.clip(e, -1, i_axis_size)
+
+                if b <= e:
+                    raise ValueError('for negative step, begin cannot be <= end.')
+
+                abs_s = -s
+                abs_e = b + 1
+                abs_b = b - (math.ceil( (b-e) / abs_s ) -1) * abs_s
+
+            # for every o_shape_kd axis
+            # we have exact begin,step values to fetch value from input
+            axes_bes.append( (b,e,s))
+            axes_abs_bes.append( (abs_b, abs_e, abs_s))
+
+            if i_axis_size != 1 and not (b == 0 and e == i_axis_size and s == 1):
+                # Such params of axis slice will change input, thus output cannot be as just reshaped input
+                output_is_reshaped = False
+
+            # Compute output_axis_size based on begin,end,step
+            o_axis_size = max(0, math.ceil ( (e-b) / (s if s != 0 else 1) ) )
+
+            if o_axis_size >= 1:
+                # >= 1 : select range of indexes, axis will remain
+                o_shape.append(o_axis_size)
+            # ^ othwerwise axis will be supressed
+
+            # o_shape with keepdims, must match ndim of input shape
+            o_shape_kd.append( max(1,o_axis_size) )
+
+        self.just_reshaped = output_is_reshaped
+        self.o_shape = AShape(o_shape)
+        self.o_shape_kd = AShape(o_shape_kd)
+        self.axes_bes = axes_bes
+        self.axes_abs_bes = axes_abs_bes
--- a/xlib/avecl/_internal/info/StackInfo.py
+++ b/xlib/avecl/_internal/info/StackInfo.py
@ -0,0 +1,37 @@
+from ..AShape import AShape
+
+class StackInfo:
+    __slots__ = ['o_shape', 'axis']
+
+    def __init__(self, shape, axis, stack_count):
+        """
+        Stack info
+
+        arguments
+
+            shape           AShape
+
+            axis            Int
+
+            stack_count     Int
+
+        errors during the construction:
+
+            ValueError
+
+        result:
+
+            .o_shape       AShape
+
+            .axis     Int       positive axis argument
+        """
+        if axis < 0:
+            axis = shape.ndim + 1 + axis
+        if axis < 0 or axis > shape.ndim:
+            raise ValueError(f'Wrong axis {axis}')
+
+        if stack_count <= 0:
+            raise ValueError(f'Invalid stack_count {stack_count}')
+
+        self.o_shape = AShape( tuple(shape)[0:axis] + (stack_count,) + tuple(shape)[axis:] )
+        self.axis = axis
--- a/xlib/avecl/_internal/info/TileInfo.py
+++ b/xlib/avecl/_internal/info/TileInfo.py
@ -0,0 +1,52 @@
+import numpy as np
+from ..AShape import AShape, AShape
+
+class TileInfo:
+    """
+    Tile info.
+
+    arguments
+
+        shape   AShape
+
+        tiles   Iterable of ints
+
+    errors during the construction:
+
+        ValueError
+
+    result:
+
+        .o_shape   AShape
+
+        .axes_slices    list of slice() to fetch original shape
+                        from o_shape for each tile
+    """
+
+    __slots__ = ['o_shape', 'axes_slices']
+
+    def __init__(self, shape, tiles):
+        if len(tiles) != shape.ndim:
+            raise ValueError(f'tiles should match shape.ndim {shape.ndim}')
+
+        self.o_shape = AShape(dim*tiles[i] for i,dim in enumerate(shape))
+
+        c = [0]*shape.ndim
+
+        axes_offsets = []
+        for n in range(np.prod(tiles)):
+            axes_offsets.append( c.copy() )
+            for i in range(shape.ndim-1,-1,-1):
+                c[i] += 1
+                if c[i] < tiles[i]:
+                    break
+                c[i] = 0
+
+        axes_slices = []
+        for axes_offset in axes_offsets:
+            sl = []
+            for axis,tile in enumerate(axes_offset):
+                axis_size = shape[axis]
+                sl.append( slice(axis_size*tile, axis_size*(tile+1)) )
+            axes_slices.append(tuple(sl))
+        self.axes_slices = tuple(axes_slices)
--- a/xlib/avecl/_internal/info/TransposeInfo.py
+++ b/xlib/avecl/_internal/info/TransposeInfo.py
@ -0,0 +1,37 @@
+from ..AShape import AShape
+from ..AAxes import AAxes
+
+class TransposeInfo:
+    """
+    TransposeInfo
+
+    arguments
+
+        shape           AShape
+
+        axes_order      AAxes
+
+    errors during the construction:
+
+        ValueError
+
+    result
+
+        .o_shape   AShape
+
+        .no_changes     bool       transpose changes nothing
+
+    """
+
+    __slots__ = ['no_changes', 'o_shape']
+
+    def __init__(self, shape : AShape, axes_order : AAxes):
+        if shape.ndim != axes_order.ndim:
+            raise ValueError('axes must match the shape')
+
+        # Axes order changes nothing?
+        self.o_shape = shape[axes_order]
+        self.no_changes = axes_order == shape.axes_arange()
+
+
+
--- a/xlib/avecl/_internal/info/init.py
+++ b/xlib/avecl/_internal/info/init.py
@ -0,0 +1,10 @@
+from .BroadcastInfo import BroadcastInfo
+from .ConcatInfo import ConcatInfo
+from .PadInfo import PadInfo
+from .ReductionInfo import ReductionInfo
+from .ReshapeInfo import ReshapeInfo
+from .SliceInfo import SliceInfo
+from .StackInfo import StackInfo
+from .TileInfo import TileInfo
+from .TransposeInfo import TransposeInfo
+from .Conv2DInfo import Conv2DInfo
--- a/xlib/avecl/_internal/initializer/InitCoords2DArange.py
+++ b/xlib/avecl/_internal/initializer/InitCoords2DArange.py
@ -0,0 +1,89 @@
+import numpy as np
+
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+from .Initializer import Initializer
+
+class InitCoords2DArange(Initializer):
+
+    def __init__(self, h_start, h_stop, w_start, w_stop):
+        """
+        Initialize (...,H,W,D) tensor with coords arange
+        D == 2(x,y) or 3 (x,y,1)
+
+        arguments
+
+            h_start     float     height start value (inclusive)
+
+            h_stop      float     height stop value (inclusive)
+
+            w_start     float     width start value (inclusive)
+
+            w_stop      float     width stop value (inclusive)
+
+        """
+        super().__init__()
+        self._h_start = h_start
+        self._h_stop = h_stop
+        self._w_start = w_start
+        self._w_stop = w_stop
+
+    def initialize_tensor(self, tensor : Tensor):
+        shape = tensor.shape
+        dtype = tensor.dtype
+
+        if shape.ndim < 3:
+            raise ValueError(f'tensor.shape.ndim must be >= 3 (...,H,W,D)')
+
+        OH,OW,OD = shape[-3:]
+        if OD not in [2,3]:
+            raise ValueError(f'last dim D {OD} must == 2 or 3')
+
+        if OH > 1:
+            h_step = (self._h_stop-self._h_start) / (OH-1)
+        else:
+            h_step = 0
+
+        if OW > 1:
+            w_step = (self._w_stop-self._w_start) / (OW-1)
+        else:
+            w_step = 0
+
+        key = (InitCoords2DArange, dtype)
+        kernel = SCacheton.get_var(key)
+        if kernel is None:
+            kernel = Kernel(kernel_text=f"""
+
+{HKernel.define_tensor_type('O', tensor.dtype)}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME  , float h_start, float h_step
+                                                    , float w_start, float w_step,
+                                                    uint O0, uint O1, uint O2)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', 3)}
+
+    O_TYPE v;
+    if (o2 == 0)
+        v = w_start+o1*w_step;
+    else
+    if (o2 == 1)
+        v = h_start+o0*h_step;
+    else
+        v = 1;
+
+    O_GLOBAL_STORE(gid, v);
+}}
+""")
+            SCacheton.set_var(key, kernel)
+
+        tensor.get_device().run_kernel( kernel, tensor.get_buffer(),
+                             np.float32(self._h_start), np.float32(h_step),
+                             np.float32(self._w_start), np.float32(w_step),
+                             np.uint32(OH), np.uint32(OW), np.uint32(OD),
+                             global_shape=(shape.size,) )
+
+    def __str__(self):  return f'CoordsArange'
--- a/xlib/avecl/_internal/initializer/InitRandomUniform.py
+++ b/xlib/avecl/_internal/initializer/InitRandomUniform.py
@ -0,0 +1,62 @@
+import numpy as np
+
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+from .Initializer import Initializer
+
+
+class InitRandomUniform(Initializer):
+
+    def __init__(self, low=0, high=1):
+        """
+        arguments
+
+         low(0)   low value
+
+         high(1)  high value (exclusive)
+        """
+        super().__init__()
+        self._low = low
+        self._high = high
+
+    def initialize_tensor(self, tensor : Tensor):
+
+        key = (InitRandomUniform, self._low, self._high, tensor.dtype)
+        kernel = SCacheton.get_var(key)
+        if kernel is None:
+
+            hl = self._high-self._low
+            l = self._low
+
+            if tensor.dtype in [np.bool_,np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32]:
+                gen_expression = f'hash_uint_from_uint(gid+seed32) % {int(hl)} + {int(l)}'
+            elif tensor.dtype in [np.int64]:
+                gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
+            elif tensor.dtype in [np.uint64]:
+                gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
+            elif tensor.dtype in [np.float16, np.float32]:
+                gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}'
+            elif tensor.dtype in [np.float64]:
+                gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}'
+
+            kernel = Kernel(kernel_text=f"""
+{HKernel.include_hash()}
+{HKernel.define_tensor('O', (tensor.shape.size,), tensor.dtype )}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, uint seed32, ulong seed64)
+{{
+    size_t gid = get_global_id(0);
+    O_GLOBAL_STORE(gid, {gen_expression} );
+}}
+""")
+            SCacheton.set_var(key, kernel)
+
+
+        tensor.get_device().run_kernel( kernel, tensor.get_buffer(),
+                                        np.uint32(np.random.randint(np.iinfo(np.uint32).max, dtype=np.uint32)),
+                                        np.uint64(np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64)),
+                                        global_shape=(tensor.shape.size,) )
+
+    def __str__(self):  return f'InitRandomUniform low={self._low}, high={self._high}'
+
--- a/xlib/avecl/_internal/initializer/Initializer.py
+++ b/xlib/avecl/_internal/initializer/Initializer.py
@ -0,0 +1,17 @@
+from ..Tensor import Tensor
+
+class Initializer:
+    """
+    Base class for tensor inititalizers
+    """
+    def initialize_tensor(self, tensor : Tensor):
+        """
+        Implement initialization of tensor
+
+        You can compute the data using python and then call buffer.set(numpy_value)
+        or call kernel.run( tensor.get_device(), tensor.get_buffer, ...) to initialize the data using OpenCL
+        """
+        raise NotImplementedError()
+
+    def __str__(self): return 'Initializer'
+    def __repr__(self): return self.__str__()
--- a/xlib/avecl/_internal/initializer/init.py
+++ b/xlib/avecl/_internal/initializer/init.py
@ -0,0 +1,3 @@
+from .InitCoords2DArange import InitCoords2DArange
+from .Initializer import Initializer
+from .InitRandomUniform import InitRandomUniform
--- a/xlib/avecl/_internal/op/init.py
+++ b/xlib/avecl/_internal/op/init.py
@ -0,0 +1,21 @@
+from .any_wise import add, any_wise, div, max_, min_, mul, sqrt, square, sub
+from .binary_dilate_circle import binary_dilate_circle
+from .binary_erode_circle import binary_erode_circle
+from .binary_morph import binary_morph
+from .cast import cast
+from .concat import concat
+from .depthwise_conv2D import depthwise_conv2D
+from .gaussian_blur import gaussian_blur
+from .matmul import matmul, matmulc
+from .pad import pad
+from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
+                     reduce_sum, reduce_variance)
+from .remap import remap
+from .remap_np_affine import remap_np_affine
+from .reshape import reshape
+from .slice_ import slice_
+from .slice_set import slice_set
+from .stack import stack
+from .tile import tile
+from .transpose import transpose
+from .warp_affine import warp_affine
--- a/xlib/avecl/_internal/op/any_wise.py
+++ b/xlib/avecl/_internal/op/any_wise.py
@ -0,0 +1,111 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HArgs import HArgs
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import BroadcastInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def any_wise(op_text : str,
+             *args,
+             dtype : np.dtype = None,
+             output_t:Tensor=None) -> Tensor:
+    """
+    operator for N-wise ops with N inputs
+
+    arguments
+        op_text     example: O=(2*I0*I1)+I2
+
+        *args       List[ Tensor | number ]
+
+        dtype
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size.
+    """
+    HArgs.check_zero_get_length(args)
+    tensor_args = HArgs.filter_tensor(args, raise_on_empty=True)
+    device = HArgs.check_get_same_device(tensor_args)
+
+    shape_list, dtype_list, krn_args = HArgs.decompose(args)
+
+    op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
+
+    if output_t is None:
+        output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
+    elif output_t.shape.size != op.o_shape.size:
+        raise ValueError(f'output_t must have size {op.o_shape.size}')
+
+    device.run_kernel(op.forward_krn, output_t.get_buffer(), *krn_args)
+
+    return output_t
+
+class _AnyWiseOp:
+    def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
+        if len(shape_list) != len(dtype_list):
+            raise ValueError('len(shape_list) != len(dtype_list)')
+
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
+
+        if len(shape_list) == 1:
+            # element-wise.
+            i_shape, i_dtype = shape_list[0], dtype_list[0]
+            self.o_shape = o_shape = i_shape
+
+            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('IN', i_shape, i_dtype)}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
+{{
+size_t gid = get_global_id(0);
+
+O_TYPE O = O_GLOBAL_LOAD(gid);
+IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
+{op_text};
+O_GLOBAL_STORE(gid, O);
+}}
+""")
+        else:
+            # Multi arg.
+            self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
+
+            self.o_shape = o_shape = info.o_shape
+
+            defs, arg_defs, impls = [], [], []
+            for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
+                t_name = f'I{i}'
+                if t_shape is not None:
+                    defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
+                    arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
+                    impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
+                else:
+                    arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
+
+            defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
+
+            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{defs}
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
+{{
+size_t gid = get_global_id(0);
+{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
+{impls}
+O_TYPE O;
+{op_text};
+O_GLOBAL_STORE(gid, O);
+}}
+""")
+
+def add(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0+I1', a_t, b_t)
+def sub(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0-I1', a_t, b_t)
+def mul(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0*I1', a_t, b_t)
+def div(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=I0/I1', a_t, b_t)
+def min_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmin( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
+def max_(a_t : Tensor, b_t : Tensor) -> Tensor: return any_wise('O=fmax( I0_TO_FLOATX(I0), I1_TO_FLOATX(I1) )', a_t, b_t)
+def square(a_t : Tensor) -> Tensor:  return any_wise('O=I0*I0', a_t)
+def sqrt(a_t : Tensor) -> Tensor:    return any_wise('O=sqrt(I0_TO_FLOATX(I0))', a_t)
--- a/xlib/avecl/_internal/op/binary_dilate_circle.py
+++ b/xlib/avecl/_internal/op/binary_dilate_circle.py
@ -0,0 +1,91 @@
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import Conv2DInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def binary_dilate_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
+    """
+    Binary dilate operator using circle kernel with radius.
+
+     input_t     Tensor (...,H,W)
+
+    per-element of H,W, set 1 if any neighbor elements inside circle with radius != 0.
+    otherwise set 0.
+    """
+    op = SCacheton.get(_BinaryDilateOp, input_t.shape, input_t.dtype, int(radius), dtype)
+
+    device = input_t.get_device()
+
+    if radius <= 0 or iterations <= 0:
+        return input_t.copy()
+    else:
+        for i in range(iterations):
+            if i == 0:
+                buf_in = input_t
+            else:
+                buf_in, buf_out = buf_out, buf_in
+            if i <= 1:
+                buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
+            device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
+
+    return buf_out
+
+class _BinaryDilateOp():
+    def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if i_shape.ndim < 2:
+            raise ValueError(f'i_shape.ndim must be >= 2')
+
+        KS = radius*2+1
+        IH,IW = i_shape[-2:]
+
+        ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
+
+        self.o_shape = o_shape = i_shape
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+
+#define PADL {ci.PADL}
+#define PADT {ci.PADT}
+
+#define RADIUS {radius}
+#define KS {KS}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    {'#pragma unroll' if KS <= 16 else ''}
+    for (int kh=0; kh<KS; ++kh)
+    {'#pragma unroll' if KS <= 16 else ''}
+    for (int kw=0; kw<KS; ++kw)
+    {{
+        if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) )  <= RADIUS)
+        {{
+            int im2 = -PADT + kh + om2;
+            int im1 = -PADL + kw + om1;
+
+            I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
+                           I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
+                           : 0;
+
+            if (i_val != (I_TYPE)0)
+            {{
+                O_GLOBAL_STORE(gid, (O_TYPE) 1);
+                return;
+            }}
+        }}
+    }}
+
+    O_GLOBAL_STORE(gid, (O_TYPE) 0 );
+}}
+""")
+
+
--- a/xlib/avecl/_internal/op/binary_erode_circle.py
+++ b/xlib/avecl/_internal/op/binary_erode_circle.py
@ -0,0 +1,91 @@
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import Conv2DInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def binary_erode_circle (input_t : Tensor, radius : int = 1, iterations : int = 1, dtype=None):
+    """
+    Binary erode operator using circle kernel with radius.
+
+     input_t     Tensor (...,H,W)
+
+    per-element of H,W, set 1 if all neighbor elements inside circle with radius != 0.
+    otherwise set 0.
+    """
+    op = SCacheton.get(_BinaryErodeOp, input_t.shape, input_t.dtype, int(radius), dtype)
+
+    device = input_t.get_device()
+
+    if radius <= 0 or iterations <= 0:
+        return input_t.copy()
+    else:
+        for i in range(iterations):
+            if i == 0:
+                buf_in = input_t
+            else:
+                buf_in, buf_out = buf_out, buf_in
+            if i <= 1:
+                buf_out = Tensor( op.o_shape, op.o_dtype, device=device )
+            device.run_kernel(op.forward_krn, buf_out.get_buffer(), buf_in.get_buffer() )
+
+    return buf_out
+
+class _BinaryErodeOp():
+    def __init__(self, i_shape : AShape, i_dtype, radius, o_dtype):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if i_shape.ndim < 2:
+            raise ValueError(f'i_shape.ndim must be >= 2')
+
+        KS = radius*2+1
+        IH,IW = i_shape[-2:]
+
+        ci = Conv2DInfo(IH, IW, KS, KS, stride=1, dilation=1, padding='same')
+
+        self.o_shape = o_shape = i_shape
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+
+#define PADL {ci.PADL}
+#define PADT {ci.PADT}
+
+#define RADIUS {radius}
+#define KS {KS}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    {'#pragma unroll' if KS <= 16 else ''}
+    for (int kh=0; kh<KS; ++kh)
+    {'#pragma unroll' if KS <= 16 else ''}
+    for (int kw=0; kw<KS; ++kw)
+    {{
+        if ( hypot( (float)(kh-RADIUS), (float)(kw-RADIUS) ) <= RADIUS)
+        {{
+            int im2 = -PADT + kh + om2;
+            int im1 = -PADL + kw + om1;
+
+            I_TYPE i_val = (im1 >= 0 & im1 < Im1 & im2 >= 0 & im2 < Im2) ?
+                           I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))
+                           : 0;
+
+            if (i_val == (I_TYPE)0)
+            {{
+                O_GLOBAL_STORE(gid, (O_TYPE) 0 );
+                return;
+            }}
+        }}
+    }}
+
+    O_GLOBAL_STORE(gid, (O_TYPE) 1 );
+}}
+""")
+
+
--- a/xlib/avecl/_internal/op/binary_morph.py
+++ b/xlib/avecl/_internal/op/binary_morph.py
@ -0,0 +1,43 @@
+from ..Tensor import Tensor
+from .binary_dilate_circle import binary_dilate_circle
+from .binary_erode_circle import binary_erode_circle
+from .gaussian_blur import gaussian_blur
+from .pad import pad
+
+
+def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_border : bool = False, dtype=None) -> Tensor:
+    """
+    Apply optional binary erode/dilate and optional blur.
+
+        input_t    (...,H,W) tensor. Non zero values will be treated as 1.
+
+        erode_dilate    int     >= 0    amount of pixels to dilate
+
+        blur            float   >= 0    amount of pixels to blur
+
+        fade_to_border(False)   clip the image in order
+                                to fade smoothly to the border with specified blur amount
+    """
+    x = input_t
+
+    H,W = input_t.shape[-2:]
+
+    x = pad(x, (...,(H,H),(W,W)), mode='constant', constant_value=0)
+
+    if erode_dilate > 0:
+        x = binary_erode_circle(x, radius=1, iterations=max(1,erode_dilate//2))
+    elif erode_dilate < 0:
+        x = binary_dilate_circle(x, radius=1, iterations=max(1,-erode_dilate//2) )
+
+    if fade_to_border:
+        h_clip_size = H + blur // 2
+        w_clip_size = W + blur // 2
+        x[...,:h_clip_size,:] = 0
+        x[...,-h_clip_size:,:] = 0
+        x[...,:,:w_clip_size] = 0
+        x[...,:,-w_clip_size:] = 0
+
+    if blur > 0:
+        x = gaussian_blur(x, blur * 0.250, dtype=dtype)
+
+    return x[...,H:-H,W:-W]
--- a/xlib/avecl/_internal/op/cast.py
+++ b/xlib/avecl/_internal/op/cast.py
@ -0,0 +1,17 @@
+from ..Tensor import Tensor
+
+from .any_wise import any_wise
+
+def cast(input_t : Tensor, dtype, output_t:Tensor=None) -> Tensor:
+    """
+    cast operator
+
+    arguments
+        input_t
+
+        dtype
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size.
+    """
+    return any_wise('O=I0', input_t, dtype=dtype, output_t=output_t)
--- a/xlib/avecl/_internal/op/concat.py
+++ b/xlib/avecl/_internal/op/concat.py
@ -0,0 +1,70 @@
+from ..backend import Kernel
+from ..HArgs import HArgs
+from ..HType import HType
+from ..HKernel import HKernel
+from ..info import ConcatInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def concat(tensor_list, axis, dtype=None, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    arguments
+
+        tensor_list     Iterable
+
+        axis            Int
+
+        dtype           np.dtype
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape,
+                            but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+    """
+    tensor_list = tuple(tensor_list)
+    HArgs.check_zero_get_length(tensor_list)
+    HArgs.check_all_tensors(tensor_list)
+
+    device = HArgs.check_get_same_device(tensor_list)
+    shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
+
+    op = SCacheton.get(_ConcatOp, shape_list, dtype_list, dtype, int(axis), False if output_t is None else is_add_to_output)
+
+    if output_t is None:
+        output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
+    elif output_t.shape.size != op.info.o_shape.size:
+        raise ValueError(f'output_t must have size {op.info.o_shape.size}')
+
+    for forward_krn,t in zip(op.forward_krns,tensor_list):
+        device.run_kernel(forward_krn, output_t.get_buffer(), t.get_buffer(), global_shape=(t.shape.size,) )
+
+    return output_t
+
+class _ConcatOp:
+    def __init__(self, shape_list, dtype_list, o_dtype, axis, is_add_to_output):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
+
+        self.info = info = ConcatInfo(shape_list, axis)
+
+        self.forward_krns = forward_krns = []
+
+        for i, (shape, dtype) in enumerate(zip(shape_list, dtype_list)):
+            forward_krn = Kernel(f"""
+
+{HKernel.define_tensor('O', info.o_shape, o_dtype )}
+{HKernel.define_tensor('I', shape, dtype)}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'I', shape.ndim)}
+
+    i{info.axis} += {info.axis_offsets[i]};
+
+    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', info.o_shape.ndim)}), I_GLOBAL_LOAD(gid) );
+}}
+""")
+            forward_krns.append(forward_krn)
--- a/xlib/avecl/_internal/op/depthwise_conv2D.py
+++ b/xlib/avecl/_internal/op/depthwise_conv2D.py
@ -0,0 +1,107 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import BroadcastInfo, Conv2DInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def depthwise_conv2D (input_t : Tensor, kernel_t : Tensor, stride=1, dilation=1, padding='same', dtype=None):
+    """
+    Depthwise Conv2D operator.
+
+     input_t     Tensor (...,H,W)
+
+     kernel_t    Tensor (...,H,W)
+
+     stride(1)       int
+
+     dilation(1)     int
+
+     padding(same)   'valid'         no padding
+                     'same'          output size will be the same
+                                     or divided by stride
+                     int             padding value for all sides
+                     Iterable of 4 ints
+                                paddings for left,top,right,bottom sides
+
+    ...-head part of shapes will be broadcasted to each other
+    """
+
+    op = SCacheton.get(_DepthwiseConv2DOp, input_t.shape, input_t.dtype, kernel_t.shape, kernel_t.dtype, dtype, int(stride), int(dilation), padding)
+
+    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
+    output_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), kernel_t.get_buffer())
+
+    return output_t
+
+class _DepthwiseConv2DOp():
+    def __init__(self, i_shape : AShape, i_dtype, k_shape : AShape, k_dtype, o_dtype, stride, dilation, padding):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if i_shape.ndim < 2:
+            raise ValueError(f'i_shape.ndim must be >= 2')
+
+        if k_shape.ndim < 2:
+            raise ValueError(f'k_shape.ndim must be >= 2')
+
+        IH,IW = i_shape[-2:]
+        KH,KW = k_shape[-2:]
+
+        ci = Conv2DInfo(IH, IW, KH, KW, stride, dilation, padding)
+
+        if i_shape.ndim == 2 and k_shape.ndim == 2:
+            # nothing to broadcast
+            i_br_shape = i_shape
+            k_br_shape = k_shape
+
+            o_shape = AShape([ci.OH, ci.OW])
+        else:
+            op = BroadcastInfo([ i_shape[:-2], k_shape[:-2] ])
+
+            i_br_shape = op.br_shapes[0] + i_shape[-2:]
+            k_br_shape = op.br_shapes[1] + k_shape[-2:]
+
+            o_shape = op.o_shape + [ci.OH, ci.OW]
+
+        self.o_shape = o_shape
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_br_shape, i_dtype)}
+{HKernel.define_tensor('K', k_br_shape, k_dtype)}
+
+#define PADL {ci.PADL}
+#define PADT {ci.PADT}
+
+#define STRIDE {stride}
+#define DILATION {dilation}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const K_PTR_TYPE* K_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    float v = 0.0;
+    {'#pragma unroll' if KH <= 9 else ''}
+    for (int km2=0; km2<Km2; ++km2)
+    {{
+        int im2 = -PADT + km2*DILATION + om2*STRIDE;
+        if (im2 >= 0 & im2 < Im2)
+            {'#pragma unroll' if KW <= 9 else ''}
+            for (int km1=0; km1<Km1; ++km1)
+            {{
+                int im1 = -PADL + km1*DILATION + om1*STRIDE;
+                if (im1 >= 0 & im1 < Im1)
+                    v += ((float)(I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='im2,im1' )}))))
+                                 *K_GLOBAL_LOAD(K_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='km2,km1' )}));
+            }}
+    }}
+
+    O_GLOBAL_STORE(gid, (O_TYPE) v);
+}}
+""")
+
+
--- a/xlib/avecl/_internal/op/gaussian_blur.py
+++ b/xlib/avecl/_internal/op/gaussian_blur.py
@ -0,0 +1,39 @@
+import numpy as np
+
+from ..Tensor import Tensor
+from .depthwise_conv2D import depthwise_conv2D
+
+
+def gaussian_blur (input_t : Tensor, sigma, dtype=None) -> Tensor:
+    """
+    arguments
+
+        input_t     Tensor(...,H,W)
+
+        sigma       float
+
+
+    """
+    if sigma <= 0.0:
+        return input_t.copy() #TODO
+
+    device = input_t.get_device()
+
+    key = (gaussian_blur, sigma)
+    kernel_t = device.get_cached_data(key)
+    if kernel_t is None:
+        kernel_t = Tensor.from_value( _make_gaussian_kernel(sigma, np.float32), device=device )
+        device.set_cached_data(key, kernel_t)
+
+    output_t = depthwise_conv2D(input_t, kernel_t, dtype=dtype)
+    return output_t
+
+def _make_gaussian_kernel(sigma : float, dtype):
+    kernel_size = max(3, int(2 * 2 * sigma))
+    if kernel_size % 2 == 0:
+        kernel_size += 1
+    mean = np.floor(0.5 * kernel_size)
+    kernel_1d = np.array([ np.exp(-(float(x) - float(mean)) ** 2 / (2 * sigma ** 2)) for x in range(kernel_size)])
+    np_kernel = np.outer(kernel_1d, kernel_1d)
+    kernel = np_kernel / np.sum(np_kernel)
+    return kernel.astype(dtype)
--- a/xlib/avecl/_internal/op/matmul.py
+++ b/xlib/avecl/_internal/op/matmul.py
@ -0,0 +1,158 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HArgs import HArgs
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def matmul(a_t : Tensor, b_t : Tensor, output_t: Tensor=None, is_add_to_output=False) -> Tensor:
+    """
+    matmul operator in row-major format
+
+     A(...,M,K) x
+     B(...,K,N) =
+      (...,M,N)
+
+     arguments
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape,
+                            but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+    """
+
+    return matmulc(b_t, a_t, output_t=output_t, is_add_to_output=is_add_to_output)
+
+
+
+def matmulc(a_t : Tensor, b_t : Tensor, output_t : Tensor = None, is_add_to_output=False) -> Tensor:
+    """
+    matmul operator in col-major format
+
+        A(...,K,M) x
+        B(...,N,K) =
+         (...,N,M)
+
+    arguments
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape,
+                            but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+    """
+    device = HArgs.check_get_same_device([a_t, b_t])
+
+    op = SCacheton.get(_MatmulOp, a_t.shape, a_t.dtype, b_t.shape, b_t.dtype, False if output_t is None else is_add_to_output)
+
+    if output_t is None:
+        output_t = Tensor (op.o_shape, op.o_dtype, device=device )
+    elif output_t.shape.size != op.o_shape.size:
+        raise ValueError(f'output_t must have size {op.o_shape.size}')
+
+    device.run_kernel(op.forward_krn, output_t.get_buffer(), a_t.get_buffer(), b_t.get_buffer(), )
+
+    return output_t
+
+
+class _MatmulOp:
+    def __init__(self, a_shape, a_dtype, b_shape, b_dtype, is_add_to_output):
+        a_dtype = np.dtype(a_dtype).type
+        b_dtype = np.dtype(b_dtype).type
+
+        if a_dtype != np.float32 or b_dtype != np.float32:
+            raise ValueError('matmul works only with float32 tensors.')
+
+        if a_shape.ndim != b_shape.ndim:
+            raise ValueError(f'ndims are not equal. {a_shape.ndim} != {b_shape.ndim}')
+
+        ndim = a_shape.ndim
+        if ndim < 2:
+            raise ValueError('Tensors ndim must be at least 2.')
+
+        K, M = a_shape[-2], a_shape[-1]
+        N, B_COLS = b_shape[-2], b_shape[-1]
+
+        if K != B_COLS:
+            raise ValueError('A_ROWS != B_COLS')
+
+        BATCH = a_shape[0:-2].size
+        B_BATCH = b_shape[0:-2].size
+
+        if BATCH != B_BATCH:
+            raise ValueError(f'BATCH size {BATCH} != {B_BATCH} in shapes {a_shape} {b_shape}')
+
+        if ndim == 2:
+            self.o_shape = AShape( (N, M) )
+        else:
+            self.o_shape = AShape( a_shape[:-2]+(N, M) )
+        self.o_dtype = np.float32
+
+        self.M = M
+        self.N = N
+        self.K = K
+
+        # Determining optimal tile widths
+        for MW in [8,4,2,1]:
+            if M % MW == 0:
+                break
+        for KW in [8,4,2,1]:
+            if N % KW == 0 and K % KW == 0:
+                break
+        NW = KW
+
+        self.forward_krn = Kernel(global_shape=(M//MW, N//NW, BATCH), kernel_text=f"""
+#define K {K}
+#define N {N}
+#define MW {MW}     // M tile Width
+#define NW {NW}     // N tile Width  -- NW & KW should be the same !
+#define KW {KW}     // K tile Width
+#define MT {M//MW}  // MT is max for 'mt' (M tile count)
+#define KT {K//KW}  // KT is max for 'kt' (K tile count)
+
+#define floatMW { f'float{MW}' if MW != 1 else 'float'}
+#define floatKW { f'float{KW}' if KW != 1 else 'float'}
+
+__kernel void GeMM(__global floatMW* O, const __global floatMW* restrict A, const __global floatKW* restrict B)
+{{
+    size_t mt = get_global_id(0);    //global M-tile id
+    size_t nc = get_global_id(1);    //global N-tile id
+    size_t batch = get_global_id(2);
+
+    float AT[KW][MW]; // sub tiles
+    float BT[NW][KW];
+    float CT[NW][MW];
+
+    #pragma unroll
+    for (uint i=0; i<NW*MW; ++i) // zero CT tile
+        ((float*) CT)[i] = 0.0;
+
+    for (uint kt=0; kt<KT; ++kt)  // iterate over K-dim tiles
+    {{
+        #pragma unroll
+        for (uint k=0; k<KW; ++k)  // every k-element inside K-dim tile
+            *( (floatMW*) AT[k] ) = A[batch*K*MT + (kt*KW + k)*MT + mt]; // store M-Width floats
+
+        #pragma unroll
+        for (uint n=0; n<NW; ++n)  // every n-element inside N-dim tile
+            *( (floatKW*) BT[n] ) = B[batch*N*KT + (nc*NW + n)*KT + kt]; // store K-Width floats
+
+        #pragma unroll
+        for (uint k=0; k<KW; ++k)
+        #pragma unroll
+        for (uint n=0; n<NW; ++n)  // sub tiles multiplication
+        #pragma unroll
+        for (uint m=0; m<MW; ++m)
+            CT[n][m] += AT[k][m] * BT[n][k];
+    }}
+
+    #pragma unroll
+    for (uint n=0; n<NW; ++n)
+        O[ batch*N*MT + (nc*NW + n)*MT + mt] {'+=' if is_add_to_output else '='}
+                               *( (floatMW*) CT[n]);
+}}""")
+
--- a/xlib/avecl/_internal/op/pad.py
+++ b/xlib/avecl/_internal/op/pad.py
@ -0,0 +1,71 @@
+from typing import List
+
+import numpy as np
+
+from ..HType import HType
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import PadInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def pad(input_t : Tensor, axes_paddings : List, mode : str = 'constant', constant_value=0, dtype : np.dtype = None, output_t : Tensor=None) -> Tensor:
+    """
+    arguments:
+
+        axes_paddings   list of (l_pad, r_pad),
+
+                        if [0] == ... (Ellipsis), then left-side paddings will be filled with (0,0) for remain axes
+                        if [-1] == ... , same for ride-side
+
+        dtype           cast to dtype
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size
+
+    """
+    op = SCacheton.get(_PadOp, input_t.shape, input_t.dtype, dtype, tuple(axes_paddings), mode, constant_value )
+
+    if output_t is None:
+        output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
+    elif output_t.shape.size != op.o_shape.size:
+        raise ValueError(f'output_t must have size {op.o_shape.size}')
+
+    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
+
+    return output_t
+
+
+class _PadOp:
+    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_paddings, mode, constant_value ):
+        _allow_modes = ['constant']
+        if mode not in _allow_modes:
+            raise ValueError(f'Allowed pads modes: {_allow_modes}')
+
+        if mode == 'constant':
+            if not HType.is_scalar_type(constant_value):
+                raise ValueError('constan_value must be scalar')
+
+        info = PadInfo(i_shape, axes_paddings)
+
+        self.o_shape = o_shape = info.o_shape
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+size_t gid = get_global_id(0);
+
+{HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+if ({' & '.join(f'o{i} >= {l_pad} & o{i} < (O{i}-{r_pad})'  for i, (l_pad,r_pad) in enumerate(info.axes_paddings))})
+    O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX({ ','.join(f'o{i}-{l_pad}' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings)  ) }) ) );
+else
+    O_GLOBAL_STORE(gid, (O_TYPE){constant_value} );
+    //O_GLOBAL_STORE(gid, I_GLOBAL_LOAD( I_IDX_MOD({ ','.join(f' I{i} + ( (o{i}-{l_pad})*( ((o{i}-{l_pad})/I{i}) % 2 == 0 ? 1: -1) ) % I{i} ' for i,(l_pad,r_pad) in zip(range(o_shape.ndim), info.axes_paddings)  ) }) ) );
+}}""")
+        #print(self.forward_krn)
--- a/xlib/avecl/_internal/op/reduce.py
+++ b/xlib/avecl/_internal/op/reduce.py
@ -0,0 +1,214 @@
+import math
+
+import numpy as np
+
+from ..AAxes import AAxes
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import ReductionInfo, TransposeInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+from .slice_ import slice_
+from .transpose import transpose
+from .any_wise import square, sqrt
+
+
+def reduce_mean (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    Reduce mean operator.
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    return reduce_op ('mean', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
+
+def reduce_std(input_t, axes=None, keepdims=False):
+    """
+    Reduce std operator.
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    return sqrt(reduce_variance(input_t, axes, keepdims))
+
+
+def reduce_variance(input_t, axes=None, keepdims=False):
+    """
+    Reduce variance operator.
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    mean = reduce_mean(input_t, axes, keepdims=True)
+    return reduce_mean(square(input_t - mean), axes, keepdims)
+
+def moments(input_t, axes=None, keepdims=False):
+    """
+    Returns (mean, variance) of input_t
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    mean = reduce_mean(input_t, axes, keepdims)
+    mean_shape_keepdims = mean._op.info.o_shape_kd
+    var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
+    return mean, var
+
+def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    Reduce min operator.
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    return reduce_op ('min', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
+
+def reduce_max (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    Reduce max operator.
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    return reduce_op ('max', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
+
+def reduce_sum (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    Reduce sum operator.
+
+        input_t     Tensor
+
+        axes(None)  int
+                    Iterable of ints.
+                    None - all axes
+
+        keepdims(False)     keep reduced axes
+    """
+    return reduce_op ('sum', input_t, axes=axes, keepdims=keepdims, output_t=output_t, is_add_to_output=is_add_to_output)
+
+def reduce_op (op_type : str, input_t, axes=None, keepdims=False, output_t=None, is_add_to_output=False):
+    """
+    arguments
+
+        op_type             'sum' 'mean' 'min' 'max'
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape,
+                            but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+    """
+
+    op = SCacheton.get(_ReduceOp, op_type, input_t.shape, input_t.dtype, AAxes(axes, input_t.shape.ndim), keepdims)
+
+    if output_t is None:
+        output_t = Tensor ( op.info.o_shape, input_t.dtype, device=input_t.get_device() )
+    elif output_t.shape.size != op.info.o_shape.size:
+        raise ValueError(f'output_t must have size {op.info.o_shape.size}')
+
+    # Make an intermediate tensor
+    input_t_inter = transpose(input_t, op.intermediate_transpose_axes)
+
+    # Perform multistage inplace operation in intermediate tensor
+    for stage, (shape, STAGE_COLS, STAGE_VALID_COLS) in enumerate(zip(op.forward_krn_shapes, op.forward_krn_stage_cols, op.forward_krn_stage_valid_cols)):
+        input_t_inter.get_device().run_kernel(op.forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), np.int64(STAGE_COLS), np.int64(STAGE_VALID_COLS),
+                           global_shape=shape)
+
+    if op_type == 'mean':
+        # divide values in ROWS by number of COLS
+        input_t_inter.get_device().run_kernel(op.mean_div_forward_krn, input_t_inter.get_buffer(), np.int64(op.COLS), global_shape=(op.ROWS,) )
+
+    # Fetch final tensor from zero indexes using slices argument
+    slice_(input_t_inter, op.inter_slices, output_t=output_t, is_add_to_output=is_add_to_output)
+
+    return output_t
+
+
+class _ReduceOp:
+    def __init__(self, op_type, i_shape : AShape, i_dtype : np.dtype, axes : AAxes, keepdims=False):
+        self.op_type = op_type
+        self.info = info = ReductionInfo(i_shape, axes, keepdims)
+
+        # Determine transpose order for intermediate tensor, where reduction axes will be at the end
+        self.intermediate_transpose_axes = info.o_axes + info.reduction_axes
+        self.intermediate_shape = TransposeInfo(i_shape, self.intermediate_transpose_axes).o_shape
+
+        # slices argument to fetch processed tensor from zero indexes
+        self.inter_slices = ( slice(None,None,None), ) * info.o_axes.ndim + (0,) * info.reduction_axes.ndim
+
+        # COLS are reduction axes, ROWS are remaining axes
+        rows_ndim = info.o_axes.ndim
+        self.ROWS = ROWS = self.intermediate_shape[:rows_ndim].size
+        self.COLS = COLS = self.intermediate_shape[rows_ndim:].size
+
+        # Number of stages to operate COLS
+        n_stages = (COLS-1).bit_length()
+        self.forward_krn_shapes           = [ (ROWS * math.ceil(COLS/ (2**(stage+1)) ),) for stage in range(n_stages) ]
+        self.forward_krn_stage_cols       = [ math.ceil(COLS / (2**(stage+1)) ) for stage in range(n_stages) ]
+        self.forward_krn_stage_valid_cols = [ math.ceil(COLS / (2** stage   ) ) for stage in range(n_stages) ]
+
+        self.forward_krn = Kernel(f"""
+{HKernel.define_tensor('I', (1,), i_dtype)}
+
+__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS, long STAGE_COLS, long STAGE_VALID_COLS)
+{{
+    size_t gid = get_global_id(0);
+
+    size_t col = gid % STAGE_COLS;
+    size_t row = gid / STAGE_COLS;
+    size_t i_idx = row*COLS + col;
+
+    size_t other_col = col + STAGE_COLS;
+    if (other_col < STAGE_VALID_COLS)
+    {{
+        I_TYPE val_a = I_GLOBAL_LOAD(i_idx);
+        I_TYPE val_b = I_GLOBAL_LOAD(row*COLS + other_col);
+
+        {'I_TYPE val_x = val_a + val_b;' if op_type in ['sum','mean'] else
+         'I_TYPE val_x = fmin( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'min' else
+         'I_TYPE val_x = fmax( I_TO_FLOATX(val_a), I_TO_FLOATX(val_b) );' if op_type == 'max' else ''
+        }
+        I_GLOBAL_STORE(i_idx, val_x);
+    }}
+}}
+""")
+        self.mean_div_forward_krn = Kernel(f"""
+{HKernel.define_tensor('I', (1,), i_dtype)}
+__kernel void impl(__global I_PTR_TYPE* I_PTR_NAME, long COLS)
+{{
+    size_t row = get_global_id(0);
+    I_GLOBAL_STORE(row*COLS, I_GLOBAL_LOAD(row*COLS) / COLS );
+}}
+""")
--- a/xlib/avecl/_internal/op/remap.py
+++ b/xlib/avecl/_internal/op/remap.py
@ -0,0 +1,103 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import BroadcastInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def remap (input_t : Tensor, coords_t : Tensor, dtype=None) -> Tensor:
+    """
+    remap input_t in spatial axes using coords_t
+
+    arguments
+
+        input_t     Tensor( ...,IH,IW )
+
+        coords_t    Tensor( ...,OH,OW,D )
+                    OH - output height
+                    OW - output width
+                    D is (2)[x,y] coords
+
+        dtype
+
+    ...-head part of shapes will be broadcasted to each other
+    """
+
+    op = SCacheton.get(_RemapOp, input_t.shape, input_t.dtype, coords_t.shape, coords_t.dtype, dtype)
+
+    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
+
+    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), coords_t.get_buffer())
+
+    return output_t
+
+
+class _RemapOp():
+    def __init__(self, i_shape : AShape, i_dtype, c_shape : AShape, c_dtype, o_dtype):
+        if np.dtype(i_dtype).type == np.bool_:
+            raise ValueError('np.bool_ dtype of i_dtype is not supported.')
+        if np.dtype(c_dtype).type == np.bool_:
+            raise ValueError('np.bool_ dtype of c_dtype is not supported.')
+        if i_shape.ndim < 2:
+            raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
+        if c_shape.ndim < 3:
+            raise ValueError(f'Coords shape ndim must be >= 3(...,H,W,D)')
+        if c_shape[-1] != 2:
+            raise ValueError('Last coords dim must be == 2 (x,y)')
+
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if i_shape.ndim == 2 and c_shape.ndim == 3:
+            # nothing to broadcast
+
+            i_br_shape = i_shape
+            c_br_shape = c_shape
+
+            o_shape = c_shape[-3:-1]
+        else:
+            op = BroadcastInfo([ i_shape[:-2], c_shape[:-3] ])
+
+            i_br_shape = op.br_shapes[0] + i_shape[-2:]
+            c_br_shape = op.br_shapes[1] + c_shape[-3:]
+
+            o_shape = op.o_shape + c_shape[-3:-1]
+
+        self.o_shape = o_shape
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_br_shape, i_dtype)}
+{HKernel.define_tensor('C', c_br_shape[:-1], c_dtype)}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, __global const C_PTR_TYPE2* C_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
+
+    C_TYPE2 c_value = C_GLOBAL_LOAD2(C_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim)}));
+
+    float cx01 = (float) c_value.x;
+    float cy01 = (float) c_value.y;
+
+    float cx0f = floor(cx01);   int cx0 = (int)cx0f;
+    float cy0f = floor(cy01);   int cy0 = (int)cy0f;
+    float cx1f = cx0f+1;        int cx1 = (int)cx1f;
+    float cy1f = cy0f+1;        int cy1 = (int)cy1f;
+
+    float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
+    float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
+    float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
+    float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
+
+    p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
+    p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
+    p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
+    p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
+
+    O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
+}}
+""")
--- a/xlib/avecl/_internal/op/remap_np_affine.py
+++ b/xlib/avecl/_internal/op/remap_np_affine.py
@ -0,0 +1,96 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor:
+    """
+    remap affine operator for all channels using single numpy affine mat
+
+    arguments
+
+        input_t     Tensor (...,H,W)
+
+        affine_n    np.array (2,3)
+
+        dtype
+    """
+    if affine_n.shape != (2,3):
+        raise ValueError('affine_n.shape must be (2,3)')
+
+    op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)
+
+    output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
+
+    ((a, b, c),
+     (d, e, f)) = affine_n
+    if not inverse:
+        # do inverse by default, match cv2.warpAffine behaviour
+        D = a*e - b*d
+        D = 1.0 / D if D != 0.0 else 0.0
+        a, b, c, d, e, f = (  e*D, -b*D, (b*f-e*c)*D ,
+                             -d*D,  a*D, (d*c-a*f)*D )
+                             
+    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
+                                    np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )
+
+    return output_t
+
+
+class _RemapAffineOp():
+    def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
+        if np.dtype(i_dtype).type == np.bool_:
+            raise ValueError('np.bool_ dtype of i_dtype is not supported.')
+        if i_shape.ndim < 2:
+            raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
+
+        IH,IW = i_shape[-2:]
+        if o_size is not None:
+            OH,OW = o_size
+        else:
+            OH,OW = IH,IW
+
+        o_shape = AShape( (OH,OW) )
+        if i_shape.ndim > 2:
+            o_shape = i_shape[:-2] + o_shape
+
+        self.o_shape = o_shape
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
+                   float a, float b, float c,
+                   float d, float e, float f)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    float cx01 = om1*a + om2*b + c;
+    float cy01 = om1*d + om2*e + f;
+
+    float cx0f = floor(cx01);   int cx0 = (int)cx0f;
+    float cy0f = floor(cy01);   int cy0 = (int)cy0f;
+    float cx1f = cx0f+1;        int cx1 = (int)cx1f;
+    float cy1f = cy0f+1;        int cy1 = (int)cy1f;
+
+    float p00 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx0')}));
+    float p01 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy0,cx1')}));
+    float p10 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx0')}));
+    float p11 = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='cy1,cx1')}));
+
+    p00 *= (cx1f - cx01)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx0 >= 0 & cx0 < Im1);
+    p01 *= (cx01 - cx0f)*(cy1f - cy01)*(cy0 >= 0 & cy0 < Im2 & cx1 >= 0 & cx1 < Im1);
+    p10 *= (cx1f - cx01)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx0 >= 0 & cx0 < Im1);
+    p11 *= (cx01 - cx0f)*(cy01 - cy0f)*(cy1 >= 0 & cy1 < Im2 & cx1 >= 0 & cx1 < Im1);
+
+    O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
+}}
+""")
--- a/xlib/avecl/_internal/op/reshape.py
+++ b/xlib/avecl/_internal/op/reshape.py
@ -0,0 +1,26 @@
+from typing import Iterable
+
+from ..Tensor import Tensor
+from ..SCacheton import SCacheton
+from ..info import ReshapeInfo
+
+
+def reshape(input_t : Tensor, new_shape : Iterable, copy=True) -> Tensor:
+    """
+    reshape operator
+
+    arguments
+
+        new_shape    Iterable of ints
+
+        copy(True)      if True, produces new Tensor
+                        otherwise result tensor points to the same memory
+
+    Produces reference Tensor with new shape.
+    """
+    info = SCacheton.get(ReshapeInfo, input_t.shape, tuple(int(x) for x in new_shape) )
+
+    if copy:
+        return Tensor(info.o_shape, input_t.dtype, device=input_t.get_device()).set(input_t)
+    return input_t.as_shape( info.o_shape )
+
--- a/xlib/avecl/_internal/op/slice_.py
+++ b/xlib/avecl/_internal/op/slice_.py
@ -0,0 +1,67 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import SliceInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
+    """
+    arguments:
+
+        input_t     input tensor
+        slices      argument received from class.__getitem__(slices)
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+
+    Remark.
+
+    Slicing logic is not the same as numpy:
+    For example np[2:0:1] slice will produce invalid array with zero index,
+    but nn.slice() will select 2 index, same as val_t[2].
+    """
+    op = SCacheton.get(_SliceOp, input_t.shape, input_t.dtype, dtype, HType.hashable_slices(slices), False if output_t is None else is_add_to_output )
+    o_shape = op.slice_info.o_shape
+
+    if output_t is None:
+        if op.slice_info.just_reshaped:
+            return input_t.reshape(o_shape)
+        else:
+            output_t = Tensor(o_shape, op.o_dtype, device=input_t.get_device())
+
+    elif output_t.shape.size != o_shape.size:
+        raise ValueError(f'output_t must have size {o_shape.size}')
+
+    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
+
+    return output_t
+
+
+class _SliceOp:
+    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, slices, is_add_to_output):
+        self.slice_info = slice_info = SliceInfo(i_shape, slices)
+
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        self.forward_krn = Kernel(global_shape=(slice_info.o_shape_kd.size,), kernel_text=f"""
+{HKernel.define_tensor('O', slice_info.o_shape_kd, o_dtype )}
+{HKernel.define_tensor('I', i_shape, i_dtype )}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+size_t gid = get_global_id(0);
+
+{HKernel.decompose_idx_to_axes_idxs('gid', 'o', slice_info.o_shape_kd.ndim)}
+
+{chr(10).join( f'size_t i{i} = {b} + o{i} * {s}; ' for i, (b,e,s) in enumerate(slice_info.axes_bes)  )  }
+
+{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}(gid, I_GLOBAL_LOAD( I_IDX({HKernel.axes_seq_enum('i', i_shape.ndim)}) ) );
+}}
+""")
--- a/xlib/avecl/_internal/op/slice_set.py
+++ b/xlib/avecl/_internal/op/slice_set.py
@ -0,0 +1,73 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import BroadcastInfo, SliceInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+
+def slice_set(input_t : Tensor, slices, value) -> Tensor:
+    """
+    arguments:
+
+        input_t     input tensor
+        slices      argument received from class.__getitem__(slices)
+        value
+
+
+    Remark.
+
+    """
+    if HType.is_scalar_type(value):
+        v_shape = None
+        v_dtype = None
+        v_scalar = value
+    elif not isinstance(value, Tensor):
+        value = Tensor.from_value(value, dtype=input_t.dtype, device=input_t.get_device())
+        v_shape = value.shape
+        v_dtype = value.dtype
+        v_scalar = None
+
+    op = SCacheton.get(_SliceSetOp, input_t.shape, input_t.dtype, v_shape, v_dtype, v_scalar, HType.hashable_slices(slices) )
+
+    if v_scalar is not None:
+        input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer() )
+    else:
+        input_t.get_device().run_kernel(op.forward_krn, input_t.get_buffer(), value.get_buffer() )
+
+    return input_t
+
+class _SliceSetOp:
+    def __init__(self, i_shape : AShape, i_dtype : np.dtype, v_shape : AShape, v_dtype : np.dtype, v_scalar, slices):
+        slice_info = SliceInfo(i_shape, slices)
+
+        if v_scalar is None:
+            if v_shape.ndim > i_shape.ndim:
+                raise ValueError(f'v_shape.ndim {v_shape.ndim} cannot be larger than i_shape.ndim {i_shape.ndim}')
+
+            # Check that v_shape can broadcast with slice_info.shape
+            br_info = BroadcastInfo([slice_info.o_shape_kd, v_shape])
+
+            v_br_shape = br_info.br_shapes[1]
+
+        self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', i_shape, i_dtype )}
+
+{HKernel.define_tensor('I', v_br_shape, v_dtype ) if v_scalar is None else ''}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME
+               {', __global const I_PTR_TYPE* I_PTR_NAME' if v_scalar is None else ''})
+{{
+size_t gid = get_global_id(0);
+
+{HKernel.decompose_idx_to_axes_idxs('gid', 'O', slice_info.o_shape_kd.ndim)}
+
+if ({' & '.join( [f'o{i} >= {b} & o{i} < {e}' if s != 0 else f'o{i} == {b}' for i, (b,e,s) in enumerate(slice_info.axes_abs_bes)] +
+                 [f'((o{i} % {s}) == 0)' for i, (_,_,s) in enumerate(slice_info.axes_abs_bes) if s > 1 ] ) } )
+
+    O_GLOBAL_STORE(gid, {f"I_GLOBAL_LOAD( I_IDX_MOD({HKernel.axes_seq_enum('O', i_shape.ndim)}) ) " if v_scalar is None else f" (O_TYPE)({v_scalar})"} );
+}}
+""")
--- a/xlib/avecl/_internal/op/stack.py
+++ b/xlib/avecl/_internal/op/stack.py
@ -0,0 +1,74 @@
+import numpy as np
+from typing import List
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HArgs import HArgs
+from ..HKernel import HKernel
+from ..HType import HType
+from ..info import StackInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def stack(tensor_list : List[Tensor], axis, dtype=None, output_t=None, is_add_to_output=False):
+    """
+    Stack operator.
+
+    arguments:
+
+        tensor_list     List of Tensors
+
+        axis            Int
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+    """
+    HArgs.check_zero_get_length(tensor_list)
+    HArgs.check_all_tensors(tensor_list)
+
+    device = HArgs.check_get_same_device(tensor_list)
+
+    shape_list, dtype_list, _ = HArgs.decompose(tensor_list)
+
+    op = SCacheton.get(_StackOp, shape_list, dtype_list, int(axis), dtype, False if output_t is None else is_add_to_output)
+
+    if output_t is None:
+        output_t = Tensor (op.info.o_shape, op.o_dtype, device=device)
+    elif output_t.shape.size != op.info.o_shape.size:
+        raise ValueError(f'output_t must have size {op.info.o_shape.size}')
+
+    for i, krn in enumerate(op.forward_krns):
+        device.run_kernel(krn, output_t.get_buffer(), tensor_list[i].get_buffer(), np.int64(i) )
+
+    return output_t
+
+
+class _StackOp:
+    def __init__(self, shape_list : List[AShape], dtype_list : List[np.dtype], axis, o_dtype, is_add_to_output):
+        self.stack_count = stack_count = len(shape_list)
+
+        i_shape = shape_list[0]
+        if not all (s == i_shape for s in shape_list):
+            raise ValueError('All shapes must be the same')
+
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
+        self.info = info = StackInfo(i_shape, axis, stack_count)
+
+        self.forward_krns = forward_krns = []
+        for i_dtype in dtype_list:
+            forward_krns.append( Kernel(global_shape=(i_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', info.o_shape, o_dtype )}
+{HKernel.define_tensor('I', i_shape, i_dtype )}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, long i_new_idx)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'I', i_shape.ndim)}
+
+    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_seq_enum('I', i_shape.ndim, new_axis=('i_new_idx', info.axis))}), I_GLOBAL_LOAD(gid) );
+}}
+"""))
--- a/xlib/avecl/_internal/op/tile.py
+++ b/xlib/avecl/_internal/op/tile.py
@ -0,0 +1,58 @@
+import numpy as np
+from typing import List
+
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import TileInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def tile(input_t : Tensor, tiles : List[int], dtype : np.dtype = None, output_t=None, is_add_to_output=False):
+    """
+    Tile operator
+
+    arguments
+
+        tiles       Iterable of ints
+
+        dtype
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size.
+                            gradfn will not be set.
+
+        is_add_to_output    add result to output_t if output_t is set.
+    """
+
+    op = SCacheton.get(_TileOp, input_t.shape, input_t.dtype, tuple(int(tile) for tile in tiles), dtype, False if output_t is None else is_add_to_output)
+
+    if output_t is None:
+        output_t = Tensor (op.info.o_shape, op.o_dtype, device=input_t.get_device())
+    elif output_t.shape.size != op.info.o_shape.size:
+        raise ValueError(f'output_t must have size {op.info.o_shape.size}')
+
+    input_t.get_device().run_kernel( op.forward_krn, output_t.get_buffer(), input_t.get_buffer())
+
+    return output_t
+
+class _TileOp:
+    def __init__(self, i_shape : AShape, i_dtype, tiles, o_dtype, is_add_to_output):
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        self.info = info = TileInfo(i_shape, tiles)
+
+        self.forward_krn = Kernel(global_shape=(info.o_shape.size,), kernel_text=f"""
+
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+{HKernel.define_tensor('O', info.o_shape, o_dtype)}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+    size_t gid = get_global_id(0);
+    {HKernel.decompose_idx_to_axes_idxs ('gid', 'O', info.o_shape.ndim)}
+
+    {'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'} (gid, I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)})) );
+}}
+""")
+
--- a/xlib/avecl/_internal/op/transpose.py
+++ b/xlib/avecl/_internal/op/transpose.py
@ -0,0 +1,68 @@
+import numpy as np
+
+from ..AAxes import AAxes
+from ..AShape import AShape
+from ..backend import Kernel
+from ..HKernel import HKernel
+from ..info import TransposeInfo
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+
+def transpose(input_t : Tensor, axes_order, op_text=None, dtype : np.dtype = None, output_t : Tensor=None, is_add_to_output=False) -> Tensor:
+    """
+    arguments:
+
+        axes_order     Int
+                       Iterable of ints
+                       None
+
+        dtype           cast to dtype
+
+        op_text(None)    optional op with value during transpose.
+                        'O = I'
+
+        output_t            compute result to this Tensor.
+                            Tensor may be with different shape, but should match total size
+
+    """
+    op = SCacheton.get(_TransposeOp, input_t.shape, input_t.dtype, dtype, AAxes(axes_order), op_text, False if output_t is None else is_add_to_output )
+
+    if output_t is None:
+        output_t = Tensor (op.o_shape, op.o_dtype, device=input_t.get_device())
+    elif output_t.shape.size != op.o_shape.size:
+        raise ValueError(f'output_t must have size {op.o_shape.size}')
+
+    input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer() )
+
+    return output_t
+
+
+class _TransposeOp:
+    def __init__(self, i_shape : AShape, i_dtype : np.dtype, o_dtype : np.dtype, axes_order : AAxes, op_text, is_add_to_output : bool ):
+        self.axes_order = axes_order
+        self.o_shape = o_shape = TransposeInfo(i_shape, axes_order).o_shape
+        self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
+
+        if op_text is None:
+            op_text = 'O = I'
+
+        self.forward_krn = Kernel(global_shape=(i_shape.size,), kernel_text=f"""
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME)
+{{
+
+size_t gid = get_global_id(0);
+
+{HKernel.decompose_idx_to_axes_idxs('gid', 'i', i_shape.ndim)}
+
+I_TYPE I = I_GLOBAL_LOAD(gid);
+O_TYPE O;
+
+{op_text};
+
+{'O_STORE_ADD' if is_add_to_output else 'O_GLOBAL_STORE'}( O_IDX({HKernel.axes_order_enum('I', axes_order )}), O );
+
+}}""")
+
+
--- a/xlib/avecl/_internal/op/unused.zip
+++ b/xlib/avecl/_internal/op/unused.zip
--- a/xlib/avecl/_internal/op/warp_affine.py
+++ b/xlib/avecl/_internal/op/warp_affine.py
@ -0,0 +1,72 @@
+import numpy as np
+
+from ..AShape import AShape
+from ..initializer import InitCoords2DArange
+from ..SCacheton import SCacheton
+from ..Tensor import Tensor
+from .matmul import matmul
+from .remap import remap
+
+def warp_affine (input_t : Tensor, affine_t : Tensor, output_size=None, dtype=None) -> Tensor:
+    """
+    arguments
+
+        input_t     Tensor(...,H,W)
+
+        affine_t    Tensor(...,2,3)
+                    affine matrix
+
+                    example of identity affine matrix
+                    [1,0,0],
+                    [0,1,0]
+
+        ...-head part of shapes will be broadcasted to each other
+
+        output_size(None)
+
+                    tuple of 2 ints (HW)
+                    of output size
+                    if None , size will not be changed
+
+    """
+    op = SCacheton.get(_WarpAffineOp, input_t.shape, input_t.dtype,  affine_t.shape, affine_t.dtype, output_size)
+
+    affine_t = affine_t.transpose( op.affine_transpose_axes, dtype=np.float32 ).reshape( (-1,3,2) )
+
+    coords_t = Tensor(op.coords_shape, np.float32, device=input_t.get_device(), initializer=op.coords_init )
+    coords_t = coords_t.reshape(op.coords_reshape)
+    coords_t = matmul(coords_t, affine_t).reshape(op.coords_affined_shape)
+
+    output_t = remap(input_t, coords_t, dtype=dtype)
+    return output_t
+
+
+class _WarpAffineOp():
+    def __init__(self, i_shape : AShape, i_dtype, a_shape : AShape, a_dtype, o_size):
+        if np.dtype(i_dtype).type == np.bool_:
+            raise ValueError('np.bool_ dtype of i_dtype is not supported.')
+        if np.dtype(a_dtype).type == np.bool_:
+            raise ValueError('np.bool_ dtype of a_dtype is not supported.')
+        if i_shape.ndim < 2:
+            raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
+        if a_shape.ndim < 2:
+            raise ValueError(f'a_shape.ndim must be >= 2 (...,2,3)')
+        if a_shape[-2] != 2 or a_shape[-1] != 3:
+            raise ValueError('Last a_shape dims must be == (...,2,3)')
+
+        IH,IW = i_shape[-2:]
+        if o_size is not None:
+            OH,OW = o_size
+        else:
+            OH,OW = IH,IW
+
+        self.coords_shape = AShape( (OH,OW,3) )
+        self.coords_affined_shape = AShape( (OH,OW,2) )
+
+        if a_shape.ndim > 2:
+            self.coords_shape = a_shape[:-2] + self.coords_shape
+            self.coords_affined_shape = a_shape[:-2] + self.coords_affined_shape
+
+        self.coords_init = InitCoords2DArange(0,OH-1,0,OW-1)
+        self.coords_reshape = (-1,OH*OW,3)
+        self.affine_transpose_axes = a_shape.axes_arange().swapped_axes(-2,-1)