update xlib.avecl

iperov 2021-10-20 18:02:50 +04:00
commit 6da916cc66
14 changed files with 246 additions and 184 deletions

View file

@@ -3,6 +3,7 @@ AveCL ! Make OpenCL great again.
 Lightweight ndarray library using OpenCL 1.2 written in pure python.
 Applicable for high-performance general purpose n-dim array computations for every device that supports OpenCL 1.2.
+Supports any dtype except float64.
 Works in python 3.5+. Dependencies: numpy.
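
A minimal usage sketch based on the tests in this commit (the xlib.avecl import path is assumed from the commit title, not shown in the diff):

    import numpy as np
    from xlib.avecl import Tensor, op

    a = Tensor.from_value(np.random.randint(16, size=(4, 8, 8)).astype(np.float32))
    b = Tensor.from_value(np.random.randint(16, size=(4, 8, 8)).astype(np.float32))
    o = op.matmul(a, b)     # batched matmul on the default OpenCL device
    print(o.np().shape)     # back to numpy: (4, 8, 8)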

View file

@@ -57,6 +57,19 @@ class AShape(Iterable):
     def as_list(self) -> List[int]:
         return list(self.shape)
 
+    def check_axis(self, axis : int) -> int:
+        """
+        Check axis and returns normalized axis value
+
+        can raise ValueError
+        """
+        if axis < 0:
+            axis += self.ndim
+        if axis < 0 or axis >= self.ndim:
+            raise ValueError(f'axis {axis} out of bound of ndim {self.ndim}')
+        return axis
+
     def axes_arange(self) -> AAxes:
         """
         Returns tuple of axes arange.
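
check_axis is the usual negative-axis normalization onto [0, ndim); a plain-Python sketch of the same logic, outside the class:

    def check_axis(axis, ndim=4):
        if axis < 0:
            axis += ndim
        if axis < 0 or axis >= ndim:
            raise ValueError(f'axis {axis} out of bound of ndim {ndim}')
        return axis

    assert check_axis(-1) == 3   # last axis of a 4-dim shape
    assert check_axis(2) == 2    # already normalized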

View file

@@ -15,12 +15,9 @@ class HKernel:
     np.int64   : 'long',
     np.uint64  : 'ulong',
     np.float16 : 'half',
-    np.float32 : 'float',
-    np.float64 : 'double'
+    np.float32 : 'float'
     }
 
     @staticmethod
     def np_dtype_to_cl(dtype : np.dtype):
         """
@@ -134,7 +131,7 @@ class HKernel:
         out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
         out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
 
-        if dtype in [np.float32, np.float64]:
+        if dtype in [np.float32]:
             out += [f'#define {name_upper}_TO_FLOATX(x) x']
         elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
             out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
@@ -145,7 +142,10 @@ class HKernel:
     @staticmethod
     def define_ndim_idx(ndim):
         """
+        define macro to calculate index for n-dim shape
+
         example for ndim=3
 
         #define NDIM3_IDX(t0,t1,t2,T0,T1,T2) (((size_t)(t0))*T1*T2+((size_t)(t1))*T2+((size_t)(t2)))
         #define NDIM3_IDX_MOD(t0,t1,t2,T0,T1,T2) (((size_t)(t0) % T0)*T1*T2+((size_t)(t1) % T1)*T2+((size_t)(t2) % T2))
         """
@@ -165,14 +165,14 @@ class HKernel:
         """
         Returns a definitions for operations with tensor shape
 
-        example for 'O', (7,3),
+        example for 'O', (2,3),
 
-        #define O0 7
+        #define O0 2
         #define O1 3
         #define Om1 3
-        #define Om2 7
+        #define Om2 2
 
-        #define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
+        #define O_IDX(o0,o1) (((size_t)(o0))*3+((size_t)(o1)))
 
-        #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
+        #define O_IDX_MOD(o0,o1) (((size_t)(o0) % 2)*3+((size_t)(o1) % 3))
         """
         shape = tuple(shape)
         ndim = len(shape)
@@ -183,36 +183,14 @@ class HKernel:
         axes_symbols = "".join([str(i) for i in range(ndim)])
         axes_symbols = axes_symbols.upper()
 
-        out = []
-        for i in range(ndim):
-            out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}']
-
-        for i in range(1,ndim+1):
-            out += [f'#define {name_upper}m{i} {shape[-i]}']
-
-        line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) '
-        for i in range(ndim):
-            line += f'( (size_t)({name_lower}{i}) )'
-            for j in range(i+1,ndim):
-                line += f'*{shape[j]} '
-            if i != ndim-1:
-                line += '+'
-        out += [line]
-
-        line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) '
-        for i in range(ndim):
-            line += f'( (size_t)({name_lower}{i}) % {shape[i]} )'
-            for j in range(i+1,ndim):
-                line += f'*{shape[j]} '
-            if i != ndim-1:
-                line += '+'
-        out += [line,'']
+        out  = [f'#define {name_upper}{axes_symbols[i]} {shape[i]}' for i in range(ndim)]
+        out += [f'#define {name_upper}m{i} {shape[-i]}' for i in range(1,ndim+1)]
+        out += [f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) (' + \
+                '+'.join([f'((size_t)({name_lower}{i}))' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']
+        out += [f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) (' + \
+                '+'.join([f'((size_t)({name_lower}{i}) % {shape[i]})' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']
 
         return '\n'.join(out)
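
The emitted *_IDX macros are plain row-major linearization with the trailing strides baked in as constants; a Python sanity check against the (2,3) example in the docstring:

    import numpy as np
    a = np.arange(6).reshape(2, 3)
    def O_IDX(o0, o1):
        return o0 * 3 + o1   # matches '#define O_IDX(o0,o1) (((size_t)(o0))*3+((size_t)(o1)))'
    assert a[1, 2] == a.flat[O_IDX(1, 2)]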

View file

@@ -3,10 +3,10 @@ from typing import Iterable, List
 import numpy as np
 
 scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
-                np.float16, np.float32, np.float64, np.bool_]
+                np.float16, np.float32, np.bool_]
 
 np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
-                   np.float16, np.float32, np.float64, np.bool_]
+                   np.float16, np.float32, np.bool_]
 
 _np_dtype_to_cl = {
     np.bool_   : 'bool',
@@ -20,7 +20,6 @@ _np_dtype_to_cl = {
     np.int64   : 'long',
     np.float16 : 'half',
     np.float32 : 'float',
-    np.float64 : 'double',
     }
 
 _np_dtype_weight = {
@@ -34,8 +33,7 @@ _np_dtype_weight = {
     np.uint64  : 8,
     np.int64   : 9,
     np.float16 : 10,
-    np.float32 : 11,
-    np.float64 : 12,
+    np.float32 : 11
     }
 
 class HType:
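
_np_dtype_weight drives type promotion: when an op mixes input dtypes, the result dtype is the input with the highest weight, so float32 is now the top of the ladder. A sketch of the idea behind HType.get_most_weighted_dtype (table excerpted, names illustrative):

    import numpy as np
    weights = {np.int64: 9, np.float16: 10, np.float32: 11}
    def most_weighted(dtypes):
        return max(dtypes, key=lambda dt: weights[dt])
    assert most_weighted([np.int64, np.float16]) == np.float16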

View file

@@ -1,14 +1,13 @@
-import traceback
 import numpy as np
 
-from .HType import HType
-from .NCore import NCore
-from .backend import get_device, get_default_device, set_default_device
-from .Tensor import Tensor
 from . import op
-from .initializer import InitRandomUniform, InitCoords2DArange
+from .backend import get_default_device, get_device, set_default_device
+from .HType import HType
 from .info import Conv2DInfo
+from .initializer import InitCoords2DArange, InitRandomUniform
+from .NCore import NCore
+from .Tensor import Tensor
 
 class NTest():
@@ -45,6 +44,7 @@ class NTest():
                       binary_dilate_circle_test,
                       binary_morph_test,
                       cvt_color_test,
+                      rct_test,
                      ]
 
         for test_func in test_funcs:
@@ -62,18 +62,39 @@ class NTest():
 def _all_close(x,y, atol=1, btol=1):
     return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol )
 
+def rct_test():
+    for _ in range(10):
+        for dtype in [np.float16, np.float32]:
+            base_shape = list(np.random.randint(1, 8, size=4) )
+
+            shape = base_shape.copy()
+            shape[1] = 3
+
+            mask_shape = base_shape.copy()
+            mask_shape[1] = 3
+
+            print(f'rct {shape} {str(np.dtype(dtype).name)} ... ', end='', flush=True)
+
+            source_t = Tensor(shape=shape, dtype=dtype, initializer=InitRandomUniform())
+            target_t = Tensor(shape=shape, dtype=dtype, initializer=InitRandomUniform())
+            mask_t   = Tensor(shape=mask_shape, dtype=dtype, initializer=InitRandomUniform())
+
+            result_t = op.rct(target_t, source_t, target_mask_t=mask_t, source_mask_t=mask_t )
+
+            print('pass')
+
 def cvt_color_test():
     for _ in range(10):
         for shape_len in range(2,6):
             for in_mode in ['RGB','BGR','XYZ','LAB']:
                 for out_mode in ['RGB','BGR','XYZ','LAB']:
-                    for dtype in [np.float16, np.float32, np.float64]:
+                    for dtype in [np.float16, np.float32]:
                         shape = list(np.random.randint(1, 8, size=shape_len) )
 
                         ch_axis = np.random.randint(len(shape))
                         shape[ch_axis] = 3
 
-                        print(f'cvt_color {shape} {str(np.dtype(dtype).name)} {in_mode}->{out_mode} ... ', end='')
+                        print(f'cvt_color {shape} {str(np.dtype(dtype).name)} {in_mode}->{out_mode} ... ', end='', flush=True)
 
                         inp_n = np.random.uniform(size=shape ).astype(dtype)
                         inp_t = Tensor.from_value(inp_n)
@@ -81,7 +102,9 @@ def cvt_color_test():
                         out_t = op.cvt_color(inp_t, in_mode=in_mode, out_mode=out_mode, ch_axis=ch_axis)
                         inp_t2 = op.cvt_color(out_t, in_mode=out_mode, out_mode=in_mode, ch_axis=ch_axis)
 
-                        if not _all_close(inp_t.np(), inp_t2.np(), atol=0.1, btol=0.1):
+                        is_check = in_mode in ['RGB','BGR','XYZ'] and out_mode in ['XYZ','LAB']
+
+                        if is_check and not _all_close(inp_t.np(), inp_t2.np(), atol=0.1, btol=0.1):
                             raise Exception(f'data is not equal')
 
                         print('pass')
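
The is_check guard narrows the round-trip assertion to conversions that are invertible on random data: the reworked XYZ_to_RGB body in the cvt_color changes further down clamps out-of-gamut values into [0,1], so random tensors decoded to RGB/BGR lose information. A numpy sketch of the lossy case (matrix taken from the new kernel body):

    import numpy as np
    M = np.array([[ 3.240479, -1.53715 , -0.498535],
                  [-0.969256,  1.875991,  0.041556],
                  [ 0.055648, -0.204043,  1.057311]], dtype=np.float32)
    xyz = np.array([0.2, 0.9, 0.1], dtype=np.float32)   # XYZ whose RGB is out of gamut
    rgb = np.clip(M @ xyz, 0.0, 1.0)                    # clamp, as the kernel does
    xyz_back = np.linalg.inv(M) @ rgb
    assert not np.allclose(xyz, xyz_back, atol=0.1)     # round trip is lossy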
@@ -91,7 +114,7 @@ def cast_test():
         for out_dtype in HType.get_np_scalar_types():
             shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) )
 
-            print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='')
+            print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='', flush=True)
 
             val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype)
             cast_n = val_n.astype(out_dtype)
@@ -113,7 +136,7 @@ def binary_morph_test():
                 input_n = np.random.randint( 2, size=shape ).astype(dtype)
                 input_t = Tensor.from_value(input_n)
 
-                print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='')
+                print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='', flush=True)
 
                 op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True)
@@ -130,7 +153,7 @@ def binary_erode_circle_test():
                 input_n = np.random.randint( 2, size=shape ).astype(dtype)
                 input_t = Tensor.from_value(input_n)
 
-                print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
+                print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='', flush=True)
 
                 op.binary_erode_circle(input_t, radius=radius, iterations=iterations)
@@ -147,7 +170,7 @@ def binary_dilate_circle_test():
                 input_n = np.random.randint( 2, size=shape ).astype(dtype)
                 input_t = Tensor.from_value(input_n)
 
-                print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
+                print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='', flush=True)
 
                 op.binary_dilate_circle(input_t, radius=radius, iterations=iterations)
@@ -156,11 +179,11 @@ def binary_dilate_circle_test():
 
 def gaussian_blur_test():
     for shape_len in range(2,5):
-        for dtype in [np.float16, np.float32, np.float64]:
+        for dtype in [np.float16, np.float32]:
             shape = np.random.randint( 1, 64, size=(shape_len,) )
             sigma = np.random.rand() * 10
 
-            print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='')
+            print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             val_t = Tensor.from_value(val_n)
@@ -179,7 +202,7 @@ def pad_test():
                 paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) )
 
-                print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='')
+                print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='', flush=True)
 
                 val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
                 pad_n = np.pad(val_n, paddings, mode=mode)
@@ -187,7 +210,7 @@ def pad_test():
                 val_t = Tensor.from_value(val_n)
                 pad_t = op.pad(val_t, paddings, mode=mode)
 
-                print(f'{pad_n.shape} == {pad_t.shape} ... ', end='')
+                print(f'{pad_n.shape} == {pad_t.shape} ... ', end='', flush=True)
 
                 if pad_n.shape != pad_t.shape:
                     raise Exception(f'shape is not equal')
@@ -241,7 +264,7 @@ def slice_set_test():
             shape = tuple(shape)
             slices = tuple(slices)
 
-            print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
+            print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             val_t = Tensor.from_value(val_n)
@@ -330,7 +353,7 @@ def depthwise_conv2d_test():
                             input_shape  = (n, ic, ih, iw)
                             kernel_shape = (ic, ks, ks)
 
-                            print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='')
+                            print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='', flush=True)
 
                             input_n  = np.random.randint( 64, size=input_shape ).astype(dtype)
                             kernel_n = np.ones(shape=kernel_shape ).astype(dtype)
@@ -358,7 +381,7 @@ def warp_affine_test():
             H = np.random.randint(8, 64)
             W = np.random.randint(8, 64)
 
-            print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+            print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
 
             input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@@ -380,7 +403,7 @@ def remap_np_affine_test():
             H = np.random.randint(8, 64)
             W = np.random.randint(8, 64)
 
-            print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+            print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
 
             input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@@ -402,7 +425,7 @@ def remap_test():
             H = np.random.randint(8, 64)
             W = np.random.randint(8, 64)
 
-            print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+            print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
 
             input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@@ -422,7 +445,7 @@ def tile_test():
             shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
             tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1)
 
-            print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='')
+            print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             tiled_n = np.tile(val_n, tiles)
@@ -430,7 +453,7 @@ def tile_test():
             val_t = Tensor.from_value(val_n)
             tiled_t = op.tile(val_t, tiles)
 
-            print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='')
+            print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='', flush=True)
 
             if tiled_n.shape != tiled_t.shape:
                 raise Exception(f'shape is not equal')
@@ -448,7 +471,7 @@ def stack_test():
             axis = np.random.randint(shape_len+1)
             stack_count = np.random.randint(4)+1
 
-            print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='')
+            print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ]
             stack_n = np.stack(vals_n, axis)
@@ -456,7 +479,7 @@ def stack_test():
             vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ]
             stack_t = op.stack(vals_t, axis)
 
-            print(f'{stack_n.shape} == {stack_t.shape} ... ', end='')
+            print(f'{stack_n.shape} == {stack_t.shape} ... ', end='', flush=True)
 
             if stack_n.shape != stack_t.shape:
                 raise Exception('shape is not equal')
@@ -483,9 +506,9 @@ def reduce_test():
                     keepdims = np.random.randint(2) == 0
 
-                    print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='')
+                    print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='', flush=True)
 
-                    if dtype in [np.float16, np.float32, np.float64]:
+                    if dtype in [np.float16, np.float32]:
                         value_n = np.random.uniform(size=shape).astype(dtype)
                     else:
                         value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype )
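
The integer branch caps the random range at iinfo(dtype).max / prod(shape) so that even a full sum reduction cannot overflow the output dtype; a sketch of the bound:

    import numpy as np
    shape, dtype = (4, 8, 8), np.int32
    hi = max(1, int(np.iinfo(dtype).max / np.prod(shape)))
    value_n = np.random.randint(hi, size=shape, dtype=dtype)
    assert value_n.sum(dtype=np.int64) <= np.iinfo(dtype).max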
@@ -518,7 +541,7 @@ def InitRandomUniform_test():
         for shape_len in range(1, 5):
             shape = np.random.randint( 8, size=(shape_len,) )+1
 
-            print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='')
+            print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             Tensor(shape, dtype, initializer=InitRandomUniform()).np()
@@ -534,7 +557,7 @@ def InitCoords2DArange_test():
             w_start = np.random.randint(80)
             w_stop = w_start + np.random.randint(80)
 
-            print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='')
+            print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np()
@@ -551,17 +574,17 @@ def concat_test():
                                   for i,dim in enumerate(shape) )
                             for shape in ([shape] * count) )
 
-            print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='')
+            print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ]
             O_n = np.concatenate(V_n, axis)
 
-            print(f'{O_n.shape} == ', end='')
+            print(f'{O_n.shape} == ', end='', flush=True)
 
             V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ]
             O_t = op.concat(V_t, axis)
 
-            print(f'{O_t.shape} ... ', end='')
+            print(f'{O_t.shape} ... ', end='', flush=True)
 
             if O_n.shape != O_t.shape:
                 raise Exception('shape is not equal')
@@ -596,19 +619,19 @@ def matmul_test():
             A_shape = (BATCH, M, K)
             B_shape = (BATCH, K, N)
 
-            print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='')
+            print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype)
             B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype)
             O_n = np.matmul(A_n, B_n)
 
-            print(f'{O_n.shape} == ', end='')
+            print(f'{O_n.shape} == ', end='', flush=True)
 
             A_t = Tensor.from_value(A_n)
             B_t = Tensor.from_value(B_n)
             O_t = op.matmul(A_t, B_t)
 
-            print(f'{O_t.shape} ... ', end='')
+            print(f'{O_t.shape} ... ', end='', flush=True)
 
             if O_n.shape != O_t.shape:
                 raise Exception('shape is not equal')
@@ -659,17 +682,17 @@ def slice_test():
             shape = tuple(shape)
             slices = tuple(slices)
 
-            print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
+            print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             sliced_n = val_n[slices]
 
-            print(f'{sliced_n.shape} ... ', end='')
+            print(f'{sliced_n.shape} ... ', end='', flush=True)
 
             sliced_t = Tensor.from_value(val_n)[slices]
 
-            print(f'{sliced_t.shape} ... ', end='')
+            print(f'{sliced_t.shape} ... ', end='', flush=True)
 
             if 0 in sliced_n.shape:
                 # some cases like 0:1:-1 will produce zero shape and invalid array on numpy
@@ -694,17 +717,17 @@ def transpose_test():
             axes_order = np.array([*range(shape_len)])
             np.random.shuffle(axes_order)
 
-            print(f'transpose: {shape} {axes_order} ... ', end='')
+            print(f'transpose: {shape} {axes_order} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             transposed_n = np.transpose(val_n, axes_order)
 
-            print(f'{transposed_n.shape} ... ', end='')
+            print(f'{transposed_n.shape} ... ', end='', flush=True)
 
             val_t = Tensor.from_value(val_n)
             transposed_t = op.transpose (val_t, axes_order )
 
-            print(f'{transposed_t.shape} ... ', end='')
+            print(f'{transposed_t.shape} ... ', end='', flush=True)
 
             if transposed_n.shape != transposed_t.shape:
                 raise Exception('shape is not equal')
@@ -736,7 +759,7 @@ def any_wise_op_test():
                     shapes = shapes[::-1]
                 a_shape, b_shape = shapes
 
-                print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='')
+                print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='', flush=True)
 
                 a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
                 b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)

View file

@@ -109,6 +109,7 @@ class Tensor:
     def min(self, axes=None, keepdims=False) -> 'Tensor': ...
     def reshape(self, new_shape) -> 'Tensor': ...
     def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def std(self, axes=None, keepdims=False) -> 'Tensor': ...
     def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
 
     @property

View file

@@ -70,6 +70,7 @@ Tensor.mean = reduce_mean
 Tensor.min = reduce_min
 Tensor.reshape = reshape
 Tensor.sum = reduce_sum
+Tensor.std = reduce_std
 Tensor.transpose = transpose
 
 class TensorRef(Tensor):
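
These assignments are how the op module's free functions become Tensor methods, so the new reduce_std is reachable both ways; a sketch (assuming reduce_std's signature matches its reduce_* siblings):

    t = Tensor.from_value(np.random.rand(4, 8).astype(np.float32))
    a = op.reduce_std(t, axes=(-1,), keepdims=True)   # free function
    b = t.std(axes=(-1,), keepdims=True)              # bound method, same kernel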

View file

@@ -18,8 +18,7 @@ _np_dtype_to_cl = { np.uint8: CL.cl_uchar,
                     np.uint64: CL.cl_ulong,
                     np.int64: CL.cl_long,
                     np.float16: CL.cl_half,
-                    np.float32: CL.cl_float,
-                    np.float64: CL.cl_double }
+                    np.float32: CL.cl_float}
 
 _opencl_device_ids = None
 _default_device = None

View file

@@ -38,8 +38,6 @@ class InitRandomUniform(Initializer):
             gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
         elif tensor.dtype in [np.float16, np.float32]:
             gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}'
-        elif tensor.dtype in [np.float64]:
-            gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}'
 
         kernel = Kernel(kernel_text=f"""
 {HKernel.include_hash()}

View file

@@ -9,12 +9,13 @@ from .depthwise_conv2D import depthwise_conv2D
 from .gaussian_blur import gaussian_blur
 from .matmul import matmul, matmulc
 from .pad import pad
+from .rct import rct
 from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
                      reduce_sum, reduce_variance)
 from .remap import remap
 from .remap_np_affine import remap_np_affine
 from .reshape import reshape
-from .slice_ import slice_
+from .slice_ import slice_, split
 from .slice_set import slice_set
 from .stack import stack
 from .tile import tile
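
Judging by the test added in NTest, the new rct op is a masked color transfer over NCHW tensors (rct presumably stands for Reinhard color transfer); a usage sketch taken from that test, with illustrative shapes:

    source_t = Tensor(shape=(1,3,64,64), dtype=np.float32, initializer=InitRandomUniform())
    target_t = Tensor(shape=(1,3,64,64), dtype=np.float32, initializer=InitRandomUniform())
    mask_t   = Tensor(shape=(1,3,64,64), dtype=np.float32, initializer=InitRandomUniform())
    result_t = op.rct(target_t, source_t, target_mask_t=mask_t, source_mask_t=mask_t)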

View file

@@ -1,27 +1,31 @@
 import numpy as np
 
+from ..AAxes import AAxes
 from ..AShape import AShape
 from ..backend import Kernel
 from ..HArgs import HArgs
 from ..HKernel import HKernel
 from ..HType import HType
-from ..info import BroadcastInfo
+from ..info import BroadcastInfo, ReductionInfo
 from ..SCacheton import SCacheton
 from ..Tensor import Tensor
 
 def any_wise(op_text : str,
              *args,
+             dim_wise_axis : int = None,
              dtype : np.dtype = None,
             output_t:Tensor=None) -> Tensor:
     """
-    operator for N-wise ops with N inputs
+    elements-wise operator with N inputs
 
    arguments
 
        op_text     example: O=(2*I0*I1)+I2
 
        *args       List[ Tensor | number ]
 
+       dim_wise_axis(None)
+
        dtype
 
        output_t    compute result to this Tensor.
@@ -33,7 +37,7 @@ def any_wise(op_text : str,
     shape_list, dtype_list, krn_args = HArgs.decompose(args)
 
-    op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
+    op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dim_wise_axis, dtype, op_text)
 
     if output_t is None:
         output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
@@ -45,59 +49,60 @@ def any_wise(op_text : str,
     return output_t
 
 class _AnyWiseOp:
-    def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
+    def __init__(self, shape_list, dtype_list, dim_wise_axis, o_dtype, op_text : str):
         if len(shape_list) != len(dtype_list):
             raise ValueError('len(shape_list) != len(dtype_list)')
 
         self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
 
-        if len(shape_list) == 1:
-            # element-wise.
-            i_shape, i_dtype = shape_list[0], dtype_list[0]
-            self.o_shape = o_shape = i_shape
-
-            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
-{HKernel.define_tensor('O', o_shape, o_dtype)}
-{HKernel.define_tensor('IN', i_shape, i_dtype)}
-__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
-{{
-    size_t gid = get_global_id(0);
-    O_TYPE O = O_GLOBAL_LOAD(gid);
-    IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
-    {op_text};
-    O_GLOBAL_STORE(gid, O);
-}}
-""")
-        else:
-            # Multi arg.
         self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
         self.o_shape = o_shape = info.o_shape
 
+        g_shape = o_shape
+        if dim_wise_axis is not None:
+            dim_wise_axis = o_shape.check_axis(dim_wise_axis)
+            dim_wise_axis_size = o_shape[dim_wise_axis]
+            if dim_wise_axis_size > 16:
+                raise ValueError(f'dim_wise_axis size > 16: {dim_wise_axis_size}')
+            g_shape = ReductionInfo( o_shape, AAxes(dim_wise_axis), False ).o_shape
+
         defs, arg_defs, impls = [], [], []
         for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
             t_name = f'I{i}'
             if t_shape is not None:
                 defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
                 arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
-                impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
+                if dim_wise_axis is not None:
+                    for i_elem in range(dim_wise_axis_size):
+                        impls.append( f"{t_name}_TYPE {t_name}_{i_elem} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('G', g_shape.ndim, new_axis=(f'{i_elem}', dim_wise_axis) )}));")
+                else:
+                    impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('G', g_shape.ndim)}));")
             else:
                 arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
         defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
 
-        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+        if dim_wise_axis is not None:
+            o_def   = '\n'.join( f"O_TYPE O_{i_elem};" for i_elem in range(dim_wise_axis_size) )
+            o_store = '\n'.join( f"O_GLOBAL_STORE(O_IDX({HKernel.axes_seq_enum('G', g_shape.ndim, new_axis=(f'{i_elem}', dim_wise_axis) )}), O_{i_elem});" for i_elem in range(dim_wise_axis_size) )
+        else:
+            o_def   = 'O_TYPE O;'
+            o_store = 'O_GLOBAL_STORE(gid, O);'
+
+        self.forward_krn = Kernel(global_shape=(g_shape.size,), kernel_text=f"""
 {defs}
 {HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor_shape('G', g_shape)}
 __kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
 {{
     size_t gid = get_global_id(0);
-    {HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'G', g_shape.ndim)}
     {impls}
-    O_TYPE O;
+    {o_def}
     {op_text};
-    O_GLOBAL_STORE(gid, O);
+    {o_store}
 }}
 """)

View file

@@ -39,7 +39,7 @@ def cvt_color (input_t : Tensor, in_mode : str, out_mode : str, ch_axis=1, dtype
     return output_t
 
 _allowed_modes = ['RGB', 'BGR', 'XYZ', 'LAB']
-_allowed_dtypes = [np.float16, np.float32, np.float64]
+_allowed_dtypes = [np.float16, np.float32]
 
 class _CvtColor32Op():
     def __init__(self, i_shape : AShape, i_dtype, in_mode, o_dtype, out_mode, ch_axis):
@@ -100,54 +100,74 @@ class _CvtColor32Op():
         self.forward_krn = krn
 
     @staticmethod
-    def get_RGB_to_LAB_body(R,G,B,L,a,b,lab_type='') -> str:
+    def get_RGB_to_LAB_body(R,G,B,L,a,b, declare_out_type=False) -> str:
         return f"""
-{_CvtColor32Op.get_RGB_to_XYZ_body(R,G,B,'X','Y','Z', xyz_type='float')}
-{_CvtColor32Op.get_XYZ_to_LAB_body('X','Y','Z',L,a,b, lab_type=lab_type)}
+{_CvtColor32Op.get_sRGB_to_XYZ_body(R,G,B,'X','Y','Z', declare_out_type=True)}
+{_CvtColor32Op.get_XYZ_to_LAB_body('X','Y','Z',L,a,b, declare_out_type=declare_out_type)}
 """
     @staticmethod
-    def get_LAB_to_RGB_body(L,a,b,R,G,B,rgb_type='') -> str:
+    def get_LAB_to_RGB_body(L,a,b,R,G,B, declare_out_type=False) -> str:
         return f"""
-{_CvtColor32Op.get_LAB_to_XYZ_body(L,a,b,'X','Y','Z', xyz_type='float')}
-{_CvtColor32Op.get_XYZ_to_RGB_body('X','Y','Z',R,G,B,rgb_type=rgb_type)}
+{_CvtColor32Op.get_LAB_to_XYZ_body(L,a,b,'X','Y','Z', declare_out_type=True)}
+{_CvtColor32Op.get_XYZ_to_sRGB_body('X','Y','Z',R,G,B, declare_out_type=declare_out_type)}
 """
     @staticmethod
-    def get_RGB_to_XYZ_body(R,G,B,X,Y,Z,xyz_type='') -> str:
+    def get_sRGB_to_XYZ_body(R,G,B,X,Y,Z, declare_out_type=False) -> str:
         return f"""
-{xyz_type} {X} = fma(0.4124564, {R}, fma(0.3575761, {G}, 0.1804375*{B}));
-{xyz_type} {Y} = fma(0.2126729, {R}, fma(0.7151522, {G}, 0.0721750*{B}));
-{xyz_type} {Z} = fma(0.0193339, {R}, fma(0.1191920, {G}, 0.9503041*{B}));
+{R} = ({R} > 0.04045)*( pow( ({R}+0.055)/1.055, 2.4) ) + ({R} <= 0.04045)*({R} / 12.92);
+{G} = ({G} > 0.04045)*( pow( ({G}+0.055)/1.055, 2.4) ) + ({G} <= 0.04045)*({G} / 12.92);
+{B} = ({B} > 0.04045)*( pow( ({B}+0.055)/1.055, 2.4) ) + ({B} <= 0.04045)*({B} / 12.92);
+
+{_CvtColor32Op.get_RGB_to_XYZ_body(R,G,B,X,Y,Z,declare_out_type=declare_out_type) }
+"""
+    @staticmethod
+    def get_RGB_to_XYZ_body(R,G,B,X,Y,Z, declare_out_type=False) -> str:
+        return f"""
+{'float' if declare_out_type else ''} {X} = {R}*0.412453 + {G}*0.357580 + {B}*0.180423;
+{'float' if declare_out_type else ''} {Y} = {R}*0.212671 + {G}*0.715160 + {B}*0.072169;
+{'float' if declare_out_type else ''} {Z} = {R}*0.019334 + {G}*0.119193 + {B}*0.950227;
+"""
+    @staticmethod
+    def get_XYZ_to_sRGB_body(X,Y,Z,R,G,B, declare_out_type=False) -> str:
+        return f"""
+{_CvtColor32Op.get_XYZ_to_RGB_body(X,Y,Z,R,G,B,declare_out_type=declare_out_type) }
+{R} = ({R} > 0.0031308)*( 1.055*pow({R},1.0/2.4)-0.055 ) + ({R} <= 0.0031308)*({R} * 12.92);
+{G} = ({G} > 0.0031308)*( 1.055*pow({G},1.0/2.4)-0.055 ) + ({G} <= 0.0031308)*({G} * 12.92);
+{B} = ({B} > 0.0031308)*( 1.055*pow({B},1.0/2.4)-0.055 ) + ({B} <= 0.0031308)*({B} * 12.92);
 """
     @staticmethod
-    def get_XYZ_to_RGB_body(X,Y,Z,R,G,B,rgb_type='') -> str:
+    def get_XYZ_to_RGB_body(X,Y,Z,R,G,B, declare_out_type=False) -> str:
         return f"""
-{rgb_type} {R} = fma( 3.2404542, {X}, fma(-1.5371385, {Y}, -0.4985314*{Z}));
-{rgb_type} {G} = fma(-0.9692660, {X}, fma( 1.8760108, {Y},  0.0415560*{Z}));
-{rgb_type} {B} = fma( 0.0556434, {X}, fma(-0.2040259, {Y},  1.0572252*{Z}));
+{'float' if declare_out_type else ''} {R} = clamp( {X}* 3.240479 + {Y}*-1.53715  + {Z}*-0.498535, 0.0, 1.0 );
+{'float' if declare_out_type else ''} {G} = clamp( {X}*-0.969256 + {Y}* 1.875991 + {Z}* 0.041556, 0.0, 1.0 );
+{'float' if declare_out_type else ''} {B} = clamp( {X}* 0.055648 + {Y}*-0.204043 + {Z}* 1.057311, 0.0, 1.0 );
 """
     @staticmethod
-    def get_RGB_to_BGR_body(R,G,B,b,g,r,bgr_type='') -> str:
+    def get_RGB_to_BGR_body(R,G,B,b,g,r, declare_out_type=False) -> str:
         return f"""
-{bgr_type} {b} = {R};
-{bgr_type} {g} = {G};
-{bgr_type} {r} = {B};
+{'float' if declare_out_type else ''} {b} = {R};
+{'float' if declare_out_type else ''} {g} = {G};
+{'float' if declare_out_type else ''} {r} = {B};
 """
     @staticmethod
-    def get_BGR_to_RGB_body(B,G,R,r,g,b,rgb_type='') -> str:
+    def get_BGR_to_RGB_body(B,G,R,r,g,b, declare_out_type=False) -> str:
         return f"""
-{rgb_type} {r} = {B};
-{rgb_type} {g} = {G};
-{rgb_type} {b} = {R};
+{'float' if declare_out_type else ''} {r} = {B};
+{'float' if declare_out_type else ''} {g} = {G};
+{'float' if declare_out_type else ''} {b} = {R};
 """
     @staticmethod
-    def get_XYZ_to_LAB_body(X,Y,Z,L,A,B,lab_type='') -> str:
+    def get_XYZ_to_LAB_body(X,Y,Z,L,A,B, declare_out_type=False) -> str:
         beta3 = '((6.0/29.0)*(6.0/29.0)*(6.0/29.0))'
-        xyz_xn = '(0.9556)'
+        xyz_xn = '(0.950456)'
         xyz_zn = '(1.088754)'
 
         return f"""
 {X} /= {xyz_xn};
@@ -157,20 +177,20 @@ class _CvtColor32Op():
 {Y} = ({Y} > {beta3})*rootn({Y}, 3) + ({Y} <= {beta3})*(7.787*{Y}+4.0/29.0);
 {Z} = ({Z} > {beta3})*rootn({Z}, 3) + ({Z} <= {beta3})*(7.787*{Z}+4.0/29.0);
 
-{lab_type} {L} = 116.0*{Y}-16.0;
-{lab_type} {A} = 500.0*({X}-{Y});
-{lab_type} {B} = 200.0*({Y}-{Z});
+{'float' if declare_out_type else ''} {L} = 116.0*{Y}-16.0;
+{'float' if declare_out_type else ''} {A} = 500.0*({X}-{Y});
+{'float' if declare_out_type else ''} {B} = 200.0*({Y}-{Z});
 """
     @staticmethod
-    def get_LAB_to_XYZ_body(L,A,B,X,Y,Z,xyz_type='') -> str:
+    def get_LAB_to_XYZ_body(L,A,B,X,Y,Z, declare_out_type=False) -> str:
         beta  = '(6.0/29.0)'
         beta2 = '((6.0/29.0)*(6.0/29.0))'
-        xyz_xn = '(0.9556)'
+        xyz_xn = '(0.950456)'
         xyz_zn = '(1.088754)'
 
         return f"""
-{xyz_type} {Y} = ({L} + 16.0) / 116.0;
-{xyz_type} {X} = {Y} + {A} / 500.0;
-{xyz_type} {Z} = {Y} - {B} / 200.0;
+{'float' if declare_out_type else ''} {Y} = ({L} + 16.0) / 116.0;
+{'float' if declare_out_type else ''} {X} = {Y} + {A} / 500.0;
+{'float' if declare_out_type else ''} {Z} = {Y} - {B} / 200.0;
 {Y} = ({Y} > {beta})*({Y}*{Y}*{Y}) + ({Y} <= {beta})*({Y}-16.0/116.0)*3*{beta2};
 {X} = ({X} > {beta})*({X}*{X}*{X}*{xyz_xn}) + ({X} <= {beta})*({X}-16.0/116.0)*3*{beta2}*{xyz_xn};
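
The switch from a bare linear-RGB matrix to get_sRGB_to_XYZ_body adds the standard sRGB gamma expansion before the matrix (and the inverse compression after it), with OpenCV-style matrix constants; a numpy sketch of just the transfer function:

    import numpy as np

    def srgb_to_linear(c):
        # piecewise gamma expansion, mirroring the new kernel body
        return np.where(c > 0.04045, ((c + 0.055) / 1.055) ** 2.4, c / 12.92)

    def linear_to_srgb(c):
        return np.where(c > 0.0031308, 1.055 * c ** (1.0 / 2.4) - 0.055, c * 12.92)

    c = np.linspace(0, 1, 5, dtype=np.float32)
    assert np.allclose(linear_to_srgb(srgb_to_linear(c)), c, atol=1e-5)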

View file

@@ -58,7 +58,7 @@ def reduce_variance(input_t, axes=None, keepdims=False):
     mean = reduce_mean(input_t, axes, keepdims=True)
     return reduce_mean(square(input_t - mean), axes, keepdims)
 
-def moments(input_t, axes=None, keepdims=False):
+def moments(input_t, axes=None):
     """
     Returns (mean, variance) of input_t
@@ -68,11 +68,9 @@ def moments(input_t, axes=None, keepdims=False):
         Iterable of ints.
         None - all axes
 
-        keepdims(False)     keep reduced axes
     """
-    mean = reduce_mean(input_t, axes, keepdims)
-    mean_shape_keepdims = mean._op.info.o_shape_kd
-    var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
+    mean = reduce_mean(input_t, axes, True)
+    var  = reduce_mean(square(input_t - mean), axes, True)
 
     return mean, var
 
 def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
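
The simplified moments body leans on keepdims=True: the mean keeps its reduced axes, so the subtraction broadcasts directly and the old reshape dance is unnecessary, at the cost of always returning keepdims-shaped results. The numpy equivalent:

    import numpy as np
    x = np.random.rand(4, 3).astype(np.float32)
    mean = x.mean(axis=1, keepdims=True)                  # (4, 1), broadcasts against x
    var  = ((x - mean) ** 2).mean(axis=1, keepdims=True)  # (4, 1)
    assert np.allclose(var, x.var(axis=1, keepdims=True), atol=1e-6)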

View file

@@ -1,6 +1,9 @@
+from typing import List
+
 import numpy as np
 
 from ..AShape import AShape
+from ..AAxes import AAxes
 from ..backend import Kernel
 from ..HKernel import HKernel
 from ..HType import HType
@@ -9,6 +12,29 @@ from ..SCacheton import SCacheton
 from ..Tensor import Tensor
 
+def split(input_t : Tensor, axis, keepdims=False) -> List[Tensor]:
+    """
+    arguments
+
+        input_t     Tensor
+
+        axis
+    """
+    shape = input_t.shape
+
+    result = []
+    for i in range(shape[axis]):
+        slices = [slice(None, None, None)]*shape.ndim
+        slices[axis] = i if not keepdims else slice(i,i+1,1)
+        result.append( slice_(input_t, slices) )
+
+    return result
+
 def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
     """
     arguments:
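
A quick sketch of the new split semantics (tensor t and its shape are hypothetical): split slices the given axis once per element, dropping it unless keepdims is set.

    # t has shape (1, 3, 8, 8); splitting on the channel axis
    r, g, b = split(t, axis=1)                  # each of shape (1, 8, 8)
    r, g, b = split(t, axis=1, keepdims=True)   # each of shape (1, 1, 8, 8)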