update xlib.avecl

iperov 2021-10-20 18:02:50 +04:00
commit 6da916cc66
14 changed files with 246 additions and 184 deletions

View file

@ -3,6 +3,7 @@ AveCL ! Make OpenCL great again.
Lightweight ndarray library written in pure Python, using OpenCL 1.2.
Suitable for high-performance, general-purpose n-dim array computation on any device that supports OpenCL 1.2.
Supports any dtype except float64.
Works with Python 3.5+. The only dependency is numpy.
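
A minimal usage sketch (hypothetical, inferred from the test code later in this commit; the exact import path and device setup may differ):

import numpy as np
from xlib.avecl import Tensor, op   # assumed package root, as used by NTest below

# upload two integer-valued float32 arrays and multiply them on the device
a_n = np.random.randint(2**4, size=(2, 4, 3)).astype(np.float32)
b_n = np.random.randint(2**4, size=(2, 3, 5)).astype(np.float32)
a_t = Tensor.from_value(a_n)
b_t = Tensor.from_value(b_n)
o_t = op.matmul(a_t, b_t)           # batched matmul, runs as an OpenCL kernel

assert np.allclose(o_t.np(), np.matmul(a_n, b_n))   # .np() reads the result back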

View file

@ -15,7 +15,7 @@ class AShape(Iterable):
shape AShape
Iterable
AShape cannot be a scalar shape, thus the minimal AShape is (1,)
may raise ValueError during construction
@ -50,13 +50,26 @@ class AShape(Iterable):
self.size = size
else:
raise ValueError('Invalid type to create AShape')
def copy(self) -> 'AShape':
return AShape(self)
def as_list(self) -> List[int]:
return list(self.shape)
def check_axis(self, axis : int) -> int:
"""
Checks the axis and returns the normalized axis value.
May raise ValueError.
"""
if axis < 0:
axis += self.ndim
if axis < 0 or axis >= self.ndim:
raise ValueError(f'axis {axis} out of bound of ndim {self.ndim}')
return axis
def axes_arange(self) -> AAxes:
"""
Returns a tuple of axes as an arange.
@ -64,7 +77,7 @@ class AShape(Iterable):
Example (0,1,2) for ndim 3
"""
return AAxes(range(self.ndim))
def replaced_axes(self, axes, dims) -> 'AShape':
"""
returns a new AShape where the given axes are replaced with new dims
@ -76,22 +89,22 @@ class AShape(Iterable):
axis = ndim + axis
if axis < 0 or axis >= ndim:
raise ValueError(f'invalid axis value {axis}')
new_shape[axis] = dim
return AShape(new_shape)
def split(self, axis) -> Tuple['AShape', 'AShape']:
"""
splits the AShape at the specified axis
returns two AShapes: before the axis (exclusive) and from the axis onward (inclusive)
"""
if axis < 0:
axis = self.ndim + axis
if axis < 0 or axis >= self.ndim:
raise ValueError(f'invalid axis value {axis}')
return self[:axis], self[axis:]
def transpose_by_axes(self, axes) -> 'AShape':
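
A short sketch of the AShape helpers shown above (a hypothetical illustration; behavior traced from the code in this hunk):

s = AShape((2, 3, 4))

s.check_axis(-1)        # -> 2, negative axes are normalized
# s.check_axis(3)       # -> ValueError: axis 3 out of bound of ndim 3

before, after = s.split(1)
# tuple(before) == (2,) ; tuple(after) == (3, 4)

tuple(s.axes_arange())  # -> (0, 1, 2), i.e. AAxes(range(ndim))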

View file

@ -15,12 +15,9 @@ class HKernel:
np.int64 : 'long',
np.uint64 : 'ulong',
np.float16 : 'half',
np.float32 : 'float',
np.float64 : 'double'
np.float32 : 'float'
}
@staticmethod
def np_dtype_to_cl(dtype : np.dtype):
"""
@ -134,30 +131,33 @@ class HKernel:
out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
if dtype in [np.float32, np.float64]:
if dtype in [np.float32]:
out += [f'#define {name_upper}_TO_FLOATX(x) x']
elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
elif dtype in [np.int64,np.uint64]:
out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)']
return '\n'.join(out)
@staticmethod
def define_ndim_idx(ndim):
"""
defines macros that calculate a flat index for an n-dim shape
example for ndim=3
#define NDIM3_IDX(t0,t1,t2,T0,T1,T2) (((size_t)(t0))*T1*T2+((size_t)(t1))*T2+((size_t)(t2)))
#define NDIM3_IDX_MOD(t0,t1,t2,T0,T1,T2) (((size_t)(t0) % T0)*T1*T2+((size_t)(t1) % T1)*T2+((size_t)(t2) % T2))
"""
out = [f'#define NDIM{ndim}_IDX(' + \
','.join([f't{i}' for i in range(ndim)] + [f'T{i}' for i in range(ndim)]) + \
') (' + '+'.join([f'((size_t)(t{i}))' + ''.join(f'*T{j}' for j in range(i+1,ndim)) for i in range(ndim) ]) + ')']
out +=[f'#define NDIM{ndim}_IDX_MOD(' + \
','.join([f't{i}' for i in range(ndim)] + [f'T{i}' for i in range(ndim)]) + \
') (' + '+'.join([f'((size_t)(t{i}) % T{i})' + ''.join(f'*T{j}' for j in range(i+1,ndim)) for i in range(ndim) ]) + ')']
return '\n'.join(out)
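
For reference, tracing HKernel.define_ndim_idx(2) through the generator above yields (a worked example, not part of the diff):

print(HKernel.define_ndim_idx(2))
# #define NDIM2_IDX(t0,t1,T0,T1) (((size_t)(t0))*T1+((size_t)(t1)))
# #define NDIM2_IDX_MOD(t0,t1,T0,T1) (((size_t)(t0) % T0)*T1+((size_t)(t1) % T1))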
@staticmethod
@ -165,14 +165,14 @@ class HKernel:
"""
Returns definitions for operations with a tensor shape
example for 'O', (7,3),
example for 'O', (2,3),
#define O0 7
#define O0 2
#define O1 3
#define Om1 3
#define Om2 7
#define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
#define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
#define Om2 2
#define O_IDX(o0,o1) (((size_t)(o0))*3+((size_t)(o1)))
#define O_IDX_MOD(o0,o1) (((size_t)(o0) % 2)*3+((size_t)(o1) % 3))
"""
shape = tuple(shape)
ndim = len(shape)
@ -183,36 +183,14 @@ class HKernel:
axes_symbols = "".join([str(i) for i in range(ndim)])
axes_symbols = axes_symbols.upper()
out = []
for i in range(ndim):
out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}']
out = [f'#define {name_upper}{axes_symbols[i]} {shape[i]}' for i in range(ndim)]
out += [f'#define {name_upper}m{i} {shape[-i]}' for i in range(1,ndim+1)]
for i in range(1,ndim+1):
out += [f'#define {name_upper}m{i} {shape[-i]}']
out += [f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) (' + \
'+'.join([f'((size_t)({name_lower}{i}))' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']
line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) '
for i in range(ndim):
line += f'( (size_t)({name_lower}{i}) )'
for j in range(i+1,ndim):
line += f'*{shape[j]} '
if i != ndim-1:
line += '+'
out += [line]
line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) '
for i in range(ndim):
line += f'( (size_t)({name_lower}{i}) % {shape[i]} )'
for j in range(i+1,ndim):
line += f'*{shape[j]} '
if i != ndim-1:
line += '+'
out += [line,'']
out += [f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) (' + \
'+'.join([f'((size_t)({name_lower}{i}) % {shape[i]})' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']
return '\n'.join(out)
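
The refactor above replaces the hand-rolled string loops with comprehensions; for the docstring example ('O', (2,3)) the emitted text should be unchanged. A quick sanity check (assuming the method shown is HKernel.define_tensor_shape, the name used elsewhere in this commit):

text = HKernel.define_tensor_shape('O', (2, 3))
assert '#define O_IDX(o0,o1) (((size_t)(o0))*3+((size_t)(o1)))' in text
assert '#define O_IDX_MOD(o0,o1) (((size_t)(o0) % 2)*3+((size_t)(o1) % 3))' in text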

View file

@ -3,10 +3,10 @@ from typing import Iterable, List
import numpy as np
scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
np.float16, np.float32, np.float64, np.bool_]
np.float16, np.float32, np.bool_]
np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
np.float16, np.float32, np.float64, np.bool_]
np.float16, np.float32, np.bool_]
_np_dtype_to_cl = {
np.bool_ : 'bool',
@ -20,7 +20,6 @@ _np_dtype_to_cl = {
np.int64 : 'long',
np.float16 : 'half',
np.float32 : 'float',
np.float64 : 'double',
}
_np_dtype_weight = {
@ -34,8 +33,7 @@ _np_dtype_weight = {
np.uint64 : 8,
np.int64 : 9,
np.float16 : 10,
np.float32 : 11,
np.float64 : 12,
np.float32 : 11
}
class HType:

View file

@ -1,14 +1,13 @@
import traceback
import numpy as np
from .HType import HType
from .NCore import NCore
from .backend import get_device, get_default_device, set_default_device
from .Tensor import Tensor
from . import op
from .initializer import InitRandomUniform, InitCoords2DArange
from .backend import get_default_device, get_device, set_default_device
from .HType import HType
from .info import Conv2DInfo
from .initializer import InitCoords2DArange, InitRandomUniform
from .NCore import NCore
from .Tensor import Tensor
class NTest():
@ -45,6 +44,7 @@ class NTest():
binary_dilate_circle_test,
binary_morph_test,
cvt_color_test,
rct_test,
]
for test_func in test_funcs:
@ -62,18 +62,39 @@ class NTest():
def _all_close(x,y, atol=1, btol=1):
return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol )
def rct_test():
for _ in range(10):
for dtype in [np.float16, np.float32]:
base_shape = list(np.random.randint(1, 8, size=4) )
shape = base_shape.copy()
shape[1] = 3
mask_shape = base_shape.copy()
mask_shape[1] = 3
print(f'rct {shape} {str(np.dtype(dtype).name)} ... ', end='', flush=True)
source_t = Tensor(shape=shape, dtype=dtype, initializer=InitRandomUniform())
target_t = Tensor(shape=shape, dtype=dtype, initializer=InitRandomUniform())
mask_t = Tensor(shape=mask_shape, dtype=dtype, initializer=InitRandomUniform())
result_t = op.rct(target_t, source_t, target_mask_t=mask_t, source_mask_t=mask_t )
print('pass')
def cvt_color_test():
for _ in range(10):
for shape_len in range(2,6):
for in_mode in ['RGB','BGR','XYZ','LAB']:
for out_mode in ['RGB','BGR','XYZ','LAB']:
for dtype in [np.float16, np.float32, np.float64]:
for dtype in [np.float16, np.float32]:
shape = list(np.random.randint(1, 8, size=shape_len) )
ch_axis = np.random.randint(len(shape))
shape[ch_axis] = 3
print(f'cvt_color {shape} {str(np.dtype(dtype).name)} {in_mode}->{out_mode} ... ', end='')
print(f'cvt_color {shape} {str(np.dtype(dtype).name)} {in_mode}->{out_mode} ... ', end='', flush=True)
inp_n = np.random.uniform(size=shape ).astype(dtype)
inp_t = Tensor.from_value(inp_n)
@ -81,7 +102,9 @@ def cvt_color_test():
out_t = op.cvt_color(inp_t, in_mode=in_mode, out_mode=out_mode, ch_axis=ch_axis)
inp_t2 = op.cvt_color(out_t, in_mode=out_mode, out_mode=in_mode, ch_axis=ch_axis)
if not _all_close(inp_t.np(), inp_t2.np(), atol=0.1, btol=0.1):
is_check = in_mode in ['RGB','BGR','XYZ'] and out_mode in ['XYZ','LAB']
if is_check and not _all_close(inp_t.np(), inp_t2.np(), atol=0.1, btol=0.1):
raise Exception(f'data is not equal')
print('pass')
@ -91,7 +114,7 @@ def cast_test():
for out_dtype in HType.get_np_scalar_types():
shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) )
print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='')
print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='', flush=True)
val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype)
cast_n = val_n.astype(out_dtype)
@ -113,7 +136,7 @@ def binary_morph_test():
input_n = np.random.randint( 2, size=shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='')
print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='', flush=True)
op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True)
@ -130,7 +153,7 @@ def binary_erode_circle_test():
input_n = np.random.randint( 2, size=shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='', flush=True)
op.binary_erode_circle(input_t, radius=radius, iterations=iterations)
@ -147,7 +170,7 @@ def binary_dilate_circle_test():
input_n = np.random.randint( 2, size=shape ).astype(dtype)
input_t = Tensor.from_value(input_n)
print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='', flush=True)
op.binary_dilate_circle(input_t, radius=radius, iterations=iterations)
@ -156,11 +179,11 @@ def binary_dilate_circle_test():
def gaussian_blur_test():
for shape_len in range(2,5):
for dtype in [np.float16, np.float32, np.float64]:
for dtype in [np.float16, np.float32]:
shape = np.random.randint( 1, 64, size=(shape_len,) )
sigma = np.random.rand() * 10
print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='')
print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='', flush=True)
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
val_t = Tensor.from_value(val_n)
@ -179,7 +202,7 @@ def pad_test():
paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) )
print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='')
print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='', flush=True)
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
pad_n = np.pad(val_n, paddings, mode=mode)
@ -187,7 +210,7 @@ def pad_test():
val_t = Tensor.from_value(val_n)
pad_t = op.pad(val_t, paddings, mode=mode)
print(f'{pad_n.shape} == {pad_t.shape} ... ', end='')
print(f'{pad_n.shape} == {pad_t.shape} ... ', end='', flush=True)
if pad_n.shape != pad_t.shape:
raise Exception(f'shape is not equal')
@ -241,7 +264,7 @@ def slice_set_test():
shape = tuple(shape)
slices = tuple(slices)
print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='', flush=True)
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
val_t = Tensor.from_value(val_n)
@ -330,7 +353,7 @@ def depthwise_conv2d_test():
input_shape = (n, ic, ih, iw)
kernel_shape = (ic, ks, ks)
print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='')
print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='', flush=True)
input_n = np.random.randint( 64, size=input_shape ).astype(dtype)
kernel_n = np.ones(shape=kernel_shape ).astype(dtype)
@ -358,7 +381,7 @@ def warp_affine_test():
H = np.random.randint(8, 64)
W = np.random.randint(8, 64)
print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@ -380,7 +403,7 @@ def remap_np_affine_test():
H = np.random.randint(8, 64)
W = np.random.randint(8, 64)
print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@ -402,7 +425,7 @@ def remap_test():
H = np.random.randint(8, 64)
W = np.random.randint(8, 64)
print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
input_t = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@ -422,7 +445,7 @@ def tile_test():
shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1)
print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='')
print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='', flush=True)
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
tiled_n = np.tile(val_n, tiles)
@ -430,7 +453,7 @@ def tile_test():
val_t = Tensor.from_value(val_n)
tiled_t = op.tile(val_t, tiles)
print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='')
print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='', flush=True)
if tiled_n.shape != tiled_t.shape:
raise Exception(f'shape is not equal')
@ -448,7 +471,7 @@ def stack_test():
axis = np.random.randint(shape_len+1)
stack_count = np.random.randint(4)+1
print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='')
print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='', flush=True)
vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ]
stack_n = np.stack(vals_n, axis)
@ -456,7 +479,7 @@ def stack_test():
vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ]
stack_t = op.stack(vals_t, axis)
print(f'{stack_n.shape} == {stack_t.shape} ... ', end='')
print(f'{stack_n.shape} == {stack_t.shape} ... ', end='', flush=True)
if stack_n.shape != stack_t.shape:
raise Exception('shape is not equal')
@ -483,9 +506,9 @@ def reduce_test():
keepdims = np.random.randint(2) == 0
print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='')
print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='', flush=True)
if dtype in [np.float16, np.float32, np.float64]:
if dtype in [np.float16, np.float32]:
value_n = np.random.uniform(size=shape).astype(dtype)
else:
value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype )
@ -518,7 +541,7 @@ def InitRandomUniform_test():
for shape_len in range(1, 5):
shape = np.random.randint( 8, size=(shape_len,) )+1
print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='')
print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='', flush=True)
Tensor(shape, dtype, initializer=InitRandomUniform()).np()
@ -534,7 +557,7 @@ def InitCoords2DArange_test():
w_start = np.random.randint(80)
w_stop = w_start + np.random.randint(80)
print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='')
print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='', flush=True)
Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np()
@ -551,17 +574,17 @@ def concat_test():
for i,dim in enumerate(shape) )
for shape in ([shape] * count) )
print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='')
print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='', flush=True)
V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ]
O_n = np.concatenate(V_n, axis)
print(f'{O_n.shape} == ', end='')
print(f'{O_n.shape} == ', end='', flush=True)
V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ]
O_t = op.concat(V_t, axis)
print(f'{O_t.shape} ... ', end='')
print(f'{O_t.shape} ... ', end='', flush=True)
if O_n.shape != O_t.shape:
raise Exception('shape is not equal')
@ -596,19 +619,19 @@ def matmul_test():
A_shape = (BATCH, M, K)
B_shape = (BATCH, K, N)
print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='')
print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='', flush=True)
A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype)
B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype)
O_n = np.matmul(A_n, B_n)
print(f'{O_n.shape} == ', end='')
print(f'{O_n.shape} == ', end='', flush=True)
A_t = Tensor.from_value(A_n)
B_t = Tensor.from_value(B_n)
O_t = op.matmul(A_t, B_t)
print(f'{O_t.shape} ... ', end='')
print(f'{O_t.shape} ... ', end='', flush=True)
if O_n.shape != O_t.shape:
raise Exception('shape is not equal')
@ -659,17 +682,17 @@ def slice_test():
shape = tuple(shape)
slices = tuple(slices)
print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='', flush=True)
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
sliced_n = val_n[slices]
print(f'{sliced_n.shape} ... ', end='')
print(f'{sliced_n.shape} ... ', end='', flush=True)
sliced_t = Tensor.from_value(val_n)[slices]
print(f'{sliced_t.shape} ... ', end='')
print(f'{sliced_t.shape} ... ', end='', flush=True)
if 0 in sliced_n.shape:
# some cases like 0:1:-1 produce a zero shape and an invalid array in numpy
@ -694,17 +717,17 @@ def transpose_test():
axes_order = np.array([*range(shape_len)])
np.random.shuffle(axes_order)
print(f'transpose: {shape} {axes_order} ... ', end='')
print(f'transpose: {shape} {axes_order} ... ', end='', flush=True)
val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
transposed_n = np.transpose(val_n, axes_order)
print(f'{transposed_n.shape} ... ', end='')
print(f'{transposed_n.shape} ... ', end='', flush=True)
val_t = Tensor.from_value(val_n)
transposed_t = op.transpose (val_t, axes_order )
print(f'{transposed_t.shape} ... ', end='')
print(f'{transposed_t.shape} ... ', end='', flush=True)
if transposed_n.shape != transposed_t.shape:
raise Exception('shape is not equal')
@ -736,7 +759,7 @@ def any_wise_op_test():
shapes = shapes[::-1]
a_shape, b_shape = shapes
print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='')
print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='', flush=True)
a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)

View file

@ -109,6 +109,7 @@ class Tensor:
def min(self, axes=None, keepdims=False) -> 'Tensor': ...
def reshape(self, new_shape) -> 'Tensor': ...
def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
def std(self, axes=None, keepdims=False) -> 'Tensor': ...
def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
@property

View file

@ -70,6 +70,7 @@ Tensor.mean = reduce_mean
Tensor.min = reduce_min
Tensor.reshape = reshape
Tensor.sum = reduce_sum
Tensor.std = reduce_std
Tensor.transpose = transpose
class TensorRef(Tensor):

View file

@ -18,8 +18,7 @@ _np_dtype_to_cl = { np.uint8: CL.cl_uchar,
np.uint64: CL.cl_ulong,
np.int64: CL.cl_long,
np.float16: CL.cl_half,
np.float32: CL.cl_float,
np.float64: CL.cl_double }
np.float32: CL.cl_float}
_opencl_device_ids = None
_default_device = None

View file

@ -38,8 +38,6 @@ class InitRandomUniform(Initializer):
gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
elif tensor.dtype in [np.float16, np.float32]:
gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}'
elif tensor.dtype in [np.float64]:
gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}'
kernel = Kernel(kernel_text=f"""
{HKernel.include_hash()}

View file

@ -9,12 +9,13 @@ from .depthwise_conv2D import depthwise_conv2D
from .gaussian_blur import gaussian_blur
from .matmul import matmul, matmulc
from .pad import pad
from .rct import rct
from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
reduce_sum, reduce_variance)
from .remap import remap
from .remap_np_affine import remap_np_affine
from .reshape import reshape
from .slice_ import slice_
from .slice_ import slice_, split
from .slice_set import slice_set
from .stack import stack
from .tile import tile

View file

@ -1,27 +1,31 @@
import numpy as np
from ..AAxes import AAxes
from ..AShape import AShape
from ..backend import Kernel
from ..HArgs import HArgs
from ..HKernel import HKernel
from ..HType import HType
from ..info import BroadcastInfo
from ..info import BroadcastInfo, ReductionInfo
from ..SCacheton import SCacheton
from ..Tensor import Tensor
def any_wise(op_text : str,
*args,
dim_wise_axis : int = None,
dtype : np.dtype = None,
output_t:Tensor=None) -> Tensor:
"""
operator for N-wise ops with N inputs
element-wise operator with N inputs
arguments
op_text              example: O=(2*I0*I1)+I2
*args                List[ Tensor | number ]
dim_wise_axis(None)  axis whose elements are exposed as separate variables in op_text (axis size must be <= 16)
dtype
output_t             write the result to this Tensor.
@ -33,7 +37,7 @@ def any_wise(op_text : str,
shape_list, dtype_list, krn_args = HArgs.decompose(args)
op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dim_wise_axis, dtype, op_text)
if output_t is None:
output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
@ -45,59 +49,60 @@ def any_wise(op_text : str,
return output_t
class _AnyWiseOp:
def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
def __init__(self, shape_list, dtype_list, dim_wise_axis, o_dtype, op_text : str):
if len(shape_list) != len(dtype_list):
raise ValueError('len(shape_list) != len(dtype_list)')
self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
self.o_shape = o_shape = info.o_shape
if len(shape_list) == 1:
# element-wise.
i_shape, i_dtype = shape_list[0], dtype_list[0]
self.o_shape = o_shape = i_shape
g_shape = o_shape
if dim_wise_axis is not None:
dim_wise_axis = o_shape.check_axis(dim_wise_axis)
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor('IN', i_shape, i_dtype)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
{{
size_t gid = get_global_id(0);
dim_wise_axis_size = o_shape[dim_wise_axis]
if dim_wise_axis_size > 16:
raise ValueError(f'dim_wise_axis size > 16: {dim_wise_axis_size}')
O_TYPE O = O_GLOBAL_LOAD(gid);
IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
{op_text};
O_GLOBAL_STORE(gid, O);
}}
""")
else:
# Multi arg.
self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
g_shape = ReductionInfo( o_shape, AAxes(dim_wise_axis), False ).o_shape
self.o_shape = o_shape = info.o_shape
defs, arg_defs, impls = [], [], []
for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
t_name = f'I{i}'
if t_shape is not None:
defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
defs, arg_defs, impls = [], [], []
for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
t_name = f'I{i}'
if t_shape is not None:
defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
if dim_wise_axis is not None:
for i_elem in range(dim_wise_axis_size):
impls.append( f"{t_name}_TYPE {t_name}_{i_elem} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('G', g_shape.ndim, new_axis=(f'{i_elem}', dim_wise_axis) )}));")
else:
arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('G', g_shape.ndim)}));")
else:
arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
if dim_wise_axis is not None:
o_def = '\n'.join( f"O_TYPE O_{i_elem};" for i_elem in range(dim_wise_axis_size) )
o_store = '\n'.join( f"O_GLOBAL_STORE(O_IDX({HKernel.axes_seq_enum('G', g_shape.ndim, new_axis=(f'{i_elem}', dim_wise_axis) )}), O_{i_elem});" for i_elem in range(dim_wise_axis_size) )
else:
o_def = 'O_TYPE O;'
o_store = 'O_GLOBAL_STORE(gid, O);'
self.forward_krn = Kernel(global_shape=(g_shape.size,), kernel_text=f"""
{defs}
{HKernel.define_tensor('O', o_shape, o_dtype)}
{HKernel.define_tensor_shape('G', g_shape)}
__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs})
{{
size_t gid = get_global_id(0);
{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)}
{HKernel.decompose_idx_to_axes_idxs('gid', 'G', g_shape.ndim)}
{impls}
O_TYPE O;
{o_def}
{op_text};
O_GLOBAL_STORE(gid, O);
{o_store}
}}
""")

View file

@ -39,7 +39,7 @@ def cvt_color (input_t : Tensor, in_mode : str, out_mode : str, ch_axis=1, dtype
return output_t
_allowed_modes = ['RGB', 'BGR', 'XYZ', 'LAB']
_allowed_dtypes = [np.float16, np.float32, np.float64]
_allowed_dtypes = [np.float16, np.float32]
class _CvtColor32Op():
def __init__(self, i_shape : AShape, i_dtype, in_mode, o_dtype, out_mode, ch_axis):
@ -100,54 +100,74 @@ class _CvtColor32Op():
self.forward_krn = krn
@staticmethod
def get_RGB_to_LAB_body(R,G,B,L,a,b,lab_type='') -> str:
def get_RGB_to_LAB_body(R,G,B,L,a,b, declare_out_type=False) -> str:
return f"""
{_CvtColor32Op.get_RGB_to_XYZ_body(R,G,B,'X','Y','Z', xyz_type='float')}
{_CvtColor32Op.get_XYZ_to_LAB_body('X','Y','Z',L,a,b, lab_type=lab_type)}
{_CvtColor32Op.get_sRGB_to_XYZ_body(R,G,B,'X','Y','Z', declare_out_type=True)}
{_CvtColor32Op.get_XYZ_to_LAB_body('X','Y','Z',L,a,b, declare_out_type=declare_out_type)}
"""
@staticmethod
def get_LAB_to_RGB_body(L,a,b,R,G,B,rgb_type='') -> str:
def get_LAB_to_RGB_body(L,a,b,R,G,B, declare_out_type=False) -> str:
return f"""
{_CvtColor32Op.get_LAB_to_XYZ_body(L,a,b,'X','Y','Z', xyz_type='float')}
{_CvtColor32Op.get_XYZ_to_RGB_body('X','Y','Z',R,G,B,rgb_type=rgb_type)}
{_CvtColor32Op.get_LAB_to_XYZ_body(L,a,b,'X','Y','Z', declare_out_type=True)}
{_CvtColor32Op.get_XYZ_to_sRGB_body('X','Y','Z',R,G,B, declare_out_type=declare_out_type)}
"""
@staticmethod
def get_RGB_to_XYZ_body(R,G,B,X,Y,Z,xyz_type='') -> str:
def get_sRGB_to_XYZ_body(R,G,B,X,Y,Z, declare_out_type=False) -> str:
return f"""
{xyz_type} {X} = fma(0.4124564, {R}, fma(0.3575761, {G}, 0.1804375*{B}));
{xyz_type} {Y} = fma(0.2126729, {R}, fma(0.7151522, {G}, 0.0721750*{B}));
{xyz_type} {Z} = fma(0.0193339, {R}, fma(0.1191920, {G}, 0.9503041*{B}));
"""
@staticmethod
def get_XYZ_to_RGB_body(X,Y,Z,R,G,B,rgb_type='') -> str:
return f"""
{rgb_type} {R} = fma( 3.2404542, {X}, fma(-1.5371385, {Y}, -0.4985314*{Z}));
{rgb_type} {G} = fma(-0.9692660, {X}, fma( 1.8760108, {Y}, 0.0415560*{Z}));
{rgb_type} {B} = fma( 0.0556434, {X}, fma(-0.2040259, {Y}, 1.0572252*{Z}));
{R} = ({R} > 0.04045)*( pow( ({R}+0.055)/1.055, 2.4) ) + ({R} <= 0.04045)*({R} / 12.92);
{G} = ({G} > 0.04045)*( pow( ({G}+0.055)/1.055, 2.4) ) + ({G} <= 0.04045)*({G} / 12.92);
{B} = ({B} > 0.04045)*( pow( ({B}+0.055)/1.055, 2.4) ) + ({B} <= 0.04045)*({B} / 12.92);
{_CvtColor32Op.get_RGB_to_XYZ_body(R,G,B,X,Y,Z,declare_out_type=declare_out_type) }
"""
@staticmethod
def get_RGB_to_BGR_body(R,G,B,b,g,r,bgr_type='') -> str:
def get_RGB_to_XYZ_body(R,G,B,X,Y,Z, declare_out_type=False) -> str:
return f"""
{bgr_type} {b} = {R};
{bgr_type} {g} = {G};
{bgr_type} {r} = {B};
{'float' if declare_out_type else ''} {X} = {R}*0.412453 + {G}*0.357580 + {B}*0.180423;
{'float' if declare_out_type else ''} {Y} = {R}*0.212671 + {G}*0.715160 + {B}*0.072169;
{'float' if declare_out_type else ''} {Z} = {R}*0.019334 + {G}*0.119193 + {B}*0.950227;
"""
@staticmethod
def get_BGR_to_RGB_body(B,G,R,r,g,b,rgb_type='') -> str:
def get_XYZ_to_sRGB_body(X,Y,Z,R,G,B, declare_out_type=False) -> str:
return f"""
{rgb_type} {r} = {B};
{rgb_type} {g} = {G};
{rgb_type} {b} = {R};
{_CvtColor32Op.get_XYZ_to_RGB_body(X,Y,Z,R,G,B,declare_out_type=declare_out_type) }
{R} = ({R} > 0.0031308)*( 1.055*pow({R},1.0/2.4)-0.055 ) + ({R} <= 0.0031308)*({R} * 12.92);
{G} = ({G} > 0.0031308)*( 1.055*pow({G},1.0/2.4)-0.055 ) + ({G} <= 0.0031308)*({G} * 12.92);
{B} = ({B} > 0.0031308)*( 1.055*pow({B},1.0/2.4)-0.055 ) + ({B} <= 0.0031308)*({B} * 12.92);
"""
@staticmethod
def get_XYZ_to_LAB_body(X,Y,Z,L,A,B,lab_type='') -> str:
def get_XYZ_to_RGB_body(X,Y,Z,R,G,B, declare_out_type=False) -> str:
return f"""
{'float' if declare_out_type else ''} {R} = clamp( {X}* 3.240479 + {Y}*-1.53715 + {Z}*-0.498535, 0.0, 1.0 );
{'float' if declare_out_type else ''} {G} = clamp( {X}*-0.969256 + {Y}* 1.875991 + {Z}* 0.041556, 0.0, 1.0 );
{'float' if declare_out_type else ''} {B} = clamp( {X}* 0.055648 + {Y}*-0.204043 + {Z}* 1.057311, 0.0, 1.0 );
"""
@staticmethod
def get_RGB_to_BGR_body(R,G,B,b,g,r, declare_out_type=False) -> str:
return f"""
{'float' if declare_out_type else ''} {b} = {R};
{'float' if declare_out_type else ''} {g} = {G};
{'float' if declare_out_type else ''} {r} = {B};
"""
@staticmethod
def get_BGR_to_RGB_body(B,G,R,r,g,b, declare_out_type=False) -> str:
return f"""
{'float' if declare_out_type else ''} {r} = {B};
{'float' if declare_out_type else ''} {g} = {G};
{'float' if declare_out_type else ''} {b} = {R};
"""
@staticmethod
def get_XYZ_to_LAB_body(X,Y,Z,L,A,B, declare_out_type=False) -> str:
beta3 = '((6.0/29.0)*(6.0/29.0)*(6.0/29.0))'
xyz_xn = '(0.9556)'
xyz_xn = '(0.950456)'
xyz_zn = '(1.088754)'
return f"""
{X} /= {xyz_xn};
@ -157,20 +177,20 @@ class _CvtColor32Op():
{Y} = ({Y} > {beta3})*rootn({Y}, 3) + ({Y} <= {beta3})*(7.787*{Y}+4.0/29.0);
{Z} = ({Z} > {beta3})*rootn({Z}, 3) + ({Z} <= {beta3})*(7.787*{Z}+4.0/29.0);
{lab_type} {L} = 116.0*{Y}-16.0;
{lab_type} {A} = 500.0*({X}-{Y});
{lab_type} {B} = 200.0*({Y}-{Z});
{'float' if declare_out_type else ''} {L} = 116.0*{Y}-16.0;
{'float' if declare_out_type else ''} {A} = 500.0*({X}-{Y});
{'float' if declare_out_type else ''} {B} = 200.0*({Y}-{Z});
"""
@staticmethod
def get_LAB_to_XYZ_body(L,A,B,X,Y,Z,xyz_type='') -> str:
def get_LAB_to_XYZ_body(L,A,B,X,Y,Z, declare_out_type=False) -> str:
beta = '(6.0/29.0)'
beta2 = '((6.0/29.0)*(6.0/29.0))'
xyz_xn = '(0.9556)'
xyz_xn = '(0.950456)'
xyz_zn = '(1.088754)'
return f"""
{xyz_type} {Y} = ({L} + 16.0) / 116.0;
{xyz_type} {X} = {Y} + {A} / 500.0;
{xyz_type} {Z} = {Y} - {B} / 200.0;
{'float' if declare_out_type else ''} {Y} = ({L} + 16.0) / 116.0;
{'float' if declare_out_type else ''} {X} = {Y} + {A} / 500.0;
{'float' if declare_out_type else ''} {Z} = {Y} - {B} / 200.0;
{Y} = ({Y} > {beta})*({Y}*{Y}*{Y}) + ({Y} <= {beta})*({Y}-16.0/116.0)*3*{beta2};
{X} = ({X} > {beta})*({X}*{X}*{X}*{xyz_xn}) + ({X} <= {beta})*({X}-16.0/116.0)*3*{beta2}*{xyz_xn};
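
As a cross-check of the constants above, a NumPy reference for the same sRGB -> XYZ -> LAB pipeline (a sketch assuming a D65 white point with Yn = 1, matching xyz_xn/xyz_zn):

import numpy as np

def srgb_to_lab(rgb):  # rgb: (..., 3) floats in [0, 1]
    # linearize sRGB, same branch as get_sRGB_to_XYZ_body
    rgb = np.where(rgb > 0.04045, ((rgb + 0.055) / 1.055) ** 2.4, rgb / 12.92)
    M = np.array([[0.412453, 0.357580, 0.180423],
                  [0.212671, 0.715160, 0.072169],
                  [0.019334, 0.119193, 0.950227]])
    xyz = rgb @ M.T / np.array([0.950456, 1.0, 1.088754])  # white-point normalize
    beta3 = (6.0 / 29.0) ** 3
    f = np.where(xyz > beta3, np.cbrt(xyz), 7.787 * xyz + 4.0 / 29.0)
    L = 116.0 * f[..., 1] - 16.0
    a = 500.0 * (f[..., 0] - f[..., 1])
    b = 200.0 * (f[..., 1] - f[..., 2])
    return np.stack([L, a, b], axis=-1)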

View file

@ -58,7 +58,7 @@ def reduce_variance(input_t, axes=None, keepdims=False):
mean = reduce_mean(input_t, axes, keepdims=True)
return reduce_mean(square(input_t - mean), axes, keepdims)
def moments(input_t, axes=None, keepdims=False):
def moments(input_t, axes=None):
"""
Returns (mean, variance) of input_t
@ -68,11 +68,9 @@ def moments(input_t, axes=None, keepdims=False):
Iterable of ints.
None - all axes
keepdims(False) keep reduced axes
"""
mean = reduce_mean(input_t, axes, keepdims)
mean_shape_keepdims = mean._op.info.o_shape_kd
var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
mean = reduce_mean(input_t, axes, True)
var = reduce_mean(square(input_t - mean), axes, True)
return mean, var
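
Since moments() now reduces with keepdims=True internally, the (input_t - mean) broadcast is valid without the old reshape through o_shape_kd, and both outputs keep the reduced axes as size 1. Usage sketch (hypothetical shapes):

t = Tensor.from_value(np.random.uniform(size=(1, 3, 8, 8)).astype(np.float32))
mean, var = op.moments(t, axes=(2, 3))   # both have shape (1, 3, 1, 1)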
def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:

View file

@ -1,6 +1,9 @@
from typing import List
import numpy as np
from ..AShape import AShape
from ..AAxes import AAxes
from ..backend import Kernel
from ..HKernel import HKernel
from ..HType import HType
@ -9,6 +12,29 @@ from ..SCacheton import SCacheton
from ..Tensor import Tensor
def split(input_t : Tensor, axis, keepdims=False) -> List[Tensor]:
"""
arguments
input_t          Tensor
axis             axis along which to split; one output Tensor per element
keepdims(False)  keep the split axis as size 1 in the outputs
"""
shape = input_t.shape
result = []
for i in range(shape[axis]):
slices = [slice(None, None, None)]*shape.ndim
slices[axis] = i if not keepdims else slice(i,i+1,1)
result.append( slice_(input_t, slices) )
return result
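
Usage sketch for the new split helper (hypothetical shapes; one output per element along the axis):

t = Tensor.from_value(np.random.uniform(size=(1, 3, 4, 4)).astype(np.float32))
r, g, b = split(t, axis=1)                 # each has shape (1, 4, 4)
r, g, b = split(t, axis=1, keepdims=True)  # each has shape (1, 1, 4, 4)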
def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
"""
arguments: