From 6da916cc66a4186fcb02b06fc19727fcf54ae8da Mon Sep 17 00:00:00 2001
From: iperov
Date: Wed, 20 Oct 2021 18:02:50 +0400
Subject: [PATCH] update xlib.avecl

---
 xlib/avecl/__init__.py                 |   1 +
 xlib/avecl/_internal/AShape.py         |  33 ++++--
 xlib/avecl/_internal/HKernel.py        |  62 ++++------
 xlib/avecl/_internal/HType.py          |   8 +-
 xlib/avecl/_internal/NTest.py          | 111 +++++++++++-------
 xlib/avecl/_internal/Tensor.py         |   1 +
 xlib/avecl/_internal/TensorImpl.py     |   1 +
 xlib/avecl/_internal/backend/Device.py |   3 +-
 .../initializer/InitRandomUniform.py   |   2 -
 xlib/avecl/_internal/op/__init__.py    |   3 +-
 xlib/avecl/_internal/op/any_wise.py    |  79 +++++++------
 xlib/avecl/_internal/op/cvt_color.py   |  92 +++++++++------
 xlib/avecl/_internal/op/reduce.py      |   8 +-
 xlib/avecl/_internal/op/slice_.py      |  26 ++++
 14 files changed, 246 insertions(+), 184 deletions(-)

diff --git a/xlib/avecl/__init__.py b/xlib/avecl/__init__.py
index 6e2d849..be2dc5d 100644
--- a/xlib/avecl/__init__.py
+++ b/xlib/avecl/__init__.py
@@ -3,6 +3,7 @@ AveCL ! Make OpenCL great again.
 
 Lightweight ndarray library using OpenCL 1.2 written in pure python.
 Applicable for high-performance general purpose n-dim array computations for every device that supports OpenCL 1.2.
+Supports any dtype except float64.
 Works in python 3.5+. Dependencies: numpy.
 
diff --git a/xlib/avecl/_internal/AShape.py b/xlib/avecl/_internal/AShape.py
index eb27fdf..ba6bea1 100644
--- a/xlib/avecl/_internal/AShape.py
+++ b/xlib/avecl/_internal/AShape.py
@@ -15,7 +15,7 @@ class AShape(Iterable):
             shape       AShape
                         Iterable
-
+
     AShape cannot be a scalar shape; the minimal AShape is (1,)
 
     can raise ValueError during construction
@@ -50,13 +50,26 @@ class AShape(Iterable):
             self.size = size
         else:
             raise ValueError('Invalid type to create AShape')
-
+
     def copy(self) -> 'AShape':
         return AShape(self)
-
+
    def as_list(self) -> List[int]:
         return list(self.shape)
-
+
+    def check_axis(self, axis : int) -> int:
+        """
+        Check the axis and return the normalized axis value
+
+        can raise ValueError
+        """
+        if axis < 0:
+            axis += self.ndim
+
+        if axis < 0 or axis >= self.ndim:
+            raise ValueError(f'axis {axis} out of bounds for ndim {self.ndim}')
+        return axis
+
     def axes_arange(self) -> AAxes:
         """
         Returns tuple of axes arange.
@@ -64,7 +77,7 @@ class AShape(Iterable):
         Example (0,1,2) for ndim 3
         """
         return AAxes(range(self.ndim))
-
+
     def replaced_axes(self, axes, dims) -> 'AShape':
         """
         returns a new AShape where axes are replaced with new dims
@@ -76,22 +89,22 @@ class AShape(Iterable):
                 axis = ndim + axis
             if axis < 0 or axis >= ndim:
                 raise ValueError(f'invalid axis value {axis}')
-
+
             new_shape[axis] = dim
         return AShape(new_shape)
-
+
     def split(self, axis) -> Tuple['AShape', 'AShape']:
         """
         split AShape at the specified axis
-
-        returns two AShape before+exclusive and inclusive+after
+
+        returns two AShapes: before the axis (exclusive) and from the axis on (inclusive)
         """
         if axis < 0:
             axis = self.ndim + axis
 
         if axis < 0 or axis >= self.ndim:
             raise ValueError(f'invalid axis value {axis}')
-
+
         return self[:axis], self[axis:]
 
     def transpose_by_axes(self, axes) -> 'AShape':
diff --git a/xlib/avecl/_internal/HKernel.py b/xlib/avecl/_internal/HKernel.py
index 8512926..71211f5 100644
--- a/xlib/avecl/_internal/HKernel.py
+++ b/xlib/avecl/_internal/HKernel.py
@@ -15,12 +15,9 @@ class HKernel:
                 np.int64   : 'long',
                 np.uint64  : 'ulong',
                 np.float16 : 'half',
-                np.float32 : 'float',
-                np.float64 : 'double'
+                np.float32 : 'float'
                 }
-
-
     @staticmethod
     def np_dtype_to_cl(dtype : np.dtype):
         """
@@ -134,30 +131,33 @@ class HKernel:
         out += [f'#define {name_upper}_GLOBAL_STORE8(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
         out += [f'#define {name_upper}_GLOBAL_STORE16(offset,value) {name_upper}_PTR_NAME[(offset)] = (value)']
 
-        if dtype in [np.float32, np.float64]:
+        if dtype in [np.float32]:
             out += [f'#define {name_upper}_TO_FLOATX(x) x']
         elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
             out += [f'#define {name_upper}_TO_FLOATX(x) ((float)x)']
         elif dtype in [np.int64,np.uint64]:
             out += [f'#define {name_upper}_TO_FLOATX(x) ((double)x)']
         return '\n'.join(out)
-
+
     @staticmethod
     def define_ndim_idx(ndim):
         """
+        define macros to calculate the index for an n-dim shape
+
+        example for ndim=3
+
         #define NDIM3_IDX(t0,t1,t2,T0,T1,T2) (((size_t)(t0))*T1*T2+((size_t)(t1))*T2+((size_t)(t2)))
         #define NDIM3_IDX_MOD(t0,t1,t2,T0,T1,T2) (((size_t)(t0) % T0)*T1*T2+((size_t)(t1) % T1)*T2+((size_t)(t2) % T2))
         """
-
+
         out = [f'#define NDIM{ndim}_IDX(' + \
                ','.join([f't{i}' for i in range(ndim)] + [f'T{i}' for i in range(ndim)]) + \
                ') (' + '+'.join([f'((size_t)(t{i}))' + ''.join(f'*T{j}' for j in range(i+1,ndim)) for i in range(ndim) ]) + ')']
-
+
         out +=[f'#define NDIM{ndim}_IDX_MOD(' + \
                ','.join([f't{i}' for i in range(ndim)] + [f'T{i}' for i in range(ndim)]) + \
                ') (' + '+'.join([f'((size_t)(t{i}) % T{i})' + ''.join(f'*T{j}' for j in range(i+1,ndim)) for i in range(ndim) ]) + ')']
-
+
         return '\n'.join(out)
 
     @staticmethod
@@ -165,14 +165,14 @@ class HKernel:
         """
         Returns definitions for operations with a tensor shape
 
-        example for 'O', (7,3),
+        example for 'O', (2,3),
 
-        #define O0 7
+        #define O0 2
         #define O1 3
         #define Om1 3
-        #define Om2 7
-        #define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
-        #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
+        #define Om2 2
+        #define O_IDX(o0,o1) (((size_t)(o0))*3+((size_t)(o1)))
+        #define O_IDX_MOD(o0,o1) (((size_t)(o0) % 2)*3+((size_t)(o1) % 3))
         """
         shape = tuple(shape)
         ndim = len(shape)
@@ -183,36 +183,14 @@ class HKernel:
         axes_symbols = "".join([str(i) for i in range(ndim)])
         axes_symbols = axes_symbols.upper()
 
-        out = []
-        for i in range(ndim):
-            out += [f'#define {name_upper}{axes_symbols[i]} {shape[i]}']
+        out = [f'#define {name_upper}{axes_symbols[i]} {shape[i]}' for i in range(ndim)]
+        out += [f'#define {name_upper}m{i} {shape[-i]}' for i in range(1,ndim+1)]
 
-        for i in range(1,ndim+1):
-            out += [f'#define {name_upper}m{i} {shape[-i]}']
+        out += [f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) (' + \
+               '+'.join([f'((size_t)({name_lower}{i}))' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']
 
-        line = f'#define {name_upper}_IDX({HKernel.axes_seq_enum(name, ndim)}) '
-
-        for i in range(ndim):
-            line += f'( (size_t)({name_lower}{i}) )'
-
-            for j in range(i+1,ndim):
-                line += f'*{shape[j]} '
-            if i != ndim-1:
-                line += '+'
-
-        out += [line]
-
-        line = f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) '
-
-        for i in range(ndim):
-            line += f'( (size_t)({name_lower}{i}) % {shape[i]} )'
-
-            for j in range(i+1,ndim):
-                line += f'*{shape[j]} '
-            if i != ndim-1:
-                line += '+'
-
-        out += [line,'']
+        out += [f'#define {name_upper}_IDX_MOD({HKernel.axes_seq_enum(name, ndim)}) (' + \
+               '+'.join([f'((size_t)({name_lower}{i}) % {shape[i]})' + ''.join(f'*{shape[j]}' for j in range(i+1,ndim)) for i in range(ndim)]) + ')']
 
         return '\n'.join(out)
 
diff --git a/xlib/avecl/_internal/HType.py b/xlib/avecl/_internal/HType.py
index 355a706..cc60a8e 100644
--- a/xlib/avecl/_internal/HType.py
+++ b/xlib/avecl/_internal/HType.py
@@ -3,10 +3,10 @@ from typing import Iterable, List
 import numpy as np
 
 scalar_types = [int, float, np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
-                np.float16, np.float32, np.float64, np.bool_]
+                np.float16, np.float32, np.bool_]
 
 np_scalar_types = [np.uint8, np.int8, np.uint16, np.int16, np.uint32, np.int32, np.uint64, np.int64,
-                   np.float16, np.float32, np.float64, np.bool_]
+                   np.float16, np.float32, np.bool_]
 
 _np_dtype_to_cl = {
     np.bool_ : 'bool',
@@ -20,7 +20,6 @@ _np_dtype_to_cl = {
     np.int64 : 'long',
     np.float16 : 'half',
     np.float32 : 'float',
-    np.float64 : 'double',
 }
 
 _np_dtype_weight = {
@@ -34,8 +33,7 @@ _np_dtype_weight = {
     np.uint64 : 8,
     np.int64 : 9,
     np.float16 : 10,
-    np.float32 : 11,
-    np.float64 : 12,
+    np.float32 : 11
 }
 
 class HType:
diff --git a/xlib/avecl/_internal/NTest.py b/xlib/avecl/_internal/NTest.py
index 8fa9236..63ca398 100644
--- a/xlib/avecl/_internal/NTest.py
+++ b/xlib/avecl/_internal/NTest.py
@@ -1,14 +1,13 @@
-import traceback
-
 import numpy as np
 
-from .HType import HType
-from .NCore import NCore
-from .backend import get_device, get_default_device, set_default_device
-from .Tensor import Tensor
 from . import op
-from .initializer import InitRandomUniform, InitCoords2DArange
+from .backend import get_default_device, get_device, set_default_device
+from .HType import HType
 from .info import Conv2DInfo
+from .initializer import InitCoords2DArange, InitRandomUniform
+from .NCore import NCore
+from .Tensor import Tensor
+
 
 class NTest():
@@ -45,6 +44,7 @@ class NTest():
             binary_dilate_circle_test,
             binary_morph_test,
             cvt_color_test,
+            rct_test,
             ]
 
         for test_func in test_funcs:
@@ -62,18 +62,39 @@ def _all_close(x,y, atol=1, btol=1):
     return np.allclose( np.ndarray.flatten(x[None,...]), np.ndarray.flatten(y[None,...]), atol, btol )
 
+def rct_test():
+    for _ in range(10):
+        for dtype in [np.float16, np.float32]:
+            base_shape = list(np.random.randint(1, 8, size=4) )
+            shape = base_shape.copy()
+            shape[1] = 3
+
+            mask_shape = base_shape.copy()
+            mask_shape[1] = 3
+
+            print(f'rct {shape} {str(np.dtype(dtype).name)} ... ', end='', flush=True)
+
+            source_t = Tensor(shape=shape, dtype=dtype, initializer=InitRandomUniform())
+            target_t = Tensor(shape=shape, dtype=dtype, initializer=InitRandomUniform())
+            mask_t = Tensor(shape=mask_shape, dtype=dtype, initializer=InitRandomUniform())
+
+            result_t = op.rct(target_t, source_t, target_mask_t=mask_t, source_mask_t=mask_t )
+
+            print('pass')
+
+
 def cvt_color_test():
     for _ in range(10):
         for shape_len in range(2,6):
             for in_mode in ['RGB','BGR','XYZ','LAB']:
                 for out_mode in ['RGB','BGR','XYZ','LAB']:
-                    for dtype in [np.float16, np.float32, np.float64]:
+                    for dtype in [np.float16, np.float32]:
                         shape = list(np.random.randint(1, 8, size=shape_len) )
 
                         ch_axis = np.random.randint(len(shape))
                         shape[ch_axis] = 3
 
-                        print(f'cvt_color {shape} {str(np.dtype(dtype).name)} {in_mode}->{out_mode} ... ', end='')
+                        print(f'cvt_color {shape} {str(np.dtype(dtype).name)} {in_mode}->{out_mode} ... ', end='', flush=True)
 
                         inp_n = np.random.uniform(size=shape ).astype(dtype)
                         inp_t = Tensor.from_value(inp_n)
@@ -81,7 +102,9 @@ def cvt_color_test():
                         out_t = op.cvt_color(inp_t, in_mode=in_mode, out_mode=out_mode, ch_axis=ch_axis)
                         inp_t2 = op.cvt_color(out_t, in_mode=out_mode, out_mode=in_mode, ch_axis=ch_axis)
 
-                        if not _all_close(inp_t.np(), inp_t2.np(), atol=0.1, btol=0.1):
+                        is_check = in_mode in ['RGB','BGR','XYZ'] and out_mode in ['XYZ','LAB']
+
+                        if is_check and not _all_close(inp_t.np(), inp_t2.np(), atol=0.1, btol=0.1):
                             raise Exception(f'data is not equal')
                         print('pass')
 
@@ -91,7 +114,7 @@ def cast_test():
         for out_dtype in HType.get_np_scalar_types():
             shape = tuple(np.random.randint(1, 8, size=( np.random.randint(1,5))) )
 
-            print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='')
+            print(f'cast: {shape} in_dtype:{str(np.dtype(in_dtype).name)} out_dtype:{str(np.dtype(out_dtype).name)} ... ', end='', flush=True)
 
             val_n = np.random.uniform( -64, 64, size=shape ).astype(in_dtype)
             cast_n = val_n.astype(out_dtype)
@@ -113,7 +136,7 @@ def binary_morph_test():
             input_n = np.random.randint( 2, size=shape ).astype(dtype)
             input_t = Tensor.from_value(input_n)
 
-            print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='')
+            print(f'binary_morph: {shape} erode_dilate:{erode_dilate} blur:{blur} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             op.binary_morph(input_t, erode_dilate=erode_dilate, blur=blur, fade_to_border=True)
@@ -130,7 +153,7 @@ def binary_erode_circle_test():
             input_n = np.random.randint( 2, size=shape ).astype(dtype)
             input_t = Tensor.from_value(input_n)
 
-            print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
+            print(f'binary_erode_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             op.binary_erode_circle(input_t, radius=radius, iterations=iterations)
@@ -147,7 +170,7 @@ def binary_dilate_circle_test():
             input_n = np.random.randint( 2, size=shape ).astype(dtype)
             input_t = Tensor.from_value(input_n)
 
-            print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='')
+            print(f'binary_dilate_circle: {shape} radius:{radius} iters:{iterations} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             op.binary_dilate_circle(input_t, radius=radius, iterations=iterations)
 
@@ -156,11 +179,11 @@
 
 def gaussian_blur_test():
     for shape_len in range(2,5):
-        for dtype in [np.float16, np.float32, np.float64]:
+        for dtype in [np.float16, np.float32]:
             shape = np.random.randint( 1, 64, size=(shape_len,) )
 
             sigma = np.random.rand() * 10
 
-            print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='')
+            print(f'gaussian_blur: {shape} sigma:{sigma} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             val_t = Tensor.from_value(val_n)
@@ -179,7 +202,7 @@ def pad_test():
             paddings = tuple( (np.random.randint(8), np.random.randint(8)) for i in range(len(shape)) )
 
-            print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='')
+            print(f'pad: {shape} {paddings} {mode} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             pad_n = np.pad(val_n, paddings, mode=mode)
@@ -187,7 +210,7 @@ def pad_test():
             val_t = Tensor.from_value(val_n)
             pad_t = op.pad(val_t, paddings, mode=mode)
 
-            print(f'{pad_n.shape} == {pad_t.shape} ... ', end='')
+            print(f'{pad_n.shape} == {pad_t.shape} ... ', end='', flush=True)
 
             if pad_n.shape != pad_t.shape:
                 raise Exception(f'shape is not equal')
@@ -241,7 +264,7 @@ def slice_set_test():
             shape = tuple(shape)
             slices = tuple(slices)
 
-            print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
+            print(f'slice_set: {shape} {np.dtype(dtype).name} {slices} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             val_t = Tensor.from_value(val_n)
@@ -330,7 +353,7 @@ def depthwise_conv2d_test():
                         input_shape  = (n, ic, ih, iw)
                         kernel_shape = (ic, ks, ks)
 
-                        print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='')
+                        print(f'depthwise_conv2d: {input_shape},{kernel_shape},{padding},{stride},{dilation},{np.dtype(dtype).name} ... ', end='', flush=True)
 
                         input_n  = np.random.randint( 64, size=input_shape ).astype(dtype)
                         kernel_n = np.ones(shape=kernel_shape ).astype(dtype)
@@ -358,7 +381,7 @@ def warp_affine_test():
             H = np.random.randint(8, 64)
             W = np.random.randint(8, 64)
 
-            print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+            print(f'warp_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
 
             input_t  = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@@ -380,7 +403,7 @@ def remap_np_affine_test():
             H = np.random.randint(8, 64)
             W = np.random.randint(8, 64)
 
-            print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+            print(f'remap_np_affine: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
 
             input_t  = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@@ -402,7 +425,7 @@ def remap_test():
             H = np.random.randint(8, 64)
             W = np.random.randint(8, 64)
 
-            print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='')
+            print(f'remap: [{H},{W}] {np.dtype(dtype).name} ... ', end='', flush=True)
 
             input_t  = Tensor ( [H,W,2], dtype, initializer=InitCoords2DArange(0, H-1, 0, W-1) ).sum( (-1,) )
@@ -422,7 +445,7 @@ def tile_test():
             shape = tuple(np.random.randint( 8, size=(shape_len,) )+1)
             tiles = tuple(np.random.randint( 4, size=(shape_len,) )+1)
 
-            print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='')
+            print(f'tile: {shape} {tiles} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             tiled_n = np.tile(val_n, tiles)
 
@@ -430,7 +453,7 @@ def tile_test():
             val_t = Tensor.from_value(val_n)
             tiled_t = op.tile(val_t, tiles)
 
-            print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='')
+            print(f'{tiled_n.shape} == {tiled_t.shape} ... ', end='', flush=True)
 
             if tiled_n.shape != tiled_t.shape:
                 raise Exception(f'shape is not equal')
@@ -448,7 +471,7 @@ def stack_test():
             axis = np.random.randint(shape_len+1)
             stack_count = np.random.randint(4)+1
 
-            print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='')
+            print(f'stack: {shape}*{stack_count} axis:{axis} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             vals_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for i in range(stack_count) ]
             stack_n = np.stack(vals_n, axis)
@@ -456,7 +479,7 @@ def stack_test():
             vals_t = [ Tensor.from_value(vals_n[i]) for i in range(stack_count) ]
             stack_t = op.stack(vals_t, axis)
 
-            print(f'{stack_n.shape} == {stack_t.shape} ... ', end='')
+            print(f'{stack_n.shape} == {stack_t.shape} ... ', end='', flush=True)
 
             if stack_n.shape != stack_t.shape:
                 raise Exception('shape is not equal')
@@ -483,9 +506,9 @@ def reduce_test():
 
                 keepdims = np.random.randint(2) == 0
 
-                print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='')
+                print(f'reduce {op_type}: {shape} {np.dtype(dtype).name} axes={reduction_axes} keepdims={keepdims} ... ', end='', flush=True)
 
-                if dtype in [np.float16, np.float32, np.float64]:
+                if dtype in [np.float16, np.float32]:
                     value_n = np.random.uniform(size=shape).astype(dtype)
                 else:
                     value_n = np.random.randint( max(1, int(np.iinfo(dtype).max / np.prod(shape)) ), size=shape, dtype=dtype )
@@ -518,7 +541,7 @@ def InitRandomUniform_test():
         for shape_len in range(1, 5):
             shape = np.random.randint( 8, size=(shape_len,) )+1
 
-            print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='')
+            print(f'InitRandomUniform: {shape} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             Tensor(shape, dtype, initializer=InitRandomUniform()).np()
@@ -534,7 +557,7 @@ def InitCoords2DArange_test():
             w_start = np.random.randint(80)
             w_stop = w_start + np.random.randint(80)
 
-            print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='')
+            print(f'InitCoords2DArange: {shape} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             Tensor(shape, dtype, initializer=InitCoords2DArange(h_start,h_stop,w_start,w_stop )).np()
@@ -551,17 +574,17 @@ def concat_test():
                               for i,dim in enumerate(shape) )
                        for shape in ([shape] * count) )
 
-        print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='')
+        print(f'concat: {shapes} axis={axis} {np.dtype(dtype).name} ... ', end='', flush=True)
 
         V_n = [ np.random.randint( 2**8, size=shape ).astype(dtype) for shape in shapes ]
         O_n = np.concatenate(V_n, axis)
 
-        print(f'{O_n.shape} == ', end='')
+        print(f'{O_n.shape} == ', end='', flush=True)
 
         V_t = [ Tensor.from_value(V_n[i]) for i in range(count) ]
         O_t = op.concat(V_t, axis)
 
-        print(f'{O_t.shape} ... ', end='')
+        print(f'{O_t.shape} ... ', end='', flush=True)
 
         if O_n.shape != O_t.shape:
             raise Exception('shape is not equal')
@@ -596,19 +619,19 @@ def matmul_test():
 
             A_shape = (BATCH, M, K)
             B_shape = (BATCH, K, N)
 
-            print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='')
+            print(f'matmul: {A_shape} {B_shape} {np.dtype(dtype).name} ... ', end='', flush=True)
 
             A_n = np.random.randint( 2**4, size=A_shape ).astype(dtype)
             B_n = np.random.randint( 2**4, size=B_shape ).astype(dtype)
 
             O_n = np.matmul(A_n, B_n)
 
-            print(f'{O_n.shape} == ', end='')
+            print(f'{O_n.shape} == ', end='', flush=True)
 
             A_t = Tensor.from_value(A_n)
             B_t = Tensor.from_value(B_n)
             O_t = op.matmul(A_t, B_t)
 
-            print(f'{O_t.shape} ... ', end='')
+            print(f'{O_t.shape} ... ', end='', flush=True)
 
             if O_n.shape != O_t.shape:
                 raise Exception('shape is not equal')
@@ -659,17 +682,17 @@ def slice_test():
             shape = tuple(shape)
             slices = tuple(slices)
 
-            print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='')
+            print(f'slice: {shape} {np.dtype(dtype).name} {slices} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             sliced_n = val_n[slices]
 
-            print(f'{sliced_n.shape} ... ', end='')
+            print(f'{sliced_n.shape} ... ', end='', flush=True)
 
             sliced_t = Tensor.from_value(val_n)[slices]
 
-            print(f'{sliced_t.shape} ... ', end='')
+            print(f'{sliced_t.shape} ... ', end='', flush=True)
 
             if 0 in sliced_n.shape:
                 # some cases like 0:1:-1 will produce zero shape and invalid array on numpy
@@ -694,17 +717,17 @@ def transpose_test():
             axes_order = np.array([*range(shape_len)])
             np.random.shuffle(axes_order)
 
-            print(f'transpose: {shape} {axes_order} ... ', end='')
+            print(f'transpose: {shape} {axes_order} ... ', end='', flush=True)
 
             val_n = np.random.randint( 2**8, size=shape ).astype(dtype)
             transposed_n = np.transpose(val_n, axes_order)
 
-            print(f'{transposed_n.shape} ... ', end='')
+            print(f'{transposed_n.shape} ... ', end='', flush=True)
 
             val_t = Tensor.from_value(val_n)
             transposed_t = op.transpose (val_t, axes_order )
 
-            print(f'{transposed_t.shape} ... ', end='')
+            print(f'{transposed_t.shape} ... ', end='', flush=True)
 
             if transposed_n.shape != transposed_t.shape:
                 raise Exception('shape is not equal')
@@ -736,7 +759,7 @@ def any_wise_op_test():
                     shapes = shapes[::-1]
                 a_shape, b_shape = shapes
 
-                print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='')
+                print(f'any_wise: {a_shape} {str(op_type)} {b_shape}:{str(np.dtype(dtype).name)} ...', end='', flush=True)
 
                 a_n = np.random.randint( 1, 2**8, size=a_shape ).astype(dtype)
                 b_n = np.random.randint( 1, 2**8, size=b_shape ).astype(dtype)
diff --git a/xlib/avecl/_internal/Tensor.py b/xlib/avecl/_internal/Tensor.py
index 100225c..2f2f741 100644
--- a/xlib/avecl/_internal/Tensor.py
+++ b/xlib/avecl/_internal/Tensor.py
@@ -109,6 +109,7 @@ class Tensor:
     def min(self, axes=None, keepdims=False) -> 'Tensor': ...
     def reshape(self, new_shape) -> 'Tensor': ...
     def sum(self, axes=None, keepdims=False) -> 'Tensor': ...
+    def std(self, axes=None, keepdims=False) -> 'Tensor': ...
     def transpose(self, axes_order, op_text=None, dtype=None) -> 'Tensor': ...
 
     @property
diff --git a/xlib/avecl/_internal/TensorImpl.py b/xlib/avecl/_internal/TensorImpl.py
index 1491582..d252879 100644
--- a/xlib/avecl/_internal/TensorImpl.py
+++ b/xlib/avecl/_internal/TensorImpl.py
@@ -70,6 +70,7 @@ Tensor.mean = reduce_mean
 Tensor.min = reduce_min
 Tensor.reshape = reshape
 Tensor.sum = reduce_sum
+Tensor.std = reduce_std
 Tensor.transpose = transpose
 
 class TensorRef(Tensor):
diff --git a/xlib/avecl/_internal/backend/Device.py b/xlib/avecl/_internal/backend/Device.py
index 9dd40fc..f8d63b3 100644
--- a/xlib/avecl/_internal/backend/Device.py
+++ b/xlib/avecl/_internal/backend/Device.py
@@ -18,8 +18,7 @@ _np_dtype_to_cl = {
     np.uint8: CL.cl_uchar,  np.uint64: CL.cl_ulong,
     np.int64: CL.cl_long,   np.float16: CL.cl_half,
-    np.float32: CL.cl_float,
-    np.float64: CL.cl_double }
+    np.float32: CL.cl_float}
 
 _opencl_device_ids = None
 _default_device = None
diff --git a/xlib/avecl/_internal/initializer/InitRandomUniform.py b/xlib/avecl/_internal/initializer/InitRandomUniform.py
index a657de0..73e0d60 100644
--- a/xlib/avecl/_internal/initializer/InitRandomUniform.py
+++ b/xlib/avecl/_internal/initializer/InitRandomUniform.py
@@ -38,8 +38,6 @@ class InitRandomUniform(Initializer):
             gen_expression = f'hash_ulong_from_ulong(gid+seed64) % {int(hl)} + {int(l)}'
         elif tensor.dtype in [np.float16, np.float32]:
             gen_expression = f'hash_float_from_uint(gid+seed32)*{hl} + {l}'
-        elif tensor.dtype in [np.float64]:
-            gen_expression = f'hash_double_from_ulong(gid+seed64)*{hl} + {l}'
 
         kernel = Kernel(kernel_text=f"""
 {HKernel.include_hash()}
diff --git a/xlib/avecl/_internal/op/__init__.py b/xlib/avecl/_internal/op/__init__.py
index 6aa933f..3ef73f1 100644
--- a/xlib/avecl/_internal/op/__init__.py
+++ b/xlib/avecl/_internal/op/__init__.py
@@ -9,12 +9,13 @@ from .depthwise_conv2D import depthwise_conv2D
 from .gaussian_blur import gaussian_blur
 from .matmul import matmul, matmulc
 from .pad import pad
+from .rct import rct
 from .reduce import (moments, reduce_max, reduce_mean, reduce_min, reduce_std,
                      reduce_sum, reduce_variance)
 from .remap import remap
 from .remap_np_affine import remap_np_affine
 from .reshape import reshape
-from .slice_ import slice_
+from .slice_ import slice_, split
 from .slice_set import slice_set
 from .stack import stack
 from .tile import tile
diff --git a/xlib/avecl/_internal/op/any_wise.py b/xlib/avecl/_internal/op/any_wise.py
index 71f5fe1..ac03b3d 100644
--- a/xlib/avecl/_internal/op/any_wise.py
+++ b/xlib/avecl/_internal/op/any_wise.py
@@ -1,27 +1,31 @@
 import numpy as np
 
+from ..AAxes import AAxes
 from ..AShape import AShape
 from ..backend import Kernel
 from ..HArgs import HArgs
 from ..HKernel import HKernel
 from ..HType import HType
-from ..info import BroadcastInfo
+from ..info import BroadcastInfo, ReductionInfo
 from ..SCacheton import SCacheton
 from ..Tensor import Tensor
 
 
 def any_wise(op_text : str,
              *args,
+             dim_wise_axis : int = None,
             dtype : np.dtype = None,
             output_t:Tensor=None) -> Tensor:
    """
    element-wise operator with N inputs
 
    arguments
 
        op_text     example: O=(2*I0*I1)+I2
 
        *args       List[ Tensor | number ]
 
+        dim_wise_axis(None)     int : axis along which inputs are exposed per element as I{n}_{i} and outputs as O_{i}
+
        dtype
 
        output_t    compute result to this Tensor.
@@ -33,7 +37,7 @@ def any_wise(op_text : str,
 
     shape_list, dtype_list, krn_args = HArgs.decompose(args)
 
-    op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dtype, op_text)
+    op = SCacheton.get(_AnyWiseOp, shape_list, dtype_list, dim_wise_axis, dtype, op_text)
 
     if output_t is None:
         output_t = Tensor ( op.o_shape, op.o_dtype, device=device )
@@ -45,59 +49,60 @@ def any_wise(op_text : str,
     return output_t
 
 class _AnyWiseOp:
-    def __init__(self, shape_list, dtype_list, o_dtype, op_text : str):
+    def __init__(self, shape_list, dtype_list, dim_wise_axis, o_dtype, op_text : str):
         if len(shape_list) != len(dtype_list):
             raise ValueError('len(shape_list) != len(dtype_list)')
 
         self.o_dtype = o_dtype = o_dtype if o_dtype is not None else HType.get_most_weighted_dtype (dtype_list)
+        self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
+        self.o_shape = o_shape = info.o_shape
 
-        if len(shape_list) == 1:
-            # element-wise.
-            i_shape, i_dtype = shape_list[0], dtype_list[0]
-            self.o_shape = o_shape = i_shape
+        g_shape = o_shape
+        if dim_wise_axis is not None:
+            dim_wise_axis = o_shape.check_axis(dim_wise_axis)
 
-            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
-{HKernel.define_tensor('O', o_shape, o_dtype)}
-{HKernel.define_tensor('IN', i_shape, i_dtype)}
-__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const IN_PTR_TYPE* IN_PTR_NAME)
-{{
-size_t gid = get_global_id(0);
+            dim_wise_axis_size = o_shape[dim_wise_axis]
+            if dim_wise_axis_size > 16:
+                raise ValueError(f'dim_wise_axis size > 16: {dim_wise_axis_size}')
 
-O_TYPE O = O_GLOBAL_LOAD(gid);
-IN_TYPE I0 = IN_GLOBAL_LOAD(gid);
-{op_text};
-O_GLOBAL_STORE(gid, O);
-}}
-""")
-        else:
-            # Multi arg.
-            self.info = info = BroadcastInfo( [ shape if shape is not None else AShape((1,)) for shape in shape_list ])
+            g_shape = ReductionInfo( o_shape, AAxes(dim_wise_axis), False ).o_shape
 
-            self.o_shape = o_shape = info.o_shape
+        defs, arg_defs, impls = [], [], []
+        for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
+            t_name = f'I{i}'
+            if t_shape is not None:
+                defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
+                arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
 
-            defs, arg_defs, impls = [], [], []
-            for i, (t_shape, t_dtype) in enumerate(zip(shape_list, dtype_list)):
-                t_name = f'I{i}'
-                if t_shape is not None:
-                    defs.append( HKernel.define_tensor(t_name, info.br_shapes[i], t_dtype) )
-                    arg_defs.append( f", __global const {t_name}_PTR_TYPE* {t_name}_PTR_NAME" )
-                    impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('O', info.o_shape.ndim)}));")
+                if dim_wise_axis is not None:
+                    for i_elem in range(dim_wise_axis_size):
+                        impls.append( f"{t_name}_TYPE {t_name}_{i_elem} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('G', g_shape.ndim, new_axis=(f'{i_elem}', dim_wise_axis) )}));")
                 else:
-                    arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
+                    impls.append( f"{t_name}_TYPE {t_name} = {t_name}_GLOBAL_LOAD({t_name}_IDX_MOD({HKernel.axes_seq_enum('G', g_shape.ndim)}));")
+            else:
+                arg_defs.append( f", {HKernel.define_scalar_func_arg(t_name, t_dtype)}" )
 
-            defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
+        defs, arg_defs, impls = '\n'.join(defs), '\n'.join(arg_defs), '\n'.join(impls)
 
-            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+        if dim_wise_axis is not None:
+            o_def = '\n'.join( f"O_TYPE O_{i_elem};" for i_elem in range(dim_wise_axis_size) )
'\n'.join( f"O_TYPE O_{i_elem};" for i_elem in range(dim_wise_axis_size) ) + o_store = '\n'.join( f"O_GLOBAL_STORE(O_IDX({HKernel.axes_seq_enum('G', g_shape.ndim, new_axis=(f'{i_elem}', dim_wise_axis) )}), O_{i_elem});" for i_elem in range(dim_wise_axis_size) ) + else: + o_def = 'O_TYPE O;' + o_store = 'O_GLOBAL_STORE(gid, O);' + + self.forward_krn = Kernel(global_shape=(g_shape.size,), kernel_text=f""" {defs} {HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor_shape('G', g_shape)} __kernel void impl(__global O_PTR_TYPE* O_PTR_NAME{arg_defs}) {{ size_t gid = get_global_id(0); -{HKernel.decompose_idx_to_axes_idxs('gid', 'o', o_shape.ndim)} +{HKernel.decompose_idx_to_axes_idxs('gid', 'G', g_shape.ndim)} {impls} -O_TYPE O; +{o_def} {op_text}; -O_GLOBAL_STORE(gid, O); +{o_store} }} """) diff --git a/xlib/avecl/_internal/op/cvt_color.py b/xlib/avecl/_internal/op/cvt_color.py index 6a7d822..35add96 100644 --- a/xlib/avecl/_internal/op/cvt_color.py +++ b/xlib/avecl/_internal/op/cvt_color.py @@ -39,7 +39,7 @@ def cvt_color (input_t : Tensor, in_mode : str, out_mode : str, ch_axis=1, dtype return output_t _allowed_modes = ['RGB', 'BGR', 'XYZ', 'LAB'] -_allowed_dtypes = [np.float16, np.float32, np.float64] +_allowed_dtypes = [np.float16, np.float32] class _CvtColor32Op(): def __init__(self, i_shape : AShape, i_dtype, in_mode, o_dtype, out_mode, ch_axis): @@ -100,54 +100,74 @@ class _CvtColor32Op(): self.forward_krn = krn @staticmethod - def get_RGB_to_LAB_body(R,G,B,L,a,b,lab_type='') -> str: + def get_RGB_to_LAB_body(R,G,B,L,a,b, declare_out_type=False) -> str: return f""" -{_CvtColor32Op.get_RGB_to_XYZ_body(R,G,B,'X','Y','Z', xyz_type='float')} -{_CvtColor32Op.get_XYZ_to_LAB_body('X','Y','Z',L,a,b, lab_type=lab_type)} +{_CvtColor32Op.get_sRGB_to_XYZ_body(R,G,B,'X','Y','Z', declare_out_type=True)} +{_CvtColor32Op.get_XYZ_to_LAB_body('X','Y','Z',L,a,b, declare_out_type=declare_out_type)} """ @staticmethod - def get_LAB_to_RGB_body(L,a,b,R,G,B,rgb_type='') -> str: + def get_LAB_to_RGB_body(L,a,b,R,G,B, declare_out_type=False) -> str: return f""" -{_CvtColor32Op.get_LAB_to_XYZ_body(L,a,b,'X','Y','Z', xyz_type='float')} -{_CvtColor32Op.get_XYZ_to_RGB_body('X','Y','Z',R,G,B,rgb_type=rgb_type)} +{_CvtColor32Op.get_LAB_to_XYZ_body(L,a,b,'X','Y','Z', declare_out_type=True)} +{_CvtColor32Op.get_XYZ_to_sRGB_body('X','Y','Z',R,G,B, declare_out_type=declare_out_type)} """ @staticmethod - def get_RGB_to_XYZ_body(R,G,B,X,Y,Z,xyz_type='') -> str: + def get_sRGB_to_XYZ_body(R,G,B,X,Y,Z, declare_out_type=False) -> str: return f""" -{xyz_type} {X} = fma(0.4124564, {R}, fma(0.3575761, {G}, 0.1804375*{B})); -{xyz_type} {Y} = fma(0.2126729, {R}, fma(0.7151522, {G}, 0.0721750*{B})); -{xyz_type} {Z} = fma(0.0193339, {R}, fma(0.1191920, {G}, 0.9503041*{B})); -""" - @staticmethod - def get_XYZ_to_RGB_body(X,Y,Z,R,G,B,rgb_type='') -> str: - return f""" -{rgb_type} {R} = fma( 3.2404542, {X}, fma(-1.5371385, {Y}, -0.4985314*{Z})); -{rgb_type} {G} = fma(-0.9692660, {X}, fma( 1.8760108, {Y}, 0.0415560*{Z})); -{rgb_type} {B} = fma( 0.0556434, {X}, fma(-0.2040259, {Y}, 1.0572252*{Z})); +{R} = ({R} > 0.04045)*( pow( ({R}+0.055)/1.055, 2.4) ) + ({R} <= 0.04045)*({R} / 12.92); +{G} = ({G} > 0.04045)*( pow( ({G}+0.055)/1.055, 2.4) ) + ({G} <= 0.04045)*({G} / 12.92); +{B} = ({B} > 0.04045)*( pow( ({B}+0.055)/1.055, 2.4) ) + ({B} <= 0.04045)*({B} / 12.92); + +{_CvtColor32Op.get_RGB_to_XYZ_body(R,G,B,X,Y,Z,declare_out_type=declare_out_type) } """ @staticmethod - def get_RGB_to_BGR_body(R,G,B,b,g,r,bgr_type='') -> str: 
+    def get_RGB_to_XYZ_body(R,G,B,X,Y,Z, declare_out_type=False) -> str:
         return f"""
-{bgr_type} {b} = {R};
-{bgr_type} {g} = {G};
-{bgr_type} {r} = {B};
+{'float' if declare_out_type else ''} {X} = {R}*0.412453 + {G}*0.357580 + {B}*0.180423;
+{'float' if declare_out_type else ''} {Y} = {R}*0.212671 + {G}*0.715160 + {B}*0.072169;
+{'float' if declare_out_type else ''} {Z} = {R}*0.019334 + {G}*0.119193 + {B}*0.950227;
 """
 
     @staticmethod
-    def get_BGR_to_RGB_body(B,G,R,r,g,b,rgb_type='') -> str:
+    def get_XYZ_to_sRGB_body(X,Y,Z,R,G,B, declare_out_type=False) -> str:
         return f"""
-{rgb_type} {r} = {B};
-{rgb_type} {g} = {G};
-{rgb_type} {b} = {R};
+{_CvtColor32Op.get_XYZ_to_RGB_body(X,Y,Z,R,G,B,declare_out_type=declare_out_type) }
+{R} = ({R} > 0.0031308)*( 1.055*pow({R},1.0/2.4)-0.055 ) + ({R} <= 0.0031308)*({R} * 12.92);
+{G} = ({G} > 0.0031308)*( 1.055*pow({G},1.0/2.4)-0.055 ) + ({G} <= 0.0031308)*({G} * 12.92);
+{B} = ({B} > 0.0031308)*( 1.055*pow({B},1.0/2.4)-0.055 ) + ({B} <= 0.0031308)*({B} * 12.92);
+"""
+
+    @staticmethod
+    def get_XYZ_to_RGB_body(X,Y,Z,R,G,B, declare_out_type=False) -> str:
+        return f"""
+{'float' if declare_out_type else ''} {R} = clamp( {X}* 3.240479 + {Y}*-1.53715 + {Z}*-0.498535, 0.0, 1.0 );
+{'float' if declare_out_type else ''} {G} = clamp( {X}*-0.969256 + {Y}* 1.875991 + {Z}* 0.041556, 0.0, 1.0 );
+{'float' if declare_out_type else ''} {B} = clamp( {X}* 0.055648 + {Y}*-0.204043 + {Z}* 1.057311, 0.0, 1.0 );
+"""
+
+    @staticmethod
+    def get_RGB_to_BGR_body(R,G,B,b,g,r, declare_out_type=False) -> str:
+        return f"""
+{'float' if declare_out_type else ''} {b} = {R};
+{'float' if declare_out_type else ''} {g} = {G};
+{'float' if declare_out_type else ''} {r} = {B};
+"""
+
+    @staticmethod
+    def get_BGR_to_RGB_body(B,G,R,r,g,b, declare_out_type=False) -> str:
+        return f"""
+{'float' if declare_out_type else ''} {r} = {B};
+{'float' if declare_out_type else ''} {g} = {G};
+{'float' if declare_out_type else ''} {b} = {R};
+"""
 
     @staticmethod
-    def get_XYZ_to_LAB_body(X,Y,Z,L,A,B,lab_type='') -> str:
+    def get_XYZ_to_LAB_body(X,Y,Z,L,A,B, declare_out_type=False) -> str:
         beta3 = '((6.0/29.0)*(6.0/29.0)*(6.0/29.0))'
-        xyz_xn = '(0.9556)'
+        xyz_xn = '(0.950456)'
         xyz_zn = '(1.088754)'
         return f"""
 {X} /= {xyz_xn};
@@ -157,20 +177,20 @@ class _CvtColor32Op():
 {Z} /= {xyz_zn};
 
 {X} = ({X} > {beta3})*rootn({X}, 3) + ({X} <= {beta3})*(7.787*{X}+4.0/29.0);
 {Y} = ({Y} > {beta3})*rootn({Y}, 3) + ({Y} <= {beta3})*(7.787*{Y}+4.0/29.0);
 {Z} = ({Z} > {beta3})*rootn({Z}, 3) + ({Z} <= {beta3})*(7.787*{Z}+4.0/29.0);
 
-{lab_type} {L} = 116.0*{Y}-16.0;
-{lab_type} {A} = 500.0*({X}-{Y});
-{lab_type} {B} = 200.0*({Y}-{Z});
+{'float' if declare_out_type else ''} {L} = 116.0*{Y}-16.0;
+{'float' if declare_out_type else ''} {A} = 500.0*({X}-{Y});
+{'float' if declare_out_type else ''} {B} = 200.0*({Y}-{Z});
 """
 
     @staticmethod
-    def get_LAB_to_XYZ_body(L,A,B,X,Y,Z,xyz_type='') -> str:
+    def get_LAB_to_XYZ_body(L,A,B,X,Y,Z, declare_out_type=False) -> str:
         beta = '(6.0/29.0)'
         beta2 = '((6.0/29.0)*(6.0/29.0))'
-        xyz_xn = '(0.9556)'
+        xyz_xn = '(0.950456)'
         xyz_zn = '(1.088754)'
         return f"""
-{xyz_type} {Y} = ({L} + 16.0) / 116.0;
-{xyz_type} {X} = {Y} + {A} / 500.0;
-{xyz_type} {Z} = {Y} - {B} / 200.0;
+{'float' if declare_out_type else ''} {Y} = ({L} + 16.0) / 116.0;
+{'float' if declare_out_type else ''} {X} = {Y} + {A} / 500.0;
+{'float' if declare_out_type else ''} {Z} = {Y} - {B} / 200.0;
 
 {Y} = ({Y} > {beta})*({Y}*{Y}*{Y}) + ({Y} <= {beta})*({Y}-16.0/116.0)*3*{beta2};
 {X} = ({X} > {beta})*({X}*{X}*{X}*{xyz_xn}) + ({X} <= {beta})*({X}-16.0/116.0)*3*{beta2}*{xyz_xn};
diff --git a/xlib/avecl/_internal/op/reduce.py b/xlib/avecl/_internal/op/reduce.py
index 6356eee..823f0c6 100644
--- a/xlib/avecl/_internal/op/reduce.py
+++ b/xlib/avecl/_internal/op/reduce.py
@@ -58,7 +58,7 @@ def reduce_variance(input_t, axes=None, keepdims=False):
     mean = reduce_mean(input_t, axes, keepdims=True)
     return reduce_mean(square(input_t - mean), axes, keepdims)
 
-def moments(input_t, axes=None, keepdims=False):
+def moments(input_t, axes=None):
     """
     Returns (mean, variance) of input_t
 
@@ -68,11 +68,9 @@
         Iterable of ints.
         None - all axes
 
-        keepdims(False)     keep reduced axes
     """
-    mean = reduce_mean(input_t, axes, keepdims)
-    mean_shape_keepdims = mean._op.info.o_shape_kd
-    var = reduce_mean(square(input_t - mean.reshape(mean_shape_keepdims) ), axes, keepdims)
+    mean = reduce_mean(input_t, axes, True)
+    var = reduce_mean(square(input_t - mean), axes, True)
 
     return mean, var
 
 def reduce_min (input_t : Tensor, axes=None, keepdims=False, output_t=None, is_add_to_output=False) -> Tensor:
diff --git a/xlib/avecl/_internal/op/slice_.py b/xlib/avecl/_internal/op/slice_.py
index 2503d42..d692470 100644
--- a/xlib/avecl/_internal/op/slice_.py
+++ b/xlib/avecl/_internal/op/slice_.py
@@ -1,6 +1,9 @@
+from typing import List
+
 import numpy as np
 
 from ..AShape import AShape
+from ..AAxes import AAxes
 from ..backend import Kernel
 from ..HKernel import HKernel
 from ..HType import HType
@@ -9,6 +12,29 @@ from ..SCacheton import SCacheton
 from ..Tensor import Tensor
 
 
+def split(input_t : Tensor, axis, keepdims=False) -> List[Tensor]:
+    """
+    Split input_t along the axis into a list of tensors, one per index.
+
+    arguments
+
+        input_t     Tensor
+
+        axis        int     axis to split along
+
+        keepdims(False)     keep the split axis with size 1 in every output
+    """
+    shape = input_t.shape
+
+    result = []
+    for i in range(shape[axis]):
+        slices = [slice(None, None, None)]*shape.ndim
+
+        slices[axis] = i if not keepdims else slice(i,i+1,1)
+
+        result.append( slice_(input_t, slices) )
+
+    return result
+
+
 def slice_(input_t : Tensor, slices, dtype : np.dtype = None, output_t=None, is_add_to_output=False) -> Tensor:
     """
     arguments:
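
Usage sketch for the APIs this patch touches (not part of the patch itself). It assumes a working OpenCL 1.2 device and imports via the internal module paths used by NTest.py; the NCHW shapes and the `bgr_t`/`lab_t` names are illustrative only, and the dim_wise_axis naming convention is inferred from the generated kernel in any_wise.py.

import numpy as np
from xlib.avecl._internal.Tensor import Tensor
from xlib.avecl._internal import op
from xlib.avecl._internal.op.any_wise import any_wise

src_t = Tensor.from_value(np.random.uniform(size=(1,3,8,8)).astype(np.float32))
dst_t = Tensor.from_value(np.random.uniform(size=(1,3,8,8)).astype(np.float32))

# Tensor.std, newly registered in TensorImpl.py (wraps reduce_std)
std_t = src_t.std(axes=(2,3), keepdims=True)

# op.split, newly exported from op/__init__.py: one tensor per index along the
# axis; keepdims=True keeps the split axis with size 1 in every output
r_t, g_t, b_t = op.split(src_t, axis=1, keepdims=True)

# cvt_color now accepts float16/float32 only (float64 support was removed)
lab_t = op.cvt_color(src_t, in_mode='RGB', out_mode='LAB', ch_axis=1)

# op.rct, called with the same signature NTest.rct_test uses
mask_t = Tensor.from_value(np.ones((1,3,8,8), np.float32))
out_t = op.rct(dst_t, src_t, target_mask_t=mask_t, source_mask_t=mask_t)

# dim_wise_axis: judging from the generated kernel, each input is exposed per
# element along the axis as I{n}_0..I{n}_{size-1} and outputs are assigned to
# O_0..O_{size-1} (axis size must be <= 16), e.g. an RGB->BGR channel swap:
bgr_t = any_wise('O_0 = I0_2; O_1 = I0_1; O_2 = I0_0', src_t, dim_wise_axis=1)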