add xlib.avecl

2025-08-20 13:33:25 -07:00 · 2021-09-30 18:21:30 +04:00 · 2021-09-30 18:21:30 +04:00 · 0058474da7
commit 0058474da7
parent 932edfe875
56 changed files with 5569 additions and 0 deletions
--- a/xlib/avecl/_internal/HKernel.py
+++ b/xlib/avecl/_internal/HKernel.py
@ -0,0 +1,496 @@
+import numpy as np
+
+class HKernel:
+    """
+    Helper functions for Kernels
+    """
+
+    # Lookup table from numpy scalar types to the names of the matching
+    # OpenCL C scalar types. The keys are the scalar classes themselves
+    # (e.g. np.float32), so callers normalize with np.dtype(x).type first.
+    _np_dtype_to_cl = { np.bool_   : 'bool',
+                        np.int8    : 'char',
+                        np.uint8   : 'uchar',
+                        np.int16   : 'short',
+                        np.uint16  : 'ushort',
+                        np.int32   : 'int',
+                        np.uint32  : 'uint',
+                        np.int64   : 'long',
+                        np.uint64  : 'ulong',
+                        np.float16 : 'half',
+                        np.float32 : 'float',
+                        np.float64 : 'double'
+                      }
+
+
+
+    @staticmethod
+    def np_dtype_to_cl(dtype : np.dtype):
+        """
+        Translates a numpy dtype to the name of the matching OpenCL scalar type.
+
+        dtype is normalized through np.dtype(), so a dtype instance, a scalar
+        class or a dtype string are all accepted.
+        Raises KeyError for a type that has no entry in the lookup table.
+
+        example np.float32  -> 'float'
+                np.uint8    -> 'uchar'
+        """
+        scalar_type = np.dtype(dtype).type
+        cl_names = HKernel._np_dtype_to_cl
+        return cl_names[scalar_type]
+
+    @staticmethod
+    def define_scalar_func_arg(name, dtype : np.dtype):
+        """
+        Returns the declaration of a scalar kernel argument: '<opencl type> <name>'
+
+        example 'alpha', np.float32  -> 'float alpha'
+        """
+        cl_type = HKernel._np_dtype_to_cl[np.dtype(dtype).type]
+        return f'{cl_type} {name}'
+
+    @staticmethod
+    def define_tensor_type(name, dtype : np.dtype):
+        """
+        Returns a definitions for operations with tensor
+
+        arguments
+
+            name     text. Upper-cased and used as prefix of every macro.
+
+            dtype    anything accepted by np.dtype()
+
+        For np.float16 the *_PTR_TYPE macros are 'half' while the *_TYPE macros
+        are 'float', and loads/stores go through vload_half*/vstore_half*.
+        For every other dtype both groups use the same OpenCL type and the
+        buffer is indexed directly.
+
+        The argument of *_TO_FLOATX is parenthesized in the expansion, so an
+        expression such as O_TO_FLOATX(a+b) converts the whole expression.
+
+        Raises KeyError for a dtype that has no OpenCL counterpart.
+
+        example for 'O', np.float16:
+
+        #define O_PTR_NAME p_O
+        #define O_PTR_TYPE half
+        #define O_PTR_TYPE2 half2
+        #define O_PTR_TYPE3 half3
+        #define O_PTR_TYPE4 half4
+        #define O_PTR_TYPE8 half8
+        #define O_PTR_TYPE16 half16
+        #define O_TYPE float
+        #define O_TYPE2 float2
+        #define O_TYPE3 float3
+        #define O_TYPE4 float4
+        #define O_TYPE8 float8
+        #define O_TYPE16 float16
+        #define O_GLOBAL_LOAD(offset)   vload_half  (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD2(offset)  vload_half2 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD3(offset)  vload_half3 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD4(offset)  vload_half4 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD8(offset)  vload_half8 (0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_LOAD16(offset) vload_half16(0, (const __global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE(offset,value)   vstore_half  ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE2(offset,value)  vstore_half2 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE3(offset,value)  vstore_half3 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE4(offset,value)  vstore_half4 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE8(offset,value)  vstore_half8 ( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_GLOBAL_STORE16(offset,value) vstore_half16( (value), 0, (__global half*) (&O_PTR_NAME[(offset)]) )
+        #define O_TO_FLOATX(x) ((float)(x))
+        """
+        name_upper = name.upper()
+        ptr_name = f'{name_upper}_PTR_NAME'
+
+        dtype = np.dtype(dtype).type
+        is_half = dtype == np.float16
+
+        # '' is the scalar variant, the others are the vector widths.
+        widths = ('', '2', '3', '4', '8', '16')
+
+        out = [f'#define {ptr_name} p_{name_upper}']
+
+        if is_half:
+            # half is only the storage type, values are handled as float
+            ptr_type = 'half'
+            value_type = HKernel.np_dtype_to_cl(np.float32)
+        else:
+            ptr_type = value_type = HKernel.np_dtype_to_cl(dtype)
+
+        out += [f'#define {name_upper}_PTR_TYPE{w} {ptr_type}{w}' for w in widths]
+        out += [f'#define {name_upper}_TYPE{w} {value_type}{w}' for w in widths]
+
+        for w in widths:
+            # padding keeps the macro bodies aligned in a column
+            pad = ' ' * (3 - len(w))
+            macro = f'#define {name_upper}_GLOBAL_LOAD{w}(offset){pad}'
+            if is_half:
+                func = f'vload_half{w}'.ljust(12)
+                out += [f'{macro}{func}(0, (const __global half*) (&{ptr_name}[(offset)]) )']
+            else:
+                # NOTE(review): the vector variants index the buffer exactly like
+                # the scalar one; presumably the pointer already has the vector
+                # type where they are used - confirm against the kernels.
+                out += [f'{macro}{ptr_name}[(offset)]']
+
+        for w in widths:
+            macro = f'#define {name_upper}_GLOBAL_STORE{w}(offset,value)'
+            if is_half:
+                pad = ' ' * (3 - len(w))
+                func = f'vstore_half{w}'.ljust(13)
+                out += [f'{macro}{pad}{func}( (value), 0, (__global half*) (&{ptr_name}[(offset)]) )']
+            else:
+                out += [f'{macro} {ptr_name}[(offset)] = (value)']
+
+        # (x) is parenthesized so that the conversion applies to the whole
+        # argument when the macro is invoked with an expression.
+        if dtype in [np.float32, np.float64]:
+            out += [f'#define {name_upper}_TO_FLOATX(x) (x)']
+        elif dtype in [np.bool_, np.int8, np.uint8, np.int16, np.uint16, np.int32,np.uint32, np.float16]:
+            out += [f'#define {name_upper}_TO_FLOATX(x) ((float)(x))']
+        elif dtype in [np.int64,np.uint64]:
+            out += [f'#define {name_upper}_TO_FLOATX(x) ((double)(x))']
+        return '\n'.join(out)
+
+    @staticmethod
+    def define_tensor_shape(name, shape, axes_symbols=None):
+        """
+        Returns a definitions for operations with tensor
+
+        arguments
+
+            name                text. Upper-cased for macro names,
+                                lower-cased for the macro parameters.
+
+            shape               Iterable
+
+            axes_symbols(None)  string with one symbol per axis, used in the
+                                per-axis size macros.
+                                None -> axis numbers will be used
+
+        The returned text ends with a newline.
+
+        example for 'O', (7,3),
+
+        #define O0 7
+        #define O1 3
+        #define Om1 3
+        #define Om2 7
+        #define O_IDX(o0,o1) ( (size_t)(o0) )*3 +( o1 )
+        #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 7 )*3 +( (o1) % 3 )
+        """
+        shape = tuple(shape)
+        ndim = len(shape)
+        name_upper = name.upper()
+        name_lower = name.lower()
+
+        if axes_symbols is None:
+            # One entry per axis. Kept as a list: a joined string would be
+            # indexed per character and split axis numbers >= 10 into digits.
+            axes_symbols = [str(i) for i in range(ndim)]
+        else:
+            axes_symbols = axes_symbols.upper()
+
+        out = [f'#define {name_upper}{axes_symbols[i]} {shape[i]}' for i in range(ndim)]
+
+        # sizes addressed from the end: m1 is the last axis
+        out += [f'#define {name_upper}m{i} {shape[-i]}' for i in range(1,ndim+1)]
+
+        macro_args = HKernel.axes_seq_enum(name, ndim)
+
+        idx_terms = []
+        idx_mod_terms = []
+        for i in range(ndim):
+            axis = f'{name_lower}{i}'
+            # multipliers of this axis: the sizes of all the following axes
+            stride = ''.join(f'*{shape[j]} ' for j in range(i+1,ndim))
+
+            if i == 0:
+                # only the leading term carries the size_t cast
+                idx_terms.append(f'( (size_t)({axis}) ){stride}')
+                idx_mod_terms.append(f'( (size_t)({axis}) % {shape[i]} ){stride}')
+            else:
+                idx_terms.append(f'( {axis} ){stride}')
+                idx_mod_terms.append(f'( ({axis}) % {shape[i]} ){stride}')
+
+        out += [f'#define {name_upper}_IDX({macro_args}) ' + '+'.join(idx_terms)]
+        out += [f'#define {name_upper}_IDX_MOD({macro_args}) ' + '+'.join(idx_mod_terms), '']
+
+        return '\n'.join(out)
+
+    @staticmethod
+    def define_tensor(name, shape, dtype : np.dtype, axes_symbols=None):
+        """
+        Returns all definitions for operations with tensor:
+        the text of define_tensor_type(name, dtype), a newline,
+        then the text of define_tensor_shape(name, shape, axes_symbols).
+
+        arguments
+
+            name     text
+
+            shape           Iterable
+
+            dtype           np.dtype
+
+            axes_symbols(None)  string of symbols.
+                                None -> numeric symbols will be used
+
+        example for 'O', (2,4), np.float16
+
+        #define O_PTR_NAME p_O
+        #define O_PTR_TYPE half
+        ... remaining type/load/store/convert definitions ...
+        #define O0 2
+        #define O1 4
+        #define Om1 4
+        #define Om2 2
+        #define O_IDX(o0,o1) ( (size_t)(o0) )*4 +( o1 )
+        #define O_IDX_MOD(o0,o1) ( (size_t)(o0) % 2 )*4 +( (o1) % 4 )
+        """
+        type_defs = HKernel.define_tensor_type(name, dtype)
+        shape_defs = HKernel.define_tensor_shape(name, shape, axes_symbols)
+        return f'{type_defs}\n{shape_defs}'
+
+    @staticmethod
+    def define_axes_sizes(axis_letter, axes_sizes):
+        """
+        Returns definitions of axes sizes, one '#define' per axis.
+        Every line, including the last one, ends with a newline.
+
+        example for 'O', (4,512,512)
+        #define O0 4
+        #define O1 512
+        #define O2 512
+        """
+        return ''.join( f'#define {axis_letter.upper()}{axis} {size}\n'
+                        for axis, size in enumerate(tuple(axes_sizes)) )
+
+    @staticmethod
+    def decompose_idx_to_axes_idxs(var_name, tensor_name, ndim):
+        """
+        Returns code that splits a flat size_t index into per-axis indexes.
+
+        The axes are extracted from the last to the first. The variable is
+        divided in the process, so it is saved beforehand and restored at the
+        end. Every axis index also gets an alias counted from the end
+        (<name>m1 is the last axis).
+
+        Example for 'gid','O',2
+
+        size_t gid_original = gid;
+        size_t o1 = gid % O1; gid /= O1;
+        #define om1 o1
+        size_t o0 = gid % O0;
+        #define om2 o0
+        gid = gid_original;
+        """
+        lower = tensor_name.lower()
+        upper = tensor_name.upper()
+        saved = f'{var_name}_original'
+
+        lines = [f'size_t {saved} = {var_name};']
+
+        for from_end, axis in enumerate(reversed(range(ndim)), start=1):
+            statement = f'size_t {lower}{axis} = {var_name} % {upper}{axis};'
+            if axis != 0:
+                # the first axis is extracted last, nothing is left to divide
+                statement += f' {var_name} /= {upper}{axis};'
+            lines.append(statement)
+            lines.append(f'#define {lower}m{from_end} {lower}{axis}')
+
+        lines.append(f'{var_name} = {saved};')
+        return '\n'.join(lines)
+
+    @staticmethod
+    def axes_order_enum(tensor_name, axes_order):
+        """
+        returns comma separated axis names of the tensor in the given order
+
+        axes_order is either an iterable of axis identifiers or a string of
+        axis symbols, a string is lower-cased.
+
+        Example
+         ('I', (1,2,0)) returns 'i1,i2,i0'
+         ('I', 'HW') return 'ih,iw'
+        """
+        order = axes_order.lower() if isinstance(axes_order, str) else tuple(axes_order)
+        prefix = tensor_name.lower()
+
+        return ','.join( f'{prefix}{axis}' for axis in order )
+
+    @staticmethod
+    def axes_seq_enum(tensor_name, ndim, new_axis=None, zero_axes=None, suffix=None):
+        """
+        returns comma separated sequential axis names for the given ndim
+
+        arguments
+
+            new_axis(None)   (name, position) extra entry inserted before
+                             the axis at that position
+
+            zero_axes(None)  axes that are emitted as literal '0'
+
+            suffix(None)     text appended as the last entry
+
+        Example
+
+         ('I', 4) returns 'i0,i1,i2,i3'
+
+         ('I', 4, new_axis=('name',1) ) returns 'i0,name,i1,i2,i3'
+
+         ('I', 3, zero_axes=(1,) ) returns 'i0,0,i2'
+
+         ('I', 2, suffix='ih,iw' ) returns 'i0,i1,ih,iw'
+        """
+        prefix = tensor_name.lower()
+
+        entries = []
+        for axis in range(ndim):
+            if zero_axes is not None and axis in zero_axes:
+                entries.append('0')
+            else:
+                entries.append(f'{prefix}{axis}')
+
+        if new_axis is not None:
+            name, axis = new_axis
+            entries = entries[:axis] + [name] + entries[axis:]
+
+        if suffix is not None:
+            entries.append(suffix)
+
+        return ','.join(entries)
+
+    @staticmethod
+    def include_constants_pi():
+        """
+        returns definitions of single precision PI constants
+
+         PI_F      pi
+         PI_2_F    pi/2
+         PI_4_F    pi/4
+
+        The text starts and ends with a newline.
+        """
+        return '\n'.join(( '',
+                           '#define  PI_F          3.14159274101257f',
+                           '#define  PI_2_F        1.57079637050629f',
+                           '#define  PI_4_F        0.78539818525314f',
+                           '' ))
+
+    @staticmethod
+    def include_hash():
+        """
+        returns OpenCL source with hash functions:
+
+         uint   hash_uint_from_uint(uint x)
+         ulong  hash_ulong_from_ulong(ulong x)
+         float  hash_float_from_uint(uint x)
+         double hash_double_from_ulong(ulong x)
+
+        The float/double variants divide the integer hash by the maximum
+        value of the integer type.
+
+        The remaining functions in the returned text (PCG and classic shader
+        hashes) are inside an OpenCL block comment, so they are not compiled.
+        The UIF macro is referenced only by those commented-out functions.
+        """
+
+        # f-string: the braces of the OpenCL code are doubled to be emitted literally.
+        return f"""
+
+#define UIF (1.0 / (float)(0xffffffffU))
+
+//from Chris Wellons https://nullprogram.com/blog/2018/07/31/ https://www.shadertoy.com/view/WttXWX
+uint hash_uint_from_uint(uint x)
+{{
+    x ^= x >> 17;
+    x *= 0xed5ad4bbU;
+    x ^= x >> 11;
+    x *= 0xac4c1b51U;
+    x ^= x >> 15;
+    x *= 0x31848babU;
+    x ^= x >> 14;
+    return x;
+}}
+
+ulong hash_ulong_from_ulong(ulong x)
+{{
+    x ^= x >> 32;
+    x *= 0xd6e8feb86659fd93U;
+    x ^= x >> 32;
+    x *= 0xd6e8feb86659fd93U;
+    x ^= x >> 32;
+    return x;
+}}
+
+float hash_float_from_uint(uint x)
+{{
+    return hash_uint_from_uint(x) / (float)(0xffffffffU);
+}}
+
+double hash_double_from_ulong(ulong x)
+{{
+    return (double)hash_ulong_from_ulong(x) / (double)(0xffffffffffffffffU);
+}}
+
+/*****************************
+UNUSED CODE
+
+//---------- PCG hashes from https://www.shadertoy.com/view/XlGcRh
+uint hash_uint_uint(uint v)
+{{
+    uint state = v * 747796405u + 2891336453u;
+    uint word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
+    return (word >> 22u) ^ word;
+}}
+
+uint2 hash_uint2_uint2 (uint2 v)
+{{
+    v = v * 1664525u + 1013904223u;
+    v.x += v.y * 1664525u;
+    v.y += v.x * 1664525u;
+    v ^= v>>16u;
+    v.x += v.y * 1664525u;
+    v.y += v.x * 1664525u;
+    v ^= v>>16u;
+    return v;
+}}
+
+uint3 hash_uint3_uint3(uint3 v)
+{{
+    v = v * 1664525u + 1013904223u;
+    v.x += v.y*v.z;
+    v.y += v.z*v.x;
+    v.z += v.x*v.y;
+    v ^= v >> 16u;
+    v.x += v.y*v.z;
+    v.y += v.z*v.x;
+    v.z += v.x*v.y;
+    return v;
+}}
+
+float hash_float_uint(uint v)
+{{
+	return (float)( hash_uint_uint(v) ) * UIF;
+}}
+
+float2 hash_float2_uint (uint v)
+{{
+    uint2 q = hash_uint2_uint2( (uint2)(v, 1) );
+    return (float2)(q.x, q.y) * UIF;
+}}
+
+float3 hash_float3_uint (uint v)
+{{
+    uint3 q = hash_uint3_uint3( (uint3)(v, 1, 1) );
+    return (float3)(q.x, q.y, q.z) * UIF;
+}}
+
+//---------- Classic hashes used in shaders
+
+float hash_float_float(float p)
+{{
+
+    float x = sin(p*12.9898)*43758.5453;
+    return x - floor(x);
+}}
+
+float hash_float_float2(float2 p)
+{{
+    float x = sin( dot(p, (float2)(12.9898, 78.233)) )*43758.5453;
+    return x - floor(x);
+}}
+
+****************************/
+
+
+"""
+
+# Names exported by 'from <module> import *': only the HKernel helper class.
+__all__ = ['HKernel']