From 65fb01ff3999d7af3785749fd3a5e4b0f28cd9be Mon Sep 17 00:00:00 2001
From: iperov <lepersorium@gmail.com>
Date: Wed, 6 Oct 2021 14:50:53 +0400
Subject: [PATCH] update xlib.avecl

---
 xlib/avecl/__init__.py                     |   1 +
 xlib/avecl/_internal/EInterpolation.py     |   8 ++
 xlib/avecl/_internal/Tensor.py             |   1 +
 xlib/avecl/_internal/TensorImpl.py         |   1 +
 xlib/avecl/_internal/op/binary_morph.py    |   2 +-
 xlib/avecl/_internal/op/remap_np_affine.py | 137 ++++++++++++++++++++-
 6 files changed, 143 insertions(+), 7 deletions(-)
 create mode 100644 xlib/avecl/_internal/EInterpolation.py

diff --git a/xlib/avecl/__init__.py b/xlib/avecl/__init__.py
index e503228..6e2d849 100644
--- a/xlib/avecl/__init__.py
+++ b/xlib/avecl/__init__.py
@@ -18,6 +18,7 @@ from ._internal.backend import (Device, DeviceInfo, Kernel,
                                 get_available_devices_info, get_best_device,
                                 get_default_device, get_device,
                                 set_default_device)
+from ._internal.EInterpolation import EInterpolation
 from ._internal.HArgs import HArgs
 from ._internal.HKernel import HKernel
 from ._internal.HTensor import HTensor
diff --git a/xlib/avecl/_internal/EInterpolation.py b/xlib/avecl/_internal/EInterpolation.py
new file mode 100644
index 0000000..ee422c8
--- /dev/null
+++ b/xlib/avecl/_internal/EInterpolation.py
@@ -0,0 +1,8 @@
+from enum import IntEnum, IntEnum
+
+class EInterpolation(IntEnum):
+    NEAREST = 0
+    LINEAR = 1
+    CUBIC = 2
+    LANCZOS3 = 3
+    LANCZOS4 = 4
\ No newline at end of file
diff --git a/xlib/avecl/_internal/Tensor.py b/xlib/avecl/_internal/Tensor.py
index bbab164..100225c 100644
--- a/xlib/avecl/_internal/Tensor.py
+++ b/xlib/avecl/_internal/Tensor.py
@@ -102,6 +102,7 @@ class Tensor:
     def __setitem__(self, slices, value): ...
 
     def as_shape(self, shape) -> 'Tensor': ...
+    def cast(self, dtype) -> 'Tensor': ...
     def copy(self) -> 'Tensor': ...
     def max(self, axes=None, keepdims=False) -> 'Tensor': ...
     def mean(self, axes=None, keepdims=False) -> 'Tensor': ...
diff --git a/xlib/avecl/_internal/TensorImpl.py b/xlib/avecl/_internal/TensorImpl.py
index 41587d4..1491582 100644
--- a/xlib/avecl/_internal/TensorImpl.py
+++ b/xlib/avecl/_internal/TensorImpl.py
@@ -60,6 +60,7 @@ def Tensor_as_shape(self : Tensor, shape) -> Tensor:
     return TensorRef(self, shape)
 Tensor.as_shape = Tensor_as_shape
 
+Tensor.cast = cast
 def Tensor_copy(self : Tensor) -> Tensor:
     return Tensor.from_value(self)
 Tensor.copy = Tensor_copy
diff --git a/xlib/avecl/_internal/op/binary_morph.py b/xlib/avecl/_internal/op/binary_morph.py
index 4541a56..5d6cfc1 100644
--- a/xlib/avecl/_internal/op/binary_morph.py
+++ b/xlib/avecl/_internal/op/binary_morph.py
@@ -41,5 +41,5 @@ def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_bor
         x = gaussian_blur(x, blur * 0.250, dtype=dtype)
     else:
         x = cast(x, dtype=dtype)
-        
+
     return x[...,H:-H,W:-W]
diff --git a/xlib/avecl/_internal/op/remap_np_affine.py b/xlib/avecl/_internal/op/remap_np_affine.py
index 1d97f87..0ec0afe 100644
--- a/xlib/avecl/_internal/op/remap_np_affine.py
+++ b/xlib/avecl/_internal/op/remap_np_affine.py
@@ -2,11 +2,13 @@ import numpy as np
 
 from ..AShape import AShape
 from ..backend import Kernel
+from ..EInterpolation import EInterpolation
 from ..HKernel import HKernel
 from ..SCacheton import SCacheton
 from ..Tensor import Tensor
 
-def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor:
+
+def remap_np_affine (input_t : Tensor, affine_n : np.ndarray, interpolation : EInterpolation = None, inverse=False, output_size=None, dtype=None) -> Tensor:
     """
     remap affine operator for all channels using single numpy affine mat
 
@@ -16,12 +18,14 @@ def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, outpu
 
         affine_n    np.array (2,3)
 
+        interpolation    EInterpolation
+
         dtype
     """
     if affine_n.shape != (2,3):
         raise ValueError('affine_n.shape must be (2,3)')
 
-    op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype)
+    op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, interpolation, output_size, dtype)
 
     output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() )
 
@@ -33,7 +37,7 @@ def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, outpu
         D = 1.0 / D if D != 0.0 else 0.0
         a, b, c, d, e, f = (  e*D, -b*D, (b*f-e*c)*D ,
                              -d*D,  a*D, (d*c-a*f)*D )
-                             
+
     input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(),
                                     np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) )
 
@@ -41,11 +45,13 @@ def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, outpu
 
 
 class _RemapAffineOp():
-    def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype):
+    def __init__(self, i_shape : AShape, i_dtype, interpolation, o_size, o_dtype):
         if np.dtype(i_dtype).type == np.bool_:
             raise ValueError('np.bool_ dtype of i_dtype is not supported.')
         if i_shape.ndim < 2:
             raise ValueError('i_shape.ndim must be >= 2 (...,H,W)')
+        if interpolation is None:
+            interpolation = EInterpolation.LINEAR
 
         IH,IW = i_shape[-2:]
         if o_size is not None:
@@ -60,7 +66,8 @@ class _RemapAffineOp():
         self.o_shape = o_shape
         self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype
 
-        self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+        if interpolation == EInterpolation.LINEAR:
+            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
 
 {HKernel.define_tensor('O', o_shape, o_dtype)}
 {HKernel.define_tensor('I', i_shape, i_dtype)}
@@ -93,4 +100,122 @@ __kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I
 
     O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11);
 }}
-""")
\ No newline at end of file
+""")
+        elif interpolation == EInterpolation.CUBIC:
+            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+
+float cubic(float p0, float p1, float p2, float p3, float x)
+{{
+    float a0 = p1;
+    float a1 = p2 - p0;
+    float a2 = 2 * p0 - 5 * p1 + 4 * p2 - p3;
+    float a3 = 3 * (p1 - p2) + p3 - p0;
+    return a0 + 0.5 * x * (a1 + x * (a2 + x * a3));
+}}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
+                   float a, float b, float c,
+                   float d, float e, float f)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    float cx01f = om1*a + om2*b + c;
+    float cy01f = om1*d + om2*e + f;
+
+    float cxf = floor(cx01f);   int cx = (int)cxf;
+    float cyf = floor(cy01f);   int cy = (int)cyf;
+
+    float dx = cx01f-cxf;
+    float dy = cy01f-cyf;
+
+    float row[4];
+
+    #pragma unroll
+    for (int y=cy-1, j=0; y<=cy+2; y++, j++)
+    {{
+        float col[4];
+        #pragma unroll
+        for (int x=cx-1, i=0; x<=cx+2; x++, i++)
+        {{
+            float sxy = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='y,x')}));
+
+            col[i] = sxy*(y >= 0 & y < Im2 & x >= 0 & x < Im1);
+        }}
+        row[j] = cubic(col[0], col[1], col[2], col[3], dx);
+    }}
+
+    float O = cubic(row[0], row[1], row[2], row[3], dy);
+
+    O_GLOBAL_STORE(gid, O);
+}}
+""")
+        elif interpolation in [EInterpolation.LANCZOS3, EInterpolation.LANCZOS4]:
+            RAD = 3 if interpolation == EInterpolation.LANCZOS3 else 4
+            self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f"""
+
+{HKernel.define_tensor('O', o_shape, o_dtype)}
+{HKernel.define_tensor('I', i_shape, i_dtype)}
+
+__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME,
+                   float a, float b, float c,
+                   float d, float e, float f)
+{{
+    size_t gid = get_global_id(0);
+
+    {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)}
+
+    float cx01f = om1*a + om2*b + c;
+    float cy01f = om1*d + om2*e + f;
+
+    float cxf = floor(cx01f);   int cx = (int)cxf;
+    float cyf = floor(cy01f);   int cy = (int)cyf;
+
+    #define RAD {RAD}
+    float Fy[2 * RAD];
+    float Fx[2 * RAD];
+
+    #pragma unroll
+    for (int y=cy-RAD+1, j=0; y<=cy+RAD; y++, j++)
+    {{
+        float dy = fabs(cy01f - y);
+        if (dy < 1e-4) Fy[j] = 1;
+        else if (dy > RAD) Fy[j] = 0;
+        else Fy[j] = ( RAD * sin(M_PI * dy) * sin(M_PI * dy / RAD) ) / ( (M_PI*M_PI)*dy*dy );
+    }}
+
+    #pragma unroll
+    for (int x=cx-RAD+1, i=0; x<=cx+RAD; x++, i++)
+    {{
+        float dx = fabs(cx01f - x);
+        if (dx < 1e-4) Fx[i] = 1;
+        else if (dx > RAD) Fx[i] = 0;
+        else Fx[i] = ( RAD * sin(M_PI * dx) * sin(M_PI * dx / RAD) ) / ( (M_PI*M_PI)*dx*dx );
+    }}
+
+    float FxFysum = 0;
+    float O = 0;
+
+    #pragma unroll
+    for (int y=cy-RAD+1, j=0; y<=cy+RAD; y++, j++)
+    #pragma unroll
+    for (int x=cx-RAD+1, i=0; x<=cx+RAD; x++, i++)
+    {{
+        float sxy = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='y,x')}));
+
+        float Fxyv = Fx[i]*Fy[j];
+        FxFysum += Fxyv;
+
+        O += sxy*Fxyv*(y >= 0 & y < Im2 & x >= 0 & x < Im1);
+    }}
+    O = O / FxFysum;
+
+    O_GLOBAL_STORE(gid, O);
+}}
+""")
+        else:
+            raise ValueError(f'Unsupported interpolation type {interpolation}')
\ No newline at end of file