From 65fb01ff3999d7af3785749fd3a5e4b0f28cd9be Mon Sep 17 00:00:00 2001 From: iperov Date: Wed, 6 Oct 2021 14:50:53 +0400 Subject: [PATCH] update xlib.avecl --- xlib/avecl/__init__.py | 1 + xlib/avecl/_internal/EInterpolation.py | 8 ++ xlib/avecl/_internal/Tensor.py | 1 + xlib/avecl/_internal/TensorImpl.py | 1 + xlib/avecl/_internal/op/binary_morph.py | 2 +- xlib/avecl/_internal/op/remap_np_affine.py | 137 ++++++++++++++++++++- 6 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 xlib/avecl/_internal/EInterpolation.py diff --git a/xlib/avecl/__init__.py b/xlib/avecl/__init__.py index e503228..6e2d849 100644 --- a/xlib/avecl/__init__.py +++ b/xlib/avecl/__init__.py @@ -18,6 +18,7 @@ from ._internal.backend import (Device, DeviceInfo, Kernel, get_available_devices_info, get_best_device, get_default_device, get_device, set_default_device) +from ._internal.EInterpolation import EInterpolation from ._internal.HArgs import HArgs from ._internal.HKernel import HKernel from ._internal.HTensor import HTensor diff --git a/xlib/avecl/_internal/EInterpolation.py b/xlib/avecl/_internal/EInterpolation.py new file mode 100644 index 0000000..ee422c8 --- /dev/null +++ b/xlib/avecl/_internal/EInterpolation.py @@ -0,0 +1,8 @@ +from enum import IntEnum, IntEnum + +class EInterpolation(IntEnum): + NEAREST = 0 + LINEAR = 1 + CUBIC = 2 + LANCZOS3 = 3 + LANCZOS4 = 4 \ No newline at end of file diff --git a/xlib/avecl/_internal/Tensor.py b/xlib/avecl/_internal/Tensor.py index bbab164..100225c 100644 --- a/xlib/avecl/_internal/Tensor.py +++ b/xlib/avecl/_internal/Tensor.py @@ -102,6 +102,7 @@ class Tensor: def __setitem__(self, slices, value): ... def as_shape(self, shape) -> 'Tensor': ... + def cast(self, dtype) -> 'Tensor': ... def copy(self) -> 'Tensor': ... def max(self, axes=None, keepdims=False) -> 'Tensor': ... def mean(self, axes=None, keepdims=False) -> 'Tensor': ... diff --git a/xlib/avecl/_internal/TensorImpl.py b/xlib/avecl/_internal/TensorImpl.py index 41587d4..1491582 100644 --- a/xlib/avecl/_internal/TensorImpl.py +++ b/xlib/avecl/_internal/TensorImpl.py @@ -60,6 +60,7 @@ def Tensor_as_shape(self : Tensor, shape) -> Tensor: return TensorRef(self, shape) Tensor.as_shape = Tensor_as_shape +Tensor.cast = cast def Tensor_copy(self : Tensor) -> Tensor: return Tensor.from_value(self) Tensor.copy = Tensor_copy diff --git a/xlib/avecl/_internal/op/binary_morph.py b/xlib/avecl/_internal/op/binary_morph.py index 4541a56..5d6cfc1 100644 --- a/xlib/avecl/_internal/op/binary_morph.py +++ b/xlib/avecl/_internal/op/binary_morph.py @@ -41,5 +41,5 @@ def binary_morph(input_t : Tensor, erode_dilate : int, blur : float, fade_to_bor x = gaussian_blur(x, blur * 0.250, dtype=dtype) else: x = cast(x, dtype=dtype) - + return x[...,H:-H,W:-W] diff --git a/xlib/avecl/_internal/op/remap_np_affine.py b/xlib/avecl/_internal/op/remap_np_affine.py index 1d97f87..0ec0afe 100644 --- a/xlib/avecl/_internal/op/remap_np_affine.py +++ b/xlib/avecl/_internal/op/remap_np_affine.py @@ -2,11 +2,13 @@ import numpy as np from ..AShape import AShape from ..backend import Kernel +from ..EInterpolation import EInterpolation from ..HKernel import HKernel from ..SCacheton import SCacheton from ..Tensor import Tensor -def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, output_size=None, dtype=None) -> Tensor: + +def remap_np_affine (input_t : Tensor, affine_n : np.ndarray, interpolation : EInterpolation = None, inverse=False, output_size=None, dtype=None) -> Tensor: """ remap affine operator for all channels using single numpy affine mat @@ -16,12 +18,14 @@ def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, outpu affine_n np.array (2,3) + interpolation EInterpolation + dtype """ if affine_n.shape != (2,3): raise ValueError('affine_n.shape must be (2,3)') - op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, output_size, dtype) + op = SCacheton.get(_RemapAffineOp, input_t.shape, input_t.dtype, interpolation, output_size, dtype) output_t = Tensor( op.o_shape, op.o_dtype, device=input_t.get_device() ) @@ -33,7 +37,7 @@ def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, outpu D = 1.0 / D if D != 0.0 else 0.0 a, b, c, d, e, f = ( e*D, -b*D, (b*f-e*c)*D , -d*D, a*D, (d*c-a*f)*D ) - + input_t.get_device().run_kernel(op.forward_krn, output_t.get_buffer(), input_t.get_buffer(), np.float32(a), np.float32(b), np.float32(c), np.float32(d), np.float32(e), np.float32(f) ) @@ -41,11 +45,13 @@ def remap_np_affine (input_t : Tensor, affine_n : np.array, inverse=False, outpu class _RemapAffineOp(): - def __init__(self, i_shape : AShape, i_dtype, o_size, o_dtype): + def __init__(self, i_shape : AShape, i_dtype, interpolation, o_size, o_dtype): if np.dtype(i_dtype).type == np.bool_: raise ValueError('np.bool_ dtype of i_dtype is not supported.') if i_shape.ndim < 2: raise ValueError('i_shape.ndim must be >= 2 (...,H,W)') + if interpolation is None: + interpolation = EInterpolation.LINEAR IH,IW = i_shape[-2:] if o_size is not None: @@ -60,7 +66,8 @@ class _RemapAffineOp(): self.o_shape = o_shape self.o_dtype = o_dtype = o_dtype if o_dtype is not None else i_dtype - self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" + if interpolation == EInterpolation.LINEAR: + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" {HKernel.define_tensor('O', o_shape, o_dtype)} {HKernel.define_tensor('I', i_shape, i_dtype)} @@ -93,4 +100,122 @@ __kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I O_GLOBAL_STORE(gid, p00 + p01 + p10 + p11); }} -""") \ No newline at end of file +""") + elif interpolation == EInterpolation.CUBIC: + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" + +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('I', i_shape, i_dtype)} + +float cubic(float p0, float p1, float p2, float p3, float x) +{{ + float a0 = p1; + float a1 = p2 - p0; + float a2 = 2 * p0 - 5 * p1 + 4 * p2 - p3; + float a3 = 3 * (p1 - p2) + p3 - p0; + return a0 + 0.5 * x * (a1 + x * (a2 + x * a3)); +}} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, + float a, float b, float c, + float d, float e, float f) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)} + + float cx01f = om1*a + om2*b + c; + float cy01f = om1*d + om2*e + f; + + float cxf = floor(cx01f); int cx = (int)cxf; + float cyf = floor(cy01f); int cy = (int)cyf; + + float dx = cx01f-cxf; + float dy = cy01f-cyf; + + float row[4]; + + #pragma unroll + for (int y=cy-1, j=0; y<=cy+2; y++, j++) + {{ + float col[4]; + #pragma unroll + for (int x=cx-1, i=0; x<=cx+2; x++, i++) + {{ + float sxy = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='y,x')})); + + col[i] = sxy*(y >= 0 & y < Im2 & x >= 0 & x < Im1); + }} + row[j] = cubic(col[0], col[1], col[2], col[3], dx); + }} + + float O = cubic(row[0], row[1], row[2], row[3], dy); + + O_GLOBAL_STORE(gid, O); +}} +""") + elif interpolation in [EInterpolation.LANCZOS3, EInterpolation.LANCZOS4]: + RAD = 3 if interpolation == EInterpolation.LANCZOS3 else 4 + self.forward_krn = Kernel(global_shape=(o_shape.size,), kernel_text=f""" + +{HKernel.define_tensor('O', o_shape, o_dtype)} +{HKernel.define_tensor('I', i_shape, i_dtype)} + +__kernel void impl(__global O_PTR_TYPE* O_PTR_NAME, __global const I_PTR_TYPE* I_PTR_NAME, + float a, float b, float c, + float d, float e, float f) +{{ + size_t gid = get_global_id(0); + + {HKernel.decompose_idx_to_axes_idxs('gid', 'O', o_shape.ndim)} + + float cx01f = om1*a + om2*b + c; + float cy01f = om1*d + om2*e + f; + + float cxf = floor(cx01f); int cx = (int)cxf; + float cyf = floor(cy01f); int cy = (int)cyf; + + #define RAD {RAD} + float Fy[2 * RAD]; + float Fx[2 * RAD]; + + #pragma unroll + for (int y=cy-RAD+1, j=0; y<=cy+RAD; y++, j++) + {{ + float dy = fabs(cy01f - y); + if (dy < 1e-4) Fy[j] = 1; + else if (dy > RAD) Fy[j] = 0; + else Fy[j] = ( RAD * sin(M_PI * dy) * sin(M_PI * dy / RAD) ) / ( (M_PI*M_PI)*dy*dy ); + }} + + #pragma unroll + for (int x=cx-RAD+1, i=0; x<=cx+RAD; x++, i++) + {{ + float dx = fabs(cx01f - x); + if (dx < 1e-4) Fx[i] = 1; + else if (dx > RAD) Fx[i] = 0; + else Fx[i] = ( RAD * sin(M_PI * dx) * sin(M_PI * dx / RAD) ) / ( (M_PI*M_PI)*dx*dx ); + }} + + float FxFysum = 0; + float O = 0; + + #pragma unroll + for (int y=cy-RAD+1, j=0; y<=cy+RAD; y++, j++) + #pragma unroll + for (int x=cx-RAD+1, i=0; x<=cx+RAD; x++, i++) + {{ + float sxy = I_GLOBAL_LOAD(I_IDX_MOD({HKernel.axes_seq_enum('O', o_shape.ndim-2, suffix='y,x')})); + + float Fxyv = Fx[i]*Fy[j]; + FxFysum += Fxyv; + + O += sxy*Fxyv*(y >= 0 & y < Im2 & x >= 0 & x < Im1); + }} + O = O / FxFysum; + + O_GLOBAL_STORE(gid, O); +}} +""") + else: + raise ValueError(f'Unsupported interpolation type {interpolation}') \ No newline at end of file