diff --git a/__dev/port.py b/__dev/port.py new file mode 100644 index 0000000..8d6bc06 --- /dev/null +++ b/__dev/port.py @@ -0,0 +1,344 @@ +#import FaceLandmarksExtractor + + +import numpy as np +import dlib +import torch +import keras +from keras import backend as K +from keras import layers as KL +import math +import os +import time +import code + +class TorchBatchNorm2D(keras.engine.topology.Layer): + def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, **kwargs): + super(TorchBatchNorm2D, self).__init__(**kwargs) + self.supports_masking = True + self.axis = axis + self.momentum = momentum + self.epsilon = epsilon + + def build(self, input_shape): + dim = input_shape[self.axis] + if dim is None: + raise ValueError('Axis ' + str(self.axis) + ' of ' + 'input tensor should have a defined dimension ' + 'but the layer received an input with shape ' + + str(input_shape) + '.') + shape = (dim,) + self.gamma = self.add_weight(shape=shape, name='gamma', initializer='ones', regularizer=None, constraint=None) + self.beta = self.add_weight(shape=shape, name='beta', initializer='zeros', regularizer=None, constraint=None) + self.moving_mean = self.add_weight(shape=shape, name='moving_mean', initializer='zeros', trainable=False) + self.moving_variance = self.add_weight(shape=shape, name='moving_variance', initializer='ones', trainable=False) + self.built = True + + def call(self, inputs, training=None): + input_shape = K.int_shape(inputs) + + broadcast_shape = [1] * len(input_shape) + broadcast_shape[self.axis] = input_shape[self.axis] + + broadcast_moving_mean = K.reshape(self.moving_mean, broadcast_shape) + broadcast_moving_variance = K.reshape(self.moving_variance, broadcast_shape) + broadcast_gamma = K.reshape(self.gamma, broadcast_shape) + broadcast_beta = K.reshape(self.beta, broadcast_shape) + invstd = K.ones (shape=broadcast_shape, dtype='float32') / K.sqrt(broadcast_moving_variance + K.constant(self.epsilon, dtype='float32')) + + return (inputs - broadcast_moving_mean) * invstd * broadcast_gamma + broadcast_beta + + def get_config(self): + config = { 'axis': self.axis, 'momentum': self.momentum, 'epsilon': self.epsilon } + base_config = super(TorchBatchNorm2D, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +def t2kw_conv2d (src): + if src.bias is not None: + return [ np.moveaxis(src.weight.data.cpu().numpy(), [0,1,2,3], [3,2,0,1]), src.bias.data.cpu().numpy() ] + else: + return [ np.moveaxis(src.weight.data.cpu().numpy(), [0,1,2,3], [3,2,0,1])] + + +def t2kw_bn2d(src): + return [ src.weight.data.cpu().numpy(), src.bias.data.cpu().numpy(), src.running_mean.cpu().numpy(), src.running_var.cpu().numpy() ] + + + +import face_alignment +fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D,enable_cuda=False,enable_cudnn=False,use_cnn_face_detector=True).face_alignemnt_net +fa.eval() + + +def KerasConvBlock(in_planes, out_planes, input, srctorch): + out1 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.bn1) )(input) + out1 = KL.Activation( keras.backend.relu ) (out1) + out1 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(out1) + out1 = KL.convolutional.Conv2D( int(out_planes/2), kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.conv1) ) (out1) + + out2 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.bn2) )(out1) + out2 = KL.Activation( keras.backend.relu ) (out2) + out2 = 
KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(out2) + out2 = KL.convolutional.Conv2D( int(out_planes/4), kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.conv2) ) (out2) + + out3 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.bn3) )(out2) + out3 = KL.Activation( keras.backend.relu ) (out3) + out3 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(out3) + out3 = KL.convolutional.Conv2D( int(out_planes/4), kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.conv3) ) (out3) + + out3 = KL.Concatenate(axis=1)([out1, out2, out3]) + + if in_planes != out_planes: + downsample = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.downsample[0]) )(input) + downsample = KL.Activation( keras.backend.relu ) (downsample) + downsample = KL.convolutional.Conv2D( out_planes, kernel_size=1, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.downsample[2]) ) (downsample) + out3 = KL.add ( [out3, downsample] ) + else: + out3 = KL.add ( [out3, input] ) + + + return out3 + +def KerasHourGlass (depth, input, srctorch): + + up1 = KerasConvBlock(256, 256, input, srctorch._modules['b1_%d' % (depth)]) + + low1 = KL.AveragePooling2D (pool_size=2, strides=2, data_format='channels_first', padding='valid' )(input) + low1 = KerasConvBlock (256, 256, low1, srctorch._modules['b2_%d' % (depth)]) + + if depth > 1: + low2 = KerasHourGlass (depth-1, low1, srctorch) + else: + low2 = KerasConvBlock(256, 256, low1, srctorch._modules['b2_plus_%d' % (depth)]) + + low3 = KerasConvBlock(256, 256, low2, srctorch._modules['b3_%d' % (depth)]) + + up2 = KL.UpSampling2D(size=2, data_format='channels_first') (low3) + return KL.add ( [up1, up2] ) + +model_path = os.path.join( os.path.dirname(__file__) , "2DFAN-4.h5" ) +if os.path.exists (model_path): + t = time.time() + model = keras.models.load_model (model_path, custom_objects={'TorchBatchNorm2D': TorchBatchNorm2D} ) + print ('load takes = %f' %( time.time() - t ) ) +else: + _input = keras.layers.Input ( shape=(3, 256,256) ) + x = KL.ZeroPadding2D(padding=(3, 3), data_format='channels_first')(_input) + x = KL.convolutional.Conv2D( 64, kernel_size=7, strides=2, data_format='channels_first', padding='valid', weights=t2kw_conv2d(fa.conv1) ) (x) + + x = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(fa.bn1) )(x) + x = KL.Activation( keras.backend.relu ) (x) + + x = KerasConvBlock (64, 128, x, fa.conv2) + x = KL.AveragePooling2D (pool_size=2, strides=2, data_format='channels_first', padding='valid' ) (x) + x = KerasConvBlock (128, 128, x, fa.conv3) + x = KerasConvBlock (128, 256, x, fa.conv4) + + outputs = [] + previous = x + for i in range(4): + ll = KerasHourGlass (4, previous, fa._modules['m%d' % (i) ]) + ll = KerasConvBlock (256,256, ll, fa._modules['top_m_%d' % (i)]) + + ll = KL.convolutional.Conv2D(256, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['conv_last%d' % (i)] ) ) (ll) + ll = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d( fa._modules['bn_end%d' % (i)] ) )(ll) + ll = KL.Activation( keras.backend.relu ) (ll) + + tmp_out = KL.convolutional.Conv2D(68, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['l%d' % (i)] ) ) (ll) + 
outputs.append(tmp_out) + if i < 4 - 1: + ll = KL.convolutional.Conv2D(256, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['bl%d' % (i)] ) ) (ll) + previous = KL.add ( [previous, ll, KL.convolutional.Conv2D(256, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['al%d' % (i)] ) ) (tmp_out) ] ) + + model = keras.models.Model (_input, outputs) + model.compile ( loss='mse', optimizer='adam' ) + model.save (model_path) + model.save_weights ( os.path.join( os.path.dirname(__file__) , 'weights.h5') ) + +def transform(point, center, scale, resolution, invert=False): + _pt = torch.ones(3) + _pt[0] = point[0] + _pt[1] = point[1] + + h = 200.0 * scale + t = torch.eye(3) + t[0, 0] = resolution / h + t[1, 1] = resolution / h + t[0, 2] = resolution * (-center[0] / h + 0.5) + t[1, 2] = resolution * (-center[1] / h + 0.5) + + if invert: + t = torch.inverse(t) + + new_point = (torch.matmul(t, _pt))[0:2] + + return new_point.int() + +def get_preds_fromhm(hm, center=None, scale=None): + max, idx = torch.max( hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2) + idx += 1 + preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float() + preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1) + preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1) + + for i in range(preds.size(0)): + for j in range(preds.size(1)): + hm_ = hm[i, j, :] + pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1 + if pX > 0 and pX < 63 and pY > 0 and pY < 63: + diff = torch.FloatTensor( + [hm_[pY, pX + 1] - hm_[pY, pX - 1], + hm_[pY + 1, pX] - hm_[pY - 1, pX]]) + preds[i, j].add_(diff.sign_().mul_(.25)) + + preds.add_(-.5) + + preds_orig = torch.zeros(preds.size()) + if center is not None and scale is not None: + for i in range(hm.size(0)): + for j in range(hm.size(1)): + preds_orig[i, j] = transform( + preds[i, j], center, scale, hm.size(2), True) + + return preds, preds_orig + + +def get_preds_fromhm2(a, center=None, scale=None): + b = a.reshape ( (a.shape[0], a.shape[1]*a.shape[2]) ) + c = b.argmax(1).reshape ( (a.shape[0], 1) ).repeat(2, axis=1).astype(np.float) + c[:,0] %= a.shape[2] + c[:,1] = np.apply_along_axis ( lambda x: np.floor(x / a.shape[2]), 0, c[:,1] ) + + for i in range(a.shape[0]): + pX, pY = int(c[i,0]), int(c[i,1]) + if pX > 0 and pX < 63 and pY > 0 and pY < 63: + diff = np.array ( [a[i,pY,pX+1]-a[i,pY,pX-1], a[i,pY+1,pX]-a[i,pY-1,pX]] ) + c[i] += np.sign(diff)*0.25 + + c += 0.5 + result = np.empty ( (a.shape[0],2), dtype=np.int ) + if center is not None and scale is not None: + for i in range(a.shape[0]): + pt = np.array ( [c[i][0], c[i][1], 1.0] ) + h = 200.0 * scale + m = np.eye(3) + m[0,0] = a.shape[2] / h + m[1,1] = a.shape[2] / h + m[0,2] = a.shape[2] * ( -center[0] / h + 0.5 ) + m[1,2] = a.shape[2] * ( -center[1] / h + 0.5 ) + m = np.linalg.inv(m) + result[i] = np.matmul (m, pt)[0:2].astype( np.int ) + return result + + + +rnd_data = np.random.rand (3, 256,256).astype(np.float32) +#rnd_data = np.random.random_integers (2, size=(3, 256,256)).astype(np.float32) +#rnd_data = np.array ( [[[1]*256]*256]*3 , dtype=np.float32 ) +input_data = np.array ([rnd_data]) + +fa_out_tensor = fa( torch.autograd.Variable( torch.from_numpy(input_data), volatile=True) )[-1].data.cpu() +fa_out = fa_out_tensor.numpy() + +t = time.time() +m_out = model.predict ( input_data )[-1] +print ('predict takes = %f' %( time.time() - t ) ) +t = time.time() + +#fa_base_out = 
fa_base(torch.autograd.Variable( torch.from_numpy(input_data), volatile=True))[0].data.cpu().numpy() + +print ( 'shapes = %s , %s , equal == %s ' % (fa_out.shape, m_out.shape, (fa_out.shape == m_out.shape) ) ) +print ( 'allclose == %s' % ( np.allclose(fa_out, m_out) ) ) +print ( 'total abs diff outputs = %f' % ( np.sum ( np.abs(np.ndarray.flatten(fa_out-m_out))) )) + +### +d = dlib.rectangle(156,364,424,765) + +center = torch.FloatTensor( + [d.right() - (d.right() - d.left()) / 2.0, d.bottom() - + (d.bottom() - d.top()) / 2.0]) +center[1] = center[1] - (d.bottom() - d.top()) * 0.12 +scale = (d.right() - d.left() + d.bottom() - d.top()) / 195.0 +pts, pts_img = get_preds_fromhm (fa_out_tensor, center, scale) +pts_img = pts_img.view(68, 2).numpy() + +### + +m_pts_img = get_preds_fromhm2 (m_out[0], center, scale) + +print ('pts1 == pts2 == %s' % ( np.array_equal(pts_img, m_pts_img) ) ) + +code.interact(local=dict(globals(), **locals())) + +#print ( np.array_equal (fa_out, m_out) ) #>>> False +#code.interact(local=dict(globals(), **locals())) + +#code.interact(local=locals()) + +#code.interact(local=locals()) + +### +#fa.conv1.weight = torch.nn.Parameter( torch.from_numpy ( np.array( [[[[1.0]*7]*7]*3]*64, dtype=np.float32) ) ) +#fa.conv1.bias = torch.nn.Parameter( torch.from_numpy ( np.array( [1.0]*64, dtype=np.float32 ) ) ) +#model.layers[2].set_weights( [ np.array( [[[[1.0]*64]*3]*7]*7, dtype=np.float32), np.array( [1.0]*64, dtype=np.float32 ) ] ) + +#b = np.array( [1.0]*64, dtype=np.float32 ) +#b = np.random.rand (64).astype(np.float32) +#w = np.array( [[[[1.0]*7]*7]*3]*64, dtype=np.float32) +#w = np.random.rand (64, 3, 7, 7).astype(np.float32) +#s = w #fa_base.conv1.weight.data.cpu().numpy() #64x3x7x7 +#d = np.moveaxis(s, [0,1,2,3], [3,2,0,1] ) + + +#fa.conv1.weight = torch.nn.Parameter( torch.from_numpy ( w ) ) +#fa.conv1.bias = torch.nn.Parameter( torch.from_numpy ( b ) ) +#model.layers[2].set_weights( [np.transpose(w), b] ) +#model.layers[2].set_weights( [d, b] ) +''' +for i in range(0,64): + for j in range(0,128): + b = np.array_equal (fa_out[i,j], m_out[i,j]) + if b == False: + print ( '%d %d == False' %(i,j) ) #>>> False +''' + + +''' +input = -2.7966828 +gamma = 0.7640695571899414 +beta = 0.22801123559474945 +moving_mean = 0.12693816423416138 +moving_variance = 0.10409101098775864 +epsilon = 0.0 #0.00001 + +print ( gamma * (input - moving_mean) / math.sqrt(moving_variance + epsilon) + beta ) +print ( (input - moving_mean) * (1.0 / math.sqrt(moving_variance) + epsilon)*gamma + beta ) +''' +#code.interact(local=dict(globals(), **locals())) +''' +conv_64_128 = x +conv_64_128 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(fa.conv2.bn1) )(conv_64_128) +conv_64_128 = KL.Activation( keras.backend.relu ) (conv_64_128) +conv_64_128 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(conv_64_128) +conv_64_128 = KL.convolutional.Conv2D( 64, kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(fa.conv2.conv1) ) (conv_64_128) +conv_64_128 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(fa.conv2.bn2) )(conv_64_128) +conv_64_128 = KL.Activation( keras.backend.relu ) (conv_64_128) +''' +# +# +#keras result = gamma * (input - moving_mean) / sqrt(moving_variance + epsilon) + beta +# +# (input - mean / scale_factor) / sqrt(var / scale_factor + eps) +# +#input = -3.0322433 +# +#gamma = 0.1859646 +#beta = -0.17041835 +#moving_mean = -3.0345056 +#moving_variance = 8.773307 +#epsilon 
= 0.00001 +# +#result = - 0.17027631 +# +# fa result = 1.930317 \ No newline at end of file diff --git a/__dev/test.py b/__dev/test.py new file mode 100644 index 0000000..2bd2266 --- /dev/null +++ b/__dev/test.py @@ -0,0 +1,1282 @@ +import os +os.environ['force_plaidML'] = '1' + +import sys +import argparse +from utils import Path_utils +from utils import os_utils +from facelib import LandmarksProcessor +from pathlib import Path +import numpy as np +import cv2 +import time +import multiprocessing +import traceback +from tqdm import tqdm +from utils.DFLPNG import DFLPNG +from utils.DFLJPG import DFLJPG +from utils.cv2_utils import * +from utils import image_utils +import shutil + + + +def umeyama(src, dst, estimate_scale): + """Estimate N-D similarity transformation with or without scaling. + Parameters + ---------- + src : (M, N) array + Source coordinates. + dst : (M, N) array + Destination coordinates. + estimate_scale : bool + Whether to estimate scaling factor. + Returns + ------- + T : (N + 1, N + 1) + The homogeneous similarity transformation matrix. The matrix contains + NaN values only if the problem is not well-conditioned. + References + ---------- + .. [1] "Least-squares estimation of transformation parameters between two + point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573 + """ + + num = src.shape[0] + dim = src.shape[1] + + # Compute mean of src and dst. + src_mean = src.mean(axis=0) + dst_mean = dst.mean(axis=0) + + # Subtract mean from src and dst. + src_demean = src - src_mean + dst_demean = dst - dst_mean + + # Eq. (38). + A = np.dot(dst_demean.T, src_demean) / num + + # Eq. (39). + d = np.ones((dim,), dtype=np.double) + if np.linalg.det(A) < 0: + d[dim - 1] = -1 + + T = np.eye(dim + 1, dtype=np.double) + + U, S, V = np.linalg.svd(A) + + # Eq. (40) and (43). + rank = np.linalg.matrix_rank(A) + if rank == 0: + return np.nan * T + elif rank == dim - 1: + if np.linalg.det(U) * np.linalg.det(V) > 0: + T[:dim, :dim] = np.dot(U, V) + else: + s = d[dim - 1] + d[dim - 1] = -1 + T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V)) + d[dim - 1] = s + else: + T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V.T)) + + if estimate_scale: + # Eq. (41) and (42). 
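+        # (editor note) Eq. (41)/(42) reduce to scale = trace(diag(d) @ S) divided by the
+        # total source variance; np.dot(S, d) below is exactly that d-signed sum of the
+        # singular values of A.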
+ scale = 1.0 / src_demean.var(axis=0).sum() * np.dot(S, d) + else: + scale = 1.0 + + T[:dim, dim] = dst_mean - scale * np.dot(T[:dim, :dim], src_mean.T) + T[:dim, :dim] *= scale + + return T + +def random_transform(image, rotation_range=10, zoom_range=0.5, shift_range=0.05, random_flip=0): + h, w = image.shape[0:2] + rotation = np.random.uniform(-rotation_range, rotation_range) + scale = np.random.uniform(1 - zoom_range, 1 + zoom_range) + tx = np.random.uniform(-shift_range, shift_range) * w + ty = np.random.uniform(-shift_range, shift_range) * h + mat = cv2.getRotationMatrix2D((w // 2, h // 2), rotation, scale) + mat[:, 2] += (tx, ty) + result = cv2.warpAffine( + image, mat, (w, h), borderMode=cv2.BORDER_REPLICATE) + if np.random.random() < random_flip: + result = result[:, ::-1] + return result + +# get pair of random warped images from aligned face image +def random_warp(image, coverage=160, scale = 5, zoom = 1): + assert image.shape == (256, 256, 3) + range_ = np.linspace(128 - coverage//2, 128 + coverage//2, 5) + mapx = np.broadcast_to(range_, (5, 5)) + mapy = mapx.T + + mapx = mapx + np.random.normal(size=(5,5), scale=scale) + mapy = mapy + np.random.normal(size=(5,5), scale=scale) + + interp_mapx = cv2.resize(mapx, (80*zoom,80*zoom))[8*zoom:72*zoom,8*zoom:72*zoom].astype('float32') + interp_mapy = cv2.resize(mapy, (80*zoom,80*zoom))[8*zoom:72*zoom,8*zoom:72*zoom].astype('float32') + + warped_image = cv2.remap(image, interp_mapx, interp_mapy, cv2.INTER_LINEAR) + + src_points = np.stack([mapx.ravel(), mapy.ravel() ], axis=-1) + dst_points = np.mgrid[0:65*zoom:16*zoom,0:65*zoom:16*zoom].T.reshape(-1,2) + mat = umeyama(src_points, dst_points, True)[0:2] + + target_image = cv2.warpAffine(image, mat, (64*zoom,64*zoom)) + + return warped_image, target_image + +def input_process(stdin_fd, sq, str): + sys.stdin = os.fdopen(stdin_fd) + try: + inp = input (str) + sq.put (True) + except: + sq.put (False) + +def input_in_time (str, max_time_sec): + sq = multiprocessing.Queue() + p = multiprocessing.Process(target=input_process, args=( sys.stdin.fileno(), sq, str)) + p.start() + t = time.time() + inp = False + while True: + if not sq.empty(): + inp = sq.get() + break + if time.time() - t > max_time_sec: + break + p.terminate() + sys.stdin = os.fdopen( sys.stdin.fileno() ) + return inp + + + +def subprocess(sq,cq): + prefetch = 2 + while True: + while prefetch > -1: + cq.put ( np.array([1]) ) #memory leak numpy==1.16.0 , but all fine in 1.15.4 + #cq.put ( [1] ) #no memory leak + prefetch -= 1 + + sq.get() #waiting msg from serv to continue posting + prefetch += 1 + + + +def get_image_hull_mask (image_shape, image_landmarks): + if len(image_landmarks) != 68: + raise Exception('get_image_hull_mask works only with 68 landmarks') + + hull_mask = np.zeros(image_shape[0:2]+(1,),dtype=np.float32) + + cv2.fillConvexPoly( hull_mask, cv2.convexHull( np.concatenate ( (image_landmarks[0:17], image_landmarks[48:], [image_landmarks[0]], [image_landmarks[8]], [image_landmarks[16]])) ), (1,) ) + cv2.fillConvexPoly( hull_mask, cv2.convexHull( np.concatenate ( (image_landmarks[27:31], [image_landmarks[33]]) ) ), (1,) ) + cv2.fillConvexPoly( hull_mask, cv2.convexHull( np.concatenate ( (image_landmarks[17:27], [image_landmarks[0]], [image_landmarks[27]], [image_landmarks[16]], [image_landmarks[33]])) ), (1,) ) + + return hull_mask + + +def umeyama(src, dst, estimate_scale): + """Estimate N-D similarity transformation with or without scaling. + Parameters + ---------- + src : (M, N) array + Source coordinates. 
+ dst : (M, N) array + Destination coordinates. + estimate_scale : bool + Whether to estimate scaling factor. + Returns + ------- + T : (N + 1, N + 1) + The homogeneous similarity transformation matrix. The matrix contains + NaN values only if the problem is not well-conditioned. + References + ---------- + .. [1] "Least-squares estimation of transformation parameters between two + point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573 + """ + + num = src.shape[0] + dim = src.shape[1] + + # Compute mean of src and dst. + src_mean = src.mean(axis=0) + dst_mean = dst.mean(axis=0) + + # Subtract mean from src and dst. + src_demean = src - src_mean + dst_demean = dst - dst_mean + + # Eq. (38). + A = np.dot(dst_demean.T, src_demean) / num + + # Eq. (39). + d = np.ones((dim,), dtype=np.double) + if np.linalg.det(A) < 0: + d[dim - 1] = -1 + + T = np.eye(dim + 1, dtype=np.double) + + U, S, V = np.linalg.svd(A) + + # Eq. (40) and (43). + rank = np.linalg.matrix_rank(A) + if rank == 0: + return np.nan * T + elif rank == dim - 1: + if np.linalg.det(U) * np.linalg.det(V) > 0: + T[:dim, :dim] = np.dot(U, V) + else: + s = d[dim - 1] + d[dim - 1] = -1 + T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V)) + d[dim - 1] = s + else: + T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V.T)) + + if estimate_scale: + # Eq. (41) and (42). + scale = 1.0 / src_demean.var(axis=0).sum() * np.dot(S, d) + else: + scale = 1.0 + + T[:dim, dim] = dst_mean - scale * np.dot(T[:dim, :dim], src_mean.T) + T[:dim, :dim] *= scale + + return T + +mean_face_x = np.array([ +0.000213256, 0.0752622, 0.18113, 0.29077, 0.393397, 0.586856, 0.689483, 0.799124, +0.904991, 0.98004, 0.490127, 0.490127, 0.490127, 0.490127, 0.36688, 0.426036, +0.490127, 0.554217, 0.613373, 0.121737, 0.187122, 0.265825, 0.334606, 0.260918, +0.182743, 0.645647, 0.714428, 0.793132, 0.858516, 0.79751, 0.719335, 0.254149, +0.340985, 0.428858, 0.490127, 0.551395, 0.639268, 0.726104, 0.642159, 0.556721, +0.490127, 0.423532, 0.338094, 0.290379, 0.428096, 0.490127, 0.552157, 0.689874, +0.553364, 0.490127, 0.42689 ]) + +mean_face_y = np.array([ +0.106454, 0.038915, 0.0187482, 0.0344891, 0.0773906, 0.0773906, 0.0344891, +0.0187482, 0.038915, 0.106454, 0.203352, 0.307009, 0.409805, 0.515625, 0.587326, +0.609345, 0.628106, 0.609345, 0.587326, 0.216423, 0.178758, 0.179852, 0.231733, +0.245099, 0.244077, 0.231733, 0.179852, 0.178758, 0.216423, 0.244077, 0.245099, +0.780233, 0.745405, 0.727388, 0.742578, 0.727388, 0.745405, 0.780233, 0.864805, +0.902192, 0.909281, 0.902192, 0.864805, 0.784792, 0.778746, 0.785343, 0.778746, +0.784792, 0.824182, 0.831803, 0.824182 ]) + +landmarks_2D = np.stack( [ mean_face_x, mean_face_y ], axis=1 ) + +def get_transform_mat (image_landmarks, output_size, scale=1.0): + if not isinstance(image_landmarks, np.ndarray): + image_landmarks = np.array (image_landmarks) + + padding = (output_size / 64) * 12 + + mat = umeyama(image_landmarks[17:], landmarks_2D, True)[0:2] + mat = mat * (output_size - 2 * padding) + mat[:,2] += padding + mat *= (1 / scale) + mat[:,2] += -output_size*( ( (1 / scale) - 1.0 ) / 2 ) + + return mat + +#alignments = [] +# +#aligned_path_image_paths = Path_utils.get_image_paths("D:\\DeepFaceLab\\workspace issue\\data_dst\\aligned") +#for filepath in tqdm(aligned_path_image_paths, desc="Collecting alignments", ascii=True ): +# filepath = Path(filepath) +# +# if filepath.suffix == '.png': +# dflimg = DFLPNG.load( str(filepath), print_on_no_embedded_data=True ) +# elif filepath.suffix == '.jpg': +# dflimg = DFLJPG.load ( 
str(filepath), print_on_no_embedded_data=True ) +# else: +# print ("%s is not a dfl image file" % (filepath.name) ) +# +# #source_filename_stem = Path( dflimg.get_source_filename() ).stem +# #if source_filename_stem not in alignments.keys(): +# # alignments[ source_filename_stem ] = [] +# +# #alignments[ source_filename_stem ].append (dflimg.get_source_landmarks()) +# alignments.append (dflimg.get_source_landmarks()) +import mathlib +def main(): + from nnlib import nnlib + exec( nnlib.import_all(), locals(), globals() ) + PMLTile = nnlib.PMLTile + PMLK = nnlib.PMLK + + image = cv2.imread('D:\\DeepFaceLab\\test\\00000.png').astype(np.float32) / 255.0 + image = cv2.resize ( image, (128,128) ) + + image = cv2.cvtColor (image, cv2.COLOR_BGR2GRAY) + image = np.expand_dims (image, -1) + image = np.expand_dims (image, 0) + image_shape = image.shape + + t = K.placeholder ( image_shape ) #K.constant ( np.ones ( (10,) ) ) + import code + code.interact(local=dict(globals(), **locals())) + + ''' + >>> t[:,0:64,64::2,:].source.op.code +function (I[N0, N1, N2, N3]) -> (O) { + +O[i0, i1, i2, i3: (1 + 1 - 1)/1, (64 + 1 - 1)/1, (64 + 2 - 1)/2, (1 + 1 - 1)/1] = + =(I[1*i0+0, 1*i1+0, 2*i2+64, 1*i3+0]); + + + Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size, + int64 dilation_rate, int64 stride, + Padding padding_type, int64* output_size, + int64* padding_before, + int64* padding_after) { + if (stride <= 0) { + return errors::InvalidArgument("Stride must be > 0, but got ", stride); + } + if (dilation_rate < 1) { + return errors::InvalidArgument("Dilation rate must be >= 1, but got ", + dilation_rate); + } + + // See also the parallel implementation in GetWindowedOutputSizeFromDimsV2. + int64 effective_filter_size = (filter_size - 1) * dilation_rate + 1; + switch (padding_type) { + case Padding::VALID: + *output_size = (input_size - effective_filter_size + stride) / stride; + *padding_before = *padding_after = 0; + break; + case Padding::EXPLICIT: + *output_size = (input_size + *padding_before + *padding_after - + effective_filter_size + stride) / + stride; + break; + case Padding::SAME: + *output_size = (input_size + stride - 1) / stride; + const int64 padding_needed = + std::max(int64{0}, (*output_size - 1) * stride + + effective_filter_size - input_size); + // For odd values of total padding, add more padding at the 'right' + // side of the given dimension. 
+ *padding_before = padding_needed / 2; + *padding_after = padding_needed - *padding_before; + break; + } + if (*output_size < 0) { + return errors::InvalidArgument( + "Computed output size would be negative: ", *output_size, + " [input_size: ", input_size, + ", effective_filter_size: ", effective_filter_size, + ", stride: ", stride, "]"); + } + return Status::OK(); + } + ''' + class ExtractImagePatchesOP(PMLTile.Operation): + def __init__(self, input, ksizes, strides, rates, padding='valid'): + + batch, in_rows, in_cols, depth = input.shape.dims + + ksize_rows = ksizes[1]; + ksize_cols = ksizes[2]; + + stride_rows = strides[1]; + stride_cols = strides[2]; + + rate_rows = rates[1]; + rate_cols = rates[2]; + + ksize_rows_eff = ksize_rows + (ksize_rows - 1) * (rate_rows - 1); + ksize_cols_eff = ksize_cols + (ksize_cols - 1) * (rate_cols - 1); + + #if padding == 'valid': + + out_rows = (in_rows - ksize_rows_eff + stride_rows) / stride_rows; + out_cols = (in_cols - ksize_cols_eff + stride_cols) / stride_cols; + + out_sizes = (batch, out_rows, out_cols, ksize_rows * ksize_cols * depth); + + + + B, H, W, CI = input.shape.dims + + RATE = PMLK.constant ([1,rate,rate,1], dtype=PMLK.floatx() ) + + #print (target_dims) + code = """function (I[B, {H}, {W}, {CI} ], RATES[RB, RH, RW, RC] ) -> (O) { + + O[b, {wnd_size}, {wnd_size}, ] = =(I[b, h, w, ci]); + + }""".format(H=H, W=W, CI=CI, RATES=rates, wnd_size=wnd_size) + + super(ExtractImagePatchesOP, self).__init__(code, [('I', input) ], + [('O', PMLTile.Shape(input.shape.dtype, out_sizes ) )]) + + + + + f = ExtractImagePatchesOP.function(t, [1,65,65,1], [1,1,1,1], [1,1,1,1]) + + x, = K.function ([t],[f]) ([ image ]) + print(x.shape) + + import code + code.interact(local=dict(globals(), **locals())) + + + from nnlib import nnlib + exec( nnlib.import_all(), locals(), globals() ) + + #ch = 3 + #def softmax(x, axis=-1): #from K numpy backend + # y = np.exp(x - np.max(x, axis, keepdims=True)) + # return y / np.sum(y, axis, keepdims=True) + # + #def gauss_kernel(size, sigma): + # coords = np.arange(0,size, dtype=K.floatx() ) + # coords -= (size - 1 ) / 2.0 + # g = coords**2 + # g *= ( -0.5 / (sigma**2) ) + # g = np.reshape (g, (1,-1)) + np.reshape(g, (-1,1) ) + # g = np.reshape (g, (1,-1)) + # g = softmax(g) + # g = np.reshape (g, (size, size, 1, 1)) + # g = np.tile (g, (1,1,ch, size*size*ch)) + # return K.constant(g, dtype=K.floatx() ) + # + ##kernel = gauss_kernel(11,1.5) + #kernel = K.constant( np.ones ( (246,246, 3, 1) ) , dtype=K.floatx() ) + ##g = np.eye(9).reshape((3, 3, 1, 9)) + ##g = np.tile (g, (1,1,3,1)) + ##kernel = K.constant(g , dtype=K.floatx() ) + # + #def reducer(x): + # shape = K.shape(x) + # x = K.reshape(x, (-1, shape[-3] , shape[-2], shape[-1]) ) + # + # y = K.depthwise_conv2d(x, kernel, strides=(1, 1), padding='valid') + # + # y_shape = K.shape(y) + # return y#K.reshape(y, (shape[0], y_shape[1], y_shape[2], y_shape[3] ) ) + + image = cv2.imread('D:\\DeepFaceLab\\test\\00000.png').astype(np.float32) / 255.0 + image = cv2.resize ( image, (128,128) ) + + image = cv2.cvtColor (image, cv2.COLOR_BGR2GRAY) + image = np.expand_dims (image, -1) + image_shape = image.shape + + image2 = cv2.imread('D:\\DeepFaceLab\\test\\00001.png').astype(np.float32) / 255.0 + #image2 = cv2.cvtColor (image2, cv2.COLOR_BGR2GRAY) + #image2 = np.expand_dims (image2, -1) + image2_shape = image2.shape + + image_tensor = K.placeholder(shape=[ 1, image_shape[0], image_shape[1], image_shape[2] ], dtype="float32" ) + image2_tensor = K.placeholder(shape=[ 1, image_shape[0], 
image_shape[1], image_shape[2] ], dtype="float32" ) + + #loss = reducer(image_tensor) + #loss = K.reshape (loss, (-1,246,246, 11,11,3) ) + tf = nnlib.tf + + sh = K.int_shape(image_tensor)[1] + wnd_size = 16 + step_size = 8 + k = (sh-wnd_size) // step_size + 1 + + loss = tf.image.extract_image_patches(image_tensor, [1,k,k,1], [1,1,1,1], [1,step_size,step_size,1], 'VALID') + print(loss) + + f = K.function ( [image_tensor], [loss] ) + x = f ( [ np.expand_dims(image,0) ] )[0][0] + + import code + code.interact(local=dict(globals(), **locals())) + + for i in range( x.shape[2] ): + img = x[:,:,i:i+1] + + cv2.imshow('', (img*255).astype(np.uint8) ) + cv2.waitKey(0) + + #for i in range( len(x) ): + # for j in range ( len(x) ): + # img = x[i,j] + # import code + # code.interact(local=dict(globals(), **locals())) + # + # cv2.imshow('', (x[i,j]*255).astype(np.uint8) ) + # cv2.waitKey(0) + + import code + code.interact(local=dict(globals(), **locals())) + + + from nnlib import nnlib + exec( nnlib.import_all(), locals(), globals() ) + + PNet_Input = Input ( (None, None,3) ) + x = PNet_Input + x = Conv2D (10, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="PReLU1" )(x) + x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x) + x = Conv2D (16, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="PReLU2" )(x) + x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="PReLU3" )(x) + prob = Conv2D (2, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv41")(x) + prob = Softmax()(prob) + x = Conv2D (4, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv42")(x) + + PNet_model = Model(PNet_Input, [x,prob] ) + PNet_model.load_weights ( (Path(mtcnn.__file__).parent / 'mtcnn_pnet.h5').__str__() ) + + RNet_Input = Input ( (24, 24, 3) ) + x = RNet_Input + x = Conv2D (28, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="prelu1" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x) + x = Conv2D (48, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="prelu2" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x) + x = Conv2D (64, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="prelu3" )(x) + x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x) + x = Dense (128, name='conv4')(x) + x = PReLU (name="prelu4" )(x) + prob = Dense (2, name='conv51')(x) + prob = Softmax()(prob) + x = Dense (4, name='conv52')(x) + RNet_model = Model(RNet_Input, [x,prob] ) + RNet_model.load_weights ( (Path(mtcnn.__file__).parent / 'mtcnn_rnet.h5').__str__() ) + + ONet_Input = Input ( (48, 48, 3) ) + x = ONet_Input + x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="prelu1" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x) + x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="prelu2" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x) + x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], 
name="prelu3" )(x) + x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x) + x = Conv2D (128, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv4")(x) + x = PReLU (shared_axes=[1,2], name="prelu4" )(x) + x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x) + x = Dense (256, name='conv5')(x) + x = PReLU (name="prelu5" )(x) + prob = Dense (2, name='conv61')(x) + prob = Softmax()(prob) + x1 = Dense (4, name='conv62')(x) + x2 = Dense (10, name='conv63')(x) + ONet_model = Model(ONet_Input, [x1,x2,prob] ) + ONet_model.load_weights ( (Path(mtcnn.__file__).parent / 'mtcnn_onet.h5').__str__() ) + + pnet_fun = K.function ( PNet_model.inputs, PNet_model.outputs ) + rnet_fun = K.function ( RNet_model.inputs, RNet_model.outputs ) + onet_fun = K.function ( ONet_model.inputs, ONet_model.outputs ) + + pnet_test_data = np.random.uniform ( size=(1, 64,64,3) ) + pnet_result1, pnet_result2 = pnet_fun ([pnet_test_data]) + + rnet_test_data = np.random.uniform ( size=(1,24,24,3) ) + rnet_result1, rnet_result2 = rnet_fun ([rnet_test_data]) + + onet_test_data = np.random.uniform ( size=(1,48,48,3) ) + onet_result1, onet_result2, onet_result3 = onet_fun ([onet_test_data]) + + import code + code.interact(local=dict(globals(), **locals())) + + from nnlib import nnlib + #exec( nnlib.import_all( nnlib.device.Config(cpu_only=True) ), locals(), globals() )# nnlib.device.Config(cpu_only=True) + exec( nnlib.import_all(), locals(), globals() )# nnlib.device.Config(cpu_only=True) + + #det1_Input = Input ( (None, None,3) ) + #x = det1_Input + #x = Conv2D (10, kernel_size=(3,3), strides=(1,1), padding='valid')(x) + # + #import code + #code.interact(local=dict(globals(), **locals())) + + tf = nnlib.tf + tf_session = nnlib.tf_sess + + with tf.variable_scope('pnet2'): + data = tf.placeholder(tf.float32, (None,None,None,3), 'input') + pnet2 = mtcnn.PNet(tf, {'data':data}) + pnet2.load( (Path(mtcnn.__file__).parent / 'det1.npy').__str__(), tf_session) + with tf.variable_scope('rnet2'): + data = tf.placeholder(tf.float32, (None,24,24,3), 'input') + rnet2 = mtcnn.RNet(tf, {'data':data}) + rnet2.load( (Path(mtcnn.__file__).parent / 'det2.npy').__str__(), tf_session) + with tf.variable_scope('onet2'): + data = tf.placeholder(tf.float32, (None,48,48,3), 'input') + onet2 = mtcnn.ONet(tf, {'data':data}) + onet2.load( (Path(mtcnn.__file__).parent / 'det3.npy').__str__(), tf_session) + + + + pnet_fun = K.function([pnet2.layers['data']],[pnet2.layers['conv4-2'], pnet2.layers['prob1']]) + rnet_fun = K.function([rnet2.layers['data']],[rnet2.layers['conv5-2'], rnet2.layers['prob1']]) + onet_fun = K.function([onet2.layers['data']],[onet2.layers['conv6-2'], onet2.layers['conv6-3'], onet2.layers['prob1']]) + + det1_dict = np.load((Path(mtcnn.__file__).parent / 'det1.npy').__str__(), encoding='latin1').item() + det2_dict = np.load((Path(mtcnn.__file__).parent / 'det2.npy').__str__(), encoding='latin1').item() + det3_dict = np.load((Path(mtcnn.__file__).parent / 'det3.npy').__str__(), encoding='latin1').item() + + PNet_Input = Input ( (None, None,3) ) + x = PNet_Input + x = Conv2D (10, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="PReLU1" )(x) + x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x) + x = Conv2D (16, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="PReLU2" )(x) + x = Conv2D (32, kernel_size=(3,3), 
strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="PReLU3" )(x) + prob = Conv2D (2, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv41")(x) + prob = Softmax()(prob) + x = Conv2D (4, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv42")(x) + + + PNet_model = Model(PNet_Input, [x,prob] ) + + #PNet_model.load_weights ( (Path(mtcnn.__file__).parent / 'mtcnn_pnet.h5').__str__() ) + PNet_model.get_layer("conv1").set_weights ( [ det1_dict['conv1']['weights'], det1_dict['conv1']['biases'] ] ) + PNet_model.get_layer("PReLU1").set_weights ( [ np.reshape(det1_dict['PReLU1']['alpha'], (1,1,-1)) ] ) + PNet_model.get_layer("conv2").set_weights ( [ det1_dict['conv2']['weights'], det1_dict['conv2']['biases'] ] ) + PNet_model.get_layer("PReLU2").set_weights ( [ np.reshape(det1_dict['PReLU2']['alpha'], (1,1,-1)) ] ) + PNet_model.get_layer("conv3").set_weights ( [ det1_dict['conv3']['weights'], det1_dict['conv3']['biases'] ] ) + PNet_model.get_layer("PReLU3").set_weights ( [ np.reshape(det1_dict['PReLU3']['alpha'], (1,1,-1)) ] ) + PNet_model.get_layer("conv41").set_weights ( [ det1_dict['conv4-1']['weights'], det1_dict['conv4-1']['biases'] ] ) + PNet_model.get_layer("conv42").set_weights ( [ det1_dict['conv4-2']['weights'], det1_dict['conv4-2']['biases'] ] ) + PNet_model.save ( (Path(mtcnn.__file__).parent / 'mtcnn_pnet.h5').__str__() ) + + pnet_test_data = np.random.uniform ( size=(1, 64,64,3) ) + pnet_result1, pnet_result2 = pnet_fun ([pnet_test_data]) + pnet2_result1, pnet2_result2 = K.function ( PNet_model.inputs, PNet_model.outputs ) ([pnet_test_data]) + + pnet_diff1 = np.mean ( np.abs(pnet_result1 - pnet2_result1) ) + pnet_diff2 = np.mean ( np.abs(pnet_result2 - pnet2_result2) ) + print ("pnet_diff1 = %f, pnet_diff2 = %f, " % (pnet_diff1, pnet_diff2) ) + + RNet_Input = Input ( (24, 24, 3) ) + x = RNet_Input + x = Conv2D (28, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="prelu1" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x) + x = Conv2D (48, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="prelu2" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x) + x = Conv2D (64, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="prelu3" )(x) + x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x) + x = Dense (128, name='conv4')(x) + x = PReLU (name="prelu4" )(x) + prob = Dense (2, name='conv51')(x) + prob = Softmax()(prob) + x = Dense (4, name='conv52')(x) + + RNet_model = Model(RNet_Input, [x,prob] ) + + #RNet_model.load_weights ( (Path(mtcnn.__file__).parent / 'mtcnn_rnet.h5').__str__() ) + RNet_model.get_layer("conv1").set_weights ( [ det2_dict['conv1']['weights'], det2_dict['conv1']['biases'] ] ) + RNet_model.get_layer("prelu1").set_weights ( [ np.reshape(det2_dict['prelu1']['alpha'], (1,1,-1)) ] ) + RNet_model.get_layer("conv2").set_weights ( [ det2_dict['conv2']['weights'], det2_dict['conv2']['biases'] ] ) + RNet_model.get_layer("prelu2").set_weights ( [ np.reshape(det2_dict['prelu2']['alpha'], (1,1,-1)) ] ) + RNet_model.get_layer("conv3").set_weights ( [ det2_dict['conv3']['weights'], det2_dict['conv3']['biases'] ] ) + RNet_model.get_layer("prelu3").set_weights ( [ np.reshape(det2_dict['prelu3']['alpha'], (1,1,-1)) ] ) + 
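# Editor's sketch: Keras PReLU(shared_axes=[1,2]) stores alpha as (1,1,C), hence the np.reshape of the flat caffe-style alphas above, while the dense-layer PReLUs (prelu4/prelu5) take the flat (C,) alpha as-is. A helper one could factor out for the conv case (helper name is ours, not in the original):
+    def set_conv_prelu(model, layer_name, alpha_flat):
+        # det*.npy stores a flat (C,) alpha; PReLU(shared_axes=[1,2]) expects (1,1,C)
+        model.get_layer(layer_name).set_weights([np.reshape(alpha_flat, (1, 1, -1))])
+    #set_conv_prelu(RNet_model, "prelu3", det2_dict['prelu3']['alpha']) + 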
RNet_model.get_layer("conv4").set_weights ( [ det2_dict['conv4']['weights'], det2_dict['conv4']['biases'] ] ) + RNet_model.get_layer("prelu4").set_weights ( [ det2_dict['prelu4']['alpha'] ] ) + RNet_model.get_layer("conv51").set_weights ( [ det2_dict['conv5-1']['weights'], det2_dict['conv5-1']['biases'] ] ) + RNet_model.get_layer("conv52").set_weights ( [ det2_dict['conv5-2']['weights'], det2_dict['conv5-2']['biases'] ] ) + RNet_model.save ( (Path(mtcnn.__file__).parent / 'mtcnn_rnet.h5').__str__() ) + + #import code + #code.interact(local=dict(globals(), **locals())) + + rnet_test_data = np.random.uniform ( size=(1,24,24,3) ) + rnet_result1, rnet_result2 = rnet_fun ([rnet_test_data]) + rnet2_result1, rnet2_result2 = K.function ( RNet_model.inputs, RNet_model.outputs ) ([rnet_test_data]) + + rnet_diff1 = np.mean ( np.abs(rnet_result1 - rnet2_result1) ) + rnet_diff2 = np.mean ( np.abs(rnet_result2 - rnet2_result2) ) + print ("rnet_diff1 = %f, rnet_diff2 = %f, " % (rnet_diff1, rnet_diff2) ) + + + ################# + ''' + (self.feed('data') #pylint: disable=no-value-for-parameter, no-member + .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1') + .prelu(name='prelu1') + .max_pool(3, 3, 2, 2, name='pool1') + .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2') + .prelu(name='prelu2') + .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') + .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3') + .prelu(name='prelu3') + .max_pool(2, 2, 2, 2, name='pool3') + .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4') + .prelu(name='prelu4') + .fc(256, relu=False, name='conv5') + .prelu(name='prelu5') + .fc(2, relu=False, name='conv6-1') + .softmax(1, name='prob1')) + + (self.feed('prelu5') #pylint: disable=no-value-for-parameter + .fc(4, relu=False, name='conv6-2')) + + (self.feed('prelu5') #pylint: disable=no-value-for-parameter + .fc(10, relu=False, name='conv6-3')) + ''' + ONet_Input = Input ( (48, 48, 3) ) + x = ONet_Input + x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="prelu1" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x) + x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="prelu2" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x) + x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="prelu3" )(x) + x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x) + x = Conv2D (128, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv4")(x) + x = PReLU (shared_axes=[1,2], name="prelu4" )(x) + x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x) + x = Dense (256, name='conv5')(x) + x = PReLU (name="prelu5" )(x) + prob = Dense (2, name='conv61')(x) + prob = Softmax()(prob) + x1 = Dense (4, name='conv62')(x) + x2 = Dense (10, name='conv63')(x) + + ONet_model = Model(ONet_Input, [x1,x2,prob] ) + + #ONet_model.load_weights ( (Path(mtcnn.__file__).parent / 'mtcnn_onet.h5').__str__() ) + ONet_model.get_layer("conv1").set_weights ( [ det3_dict['conv1']['weights'], det3_dict['conv1']['biases'] ] ) + ONet_model.get_layer("prelu1").set_weights ( [ np.reshape(det3_dict['prelu1']['alpha'], (1,1,-1)) ] ) + ONet_model.get_layer("conv2").set_weights ( [ det3_dict['conv2']['weights'], 
det3_dict['conv2']['biases'] ] ) + ONet_model.get_layer("prelu2").set_weights ( [ np.reshape(det3_dict['prelu2']['alpha'], (1,1,-1)) ] ) + ONet_model.get_layer("conv3").set_weights ( [ det3_dict['conv3']['weights'], det3_dict['conv3']['biases'] ] ) + ONet_model.get_layer("prelu3").set_weights ( [ np.reshape(det3_dict['prelu3']['alpha'], (1,1,-1)) ] ) + ONet_model.get_layer("conv4").set_weights ( [ det3_dict['conv4']['weights'], det3_dict['conv4']['biases'] ] ) + ONet_model.get_layer("prelu4").set_weights ( [ np.reshape(det3_dict['prelu4']['alpha'], (1,1,-1)) ] ) + ONet_model.get_layer("conv5").set_weights ( [ det3_dict['conv5']['weights'], det3_dict['conv5']['biases'] ] ) + ONet_model.get_layer("prelu5").set_weights ( [ det3_dict['prelu5']['alpha'] ] ) + ONet_model.get_layer("conv61").set_weights ( [ det3_dict['conv6-1']['weights'], det3_dict['conv6-1']['biases'] ] ) + ONet_model.get_layer("conv62").set_weights ( [ det3_dict['conv6-2']['weights'], det3_dict['conv6-2']['biases'] ] ) + ONet_model.get_layer("conv63").set_weights ( [ det3_dict['conv6-3']['weights'], det3_dict['conv6-3']['biases'] ] ) + ONet_model.save ( (Path(mtcnn.__file__).parent / 'mtcnn_onet.h5').__str__() ) + + onet_test_data = np.random.uniform ( size=(1,48,48,3) ) + onet_result1, onet_result2, onet_result3 = onet_fun ([onet_test_data]) + onet2_result1, onet2_result2, onet2_result3 = K.function ( ONet_model.inputs, ONet_model.outputs ) ([onet_test_data]) + + onet_diff1 = np.mean ( np.abs(onet_result1 - onet2_result1) ) + onet_diff2 = np.mean ( np.abs(onet_result2 - onet2_result2) ) + onet_diff3 = np.mean ( np.abs(onet_result3 - onet2_result3) ) + print ("onet_diff1 = %f, onet_diff2 = %f, , onet_diff3 = %f " % (onet_diff1, onet_diff2, onet_diff3) ) + + + import code + code.interact(local=dict(globals(), **locals())) + + + + + + import code + code.interact(local=dict(globals(), **locals())) + + + + + + + #class MTCNNSoftmax(keras.Layer): + # + # def __init__(self, axis=-1, **kwargs): + # super(MTCNNSoftmax, self).__init__(**kwargs) + # self.supports_masking = True + # self.axis = axis + # + # def call(self, inputs): + # + # def softmax(self, target, axis, name=None): + # max_axis = self.tf.reduce_max(target, axis, keepdims=True) + # target_exp = self.tf.exp(target-max_axis) + # normalize = self.tf.reduce_sum(target_exp, axis, keepdims=True) + # softmax = self.tf.div(target_exp, normalize, name) + # return softmax + # #return activations.softmax(inputs, axis=self.axis) + # + # def get_config(self): + # config = {'axis': self.axis} + # base_config = super(MTCNNSoftmax, self).get_config() + # return dict(list(base_config.items()) + list(config.items())) + # + # def compute_output_shape(self, input_shape): + # return input_shape + + from nnlib import nnlib + exec( nnlib.import_all(), locals(), globals() ) + + + + + image = cv2.imread('D:\\DeepFaceLab\\test\\00000.png').astype(np.float32) / 255.0 + image = cv2.cvtColor (image, cv2.COLOR_BGR2GRAY) + image = np.expand_dims (image, -1) + image_shape = image.shape + + image2 = cv2.imread('D:\\DeepFaceLab\\test\\00001.png').astype(np.float32) / 255.0 + image2 = cv2.cvtColor (image2, cv2.COLOR_BGR2GRAY) + image2 = np.expand_dims (image2, -1) + image2_shape = image2.shape + + #cv2.imshow('', image) + + + image_tensor = K.placeholder(shape=[ 1, image_shape[0], image_shape[1], image_shape[2] ], dtype="float32" ) + image2_tensor = K.placeholder(shape=[ 1, image_shape[0], image_shape[1], image_shape[2] ], dtype="float32" ) + + blurred_image_tensor = gaussian_blur(16.0)(image_tensor) + x, 
= nnlib.tf_sess.run ( blurred_image_tensor, feed_dict={image_tensor: np.expand_dims(image,0)} ) + cv2.imshow('', (x*255).astype(np.uint8) ) + cv2.waitKey(0) + + import code + code.interact(local=dict(globals(), **locals())) + + + #os.environ['plaidML'] = '1' + from nnlib import nnlib + + dvc = nnlib.device.Config(force_gpu_idx=1) + exec( nnlib.import_all(dvc), locals(), globals() ) + + tf = nnlib.tf + + image = cv2.imread('D:\\DeepFaceLab\\test\\00000.png').astype(np.float32) / 255.0 + image = cv2.cvtColor (image, cv2.COLOR_BGR2GRAY) + image = np.expand_dims (image, -1) + image_shape = image.shape + + image2 = cv2.imread('D:\\DeepFaceLab\\test\\00001.png').astype(np.float32) / 255.0 + image2 = cv2.cvtColor (image2, cv2.COLOR_BGR2GRAY) + image2 = np.expand_dims (image2, -1) + image2_shape = image2.shape + + image1_tensor = K.placeholder(shape=[ 1, image_shape[0], image_shape[1], image_shape[2] ], dtype="float32" ) + image2_tensor = K.placeholder(shape=[ 1, image_shape[0], image_shape[1], image_shape[2] ], dtype="float32" ) + + + + #import code + #code.interact(local=dict(globals(), **locals())) + def manual_conv(input, filter, strides, padding): + h_f, w_f, c_in, c_out = filter.get_shape().as_list() + input_patches = tf.extract_image_patches(input, ksizes=[1, h_f, w_f, 1 ], strides=strides, rates=[1, 1, 1, 1], padding=padding) + return input_patches + filters_flat = tf.reshape(filter, shape=[h_f*w_f*c_in, c_out]) + return tf.einsum("ijkl,lm->ijkm", input_patches, filters_flat) + + def extract_image_patches(x, ksizes, ssizes, padding='SAME', + data_format='channels_last'): + """Extract the patches from an image. + # Arguments + x: The input image + ksizes: 2-d tuple with the kernel size + ssizes: 2-d tuple with the strides size + padding: 'same' or 'valid' + data_format: 'channels_last' or 'channels_first' + # Returns + The (k_w,k_h) patches extracted + TF ==> (batch_size,w,h,k_w,k_h,c) + TH ==> (batch_size,w,h,c,k_w,k_h) + """ + kernel = [1, ksizes[0], ksizes[1], 1] + strides = [1, ssizes[0], ssizes[1], 1] + if data_format == 'channels_first': + x = K.permute_dimensions(x, (0, 2, 3, 1)) + bs_i, w_i, h_i, ch_i = K.int_shape(x) + patches = tf.extract_image_patches(x, kernel, strides, [1, 1, 1, 1], + padding) + # Reshaping to fit Theano + bs, w, h, ch = K.int_shape(patches) + reshaped = tf.reshape(patches, [-1, w, h, tf.floordiv(ch, ch_i), ch_i]) + final_shape = [-1, w, h, ch_i, ksizes[0], ksizes[1]] + patches = tf.reshape(tf.transpose(reshaped, [0, 1, 2, 4, 3]), final_shape) + if data_format == 'channels_last': + patches = K.permute_dimensions(patches, [0, 1, 2, 4, 5, 3]) + return patches + + m = 32 + c_in = 3 + c_out = 16 + + filter_sizes = [5, 11] + strides = [1] + #paddings = ["VALID", "SAME"] + + for fs in filter_sizes: + h = w = 128 + h_f = w_f = fs + str = 2 + #print "Testing for", imsize, fs, stri, pad + + #tf.reset_default_graph() + X = tf.constant(1.0+np.random.rand(m, h, w, c_in), tf.float32) + W = tf.constant(np.ones([h_f, w_f, c_in, h_f*w_f*c_in]), tf.float32) + + + Z = tf.nn.conv2d(X, W, strides=[1, str, str, 1], padding="VALID") + Z_manual = manual_conv(X, W, strides=[1, str, str, 1], padding="VALID") + Z_2 = extract_image_patches (X, (fs,fs), (str,str), padding="VALID") + import code + code.interact(local=dict(globals(), **locals())) + # + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + Z_, Z_manual_ = sess.run([Z, Z_manual]) + #self.assertEqual(Z_.shape, Z_manual_.shape) + #self.assertTrue(np.allclose(Z_, Z_manual_, rtol=1e-05)) + sess.close() + + + import 
code + code.interact(local=dict(globals(), **locals())) + + + + + + #k_loss_t = keras_style_loss()(image1_tensor, image2_tensor) + #k_loss_run = K.function( [image1_tensor, image2_tensor],[k_loss_t]) + #import code + #code.interact(local=dict(globals(), **locals())) + #image = np.expand_dims(image,0) + #image2 = np.expand_dims(image2,0) + #k_loss = k_loss_run([image, image2]) + #t_loss = t_loss_run([image, image2]) + + + + + #x, = tf_sess_run ([np.expand_dims(image,0)]) + #x = x[0] + ##import code + ##code.interact(local=dict(globals(), **locals())) + + + + image = cv2.imread('D:\\DeepFaceLab\\test\\00000.png').astype(np.float32) / 255.0 + image = cv2.cvtColor (image, cv2.COLOR_BGR2GRAY) + image = np.expand_dims (image, -1) + image_shape = image.shape + + image2 = cv2.imread('D:\\DeepFaceLab\\test\\00001.png').astype(np.float32) / 255.0 + image2 = cv2.cvtColor (image2, cv2.COLOR_BGR2GRAY) + image2 = np.expand_dims (image2, -1) + image2_shape = image2.shape + + image_tensor = tf.placeholder(tf.float32, shape=[1, image_shape[0], image_shape[1], image_shape[2] ]) + image2_tensor = tf.placeholder(tf.float32, shape=[1, image2_shape[0], image2_shape[1], image2_shape[2] ]) + + blurred_image_tensor = sl(image_tensor, image2_tensor) + x = tf_sess.run ( blurred_image_tensor, feed_dict={image_tensor: np.expand_dims(image,0), image2_tensor: np.expand_dims(image2,0) } ) + + cv2.imshow('', x[0]) + cv2.waitKey(0) + import code + code.interact(local=dict(globals(), **locals())) + + while True: + image = cv2.imread('D:\\DeepFaceLab\\workspace\\data_src\\aligned\\00000.png').astype(np.float32) / 255.0 + image = cv2.resize(image, (256,256)) + image = random_transform( image ) + warped_img, target_img = random_warp( image ) + + #cv2.imshow('', image) + #cv2.waitKey(0) + + cv2.imshow('', warped_img) + cv2.waitKey(0) + cv2.imshow('', target_img) + cv2.waitKey(0) + + import code + code.interact(local=dict(globals(), **locals())) + + import code + code.interact(local=dict(globals(), **locals())) + + return + + + def keras_gaussian_blur(radius=2.0): + def gaussian(x, mu, sigma): + return np.exp(-(float(x) - float(mu)) ** 2 / (2 * sigma ** 2)) + + def make_kernel(sigma): + kernel_size = max(3, int(2 * 2 * sigma + 1)) + mean = np.floor(0.5 * kernel_size) + kernel_1d = np.array([gaussian(x, mean, sigma) for x in range(kernel_size)]) + np_kernel = np.outer(kernel_1d, kernel_1d).astype(dtype=K.floatx()) + kernel = np_kernel / np.sum(np_kernel) + return kernel + + gauss_kernel = make_kernel(radius) + gauss_kernel = gauss_kernel[:, :,np.newaxis, np.newaxis] + + #import code + #code.interact(local=dict(globals(), **locals())) + def func(input): + inputs = [ input[:,:,:,i:i+1] for i in range( K.int_shape( input )[-1] ) ] + + outputs = [] + for i in range(len(inputs)): + outputs += [ K.conv2d( inputs[i] , K.constant(gauss_kernel) , strides=(1,1), padding="same") ] + + return K.concatenate (outputs, axis=-1) + return func + + def keras_style_loss(gaussian_blur_radius=0.0, loss_weight=1.0, epsilon=1e-5): + if gaussian_blur_radius > 0.0: + gblur = keras_gaussian_blur(gaussian_blur_radius) + + def sd(content, style): + content_nc = K.int_shape(content)[-1] + style_nc = K.int_shape(style)[-1] + if content_nc != style_nc: + raise Exception("keras_style_loss() content_nc != style_nc") + + axes = [1,2] + c_mean, c_var = K.mean(content, axis=axes, keepdims=True), K.var(content, axis=axes, keepdims=True) + s_mean, s_var = K.mean(style, axis=axes, keepdims=True), K.var(style, axis=axes, keepdims=True) + c_std, s_std = K.sqrt(c_var + 
epsilon), K.sqrt(s_var + epsilon) + + mean_loss = K.sum(K.square(c_mean-s_mean)) + std_loss = K.sum(K.square(c_std-s_std)) + + return (mean_loss + std_loss) * loss_weight + + def func(target, style): + if gaussian_blur_radius > 0.0: + return sd( gblur(target), gblur(style)) + else: + return sd( target, style ) + return func + + data = tf.placeholder(tf.float32, (None,None,None,3), 'input') + pnet2 = mtcnn.PNet(tf, {'data':data}) + filename = str(Path(mtcnn.__file__).parent/'det1.npy') + pnet2.load(filename, tf_sess) + + pnet_fun = K.function([pnet2.layers['data']],[pnet2.layers['conv4-2'], pnet2.layers['prob1']]) + + import code + code.interact(local=dict(globals(), **locals())) + + return + + + while True: + img_bgr = np.random.rand ( 268, 640, 3 ) + img_size = img_bgr.shape[1], img_bgr.shape[0] + + mat = np.array( [[ 1.99319629e+00, -1.81504324e-01, -3.62479778e+02], + [ 1.81504324e-01, 1.99319629e+00, -8.05396709e+01]] ) + + tmp_0 = np.random.rand ( 128,128 ) - 0.1 + tmp = np.expand_dims (tmp_0, axis=-1) + + mask = np.ones ( tmp.shape, dtype=np.float32) + mask_border_size = int ( mask.shape[1] * 0.0625 ) + mask[:,0:mask_border_size,:] = 0 + mask[:,-mask_border_size:,:] = 0 + + x = cv2.warpAffine( mask, mat, img_size, np.zeros(img_bgr.shape, dtype=np.float32), cv2.WARP_INVERSE_MAP | cv2.INTER_LANCZOS4, cv2.BORDER_TRANSPARENT ) + + if len ( np.argwhere( np.isnan(x) ) ) == 0: + print ("fine") + else: + print ("wtf") + + import code + code.interact(local=dict(globals(), **locals())) + + return + + aligned_path_image_paths = Path_utils.get_image_paths("E:\\FakeFaceVideoSources\\Datasets\\CelebA aligned") + + a = [] + r_vec = np.array([[0.01891013], [0.08560084], [-3.14392813]]) + t_vec = np.array([[-14.97821226], [-10.62040383], [-2053.03596872]]) + + yaws = [] + pitchs = [] + for filepath in tqdm(aligned_path_image_paths, desc="test", ascii=True ): + filepath = Path(filepath) + + if filepath.suffix == '.png': + dflimg = DFLPNG.load( str(filepath), print_on_no_embedded_data=True ) + elif filepath.suffix == '.jpg': + dflimg = DFLJPG.load ( str(filepath), print_on_no_embedded_data=True ) + else: + print ("%s is not a dfl image file" % (filepath.name) ) + + #source_filename_stem = Path( dflimg.get_source_filename() ).stem + #if source_filename_stem not in alignments.keys(): + # alignments[ source_filename_stem ] = [] + + + #focal_length = dflimg.shape[1] + #camera_center = (dflimg.shape[1] / 2, dflimg.shape[0] / 2) + #camera_matrix = np.array( + # [[focal_length, 0, camera_center[0]], + # [0, focal_length, camera_center[1]], + # [0, 0, 1]], dtype=np.float32) + # + landmarks = dflimg.get_landmarks() + # + #lm = landmarks.astype(np.float32) + + img = cv2_imread (str(filepath)) / 255.0 + + LandmarksProcessor.draw_landmarks(img, landmarks, (1,1,1) ) + + + #(_, rotation_vector, translation_vector) = cv2.solvePnP( + # LandmarksProcessor.landmarks_68_3D, + # lm, + # camera_matrix, + # np.zeros((4, 1)) ) + # + #rme = mathlib.rotationMatrixToEulerAngles( cv2.Rodrigues(rotation_vector)[0] ) + #import code + #code.interact(local=dict(globals(), **locals())) + + #rotation_vector = rotation_vector / np.linalg.norm(rotation_vector) + + + #img2 = image_utils.get_text_image ( (256,10, 3), str(rotation_vector) ) + pitch, yaw = LandmarksProcessor.estimate_pitch_yaw (landmarks) + yaws += [yaw] + #print(pitch, yaw) + #cv2.imshow ("", (img * 255).astype(np.uint8) ) + #cv2.waitKey(0) + #a += [ rotation_vector] + yaws = np.array(yaws) + import code + code.interact(local=dict(globals(), **locals())) + + + + + + + 
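# Editor's sketch: a plain-NumPy restatement of the sd() statistic inside keras_style_loss above, handy for cross-checking the Keras graph on small arrays (function name is ours, not in the original):
+    def style_stat_loss_np(content, style, loss_weight=1.0, epsilon=1e-5):
+        # content/style: (batch, h, w, c) arrays; mirror K.mean/K.var over spatial axes [1,2]
+        axes = (1, 2)
+        c_mean, c_var = content.mean(axis=axes, keepdims=True), content.var(axis=axes, keepdims=True)
+        s_mean, s_var = style.mean(axis=axes, keepdims=True), style.var(axis=axes, keepdims=True)
+        c_std, s_std = np.sqrt(c_var + epsilon), np.sqrt(s_var + epsilon)
+        return (np.sum(np.square(c_mean - s_mean)) + np.sum(np.square(c_std - s_std))) * loss_weight + 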
#alignments[ source_filename_stem ].append (dflimg.get_source_landmarks()) + #alignments.append (dflimg.get_source_landmarks()) + + + + + + + + o = np.ones ( (128,128,3), dtype=np.float32 ) + cv2.imwrite ("D:\\temp\\z.jpg", o) + + #DFLJPG.embed_data ("D:\\temp\\z.jpg", ) + + dfljpg = DFLJPG.load("D:\\temp\\z.jpg") + + import code + code.interact(local=dict(globals(), **locals())) + + return + + + + import sys, numpy; print(numpy.__version__, sys.version) + sq = multiprocessing.Queue() + cq = multiprocessing.Queue() + + p = multiprocessing.Process(target=subprocess, args=(sq,cq,)) + p.start() + + while True: + cq.get() #waiting numpy array + sq.put (1) #send message we are ready to get more + + #import code + #code.interact(local=dict(globals(), **locals())) + + os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '2' + + from nnlib import nnlib + exec( nnlib.import_all(), locals(), globals() ) + + + + + #import tensorflow as tf + #tf_module = tf + # + #config = tf_module.ConfigProto() + #config.gpu_options.force_gpu_compatible = True + #tf_session = tf_module.Session(config=config) + # + #srgb_tensor = tf.placeholder("float", [None, None, 3]) + # + #filename = Path(__file__).parent / '00050.png' + #img = cv2.imread(str(filename)).astype(np.float32) / 255.0 + # + #lab_tensor = rgb_to_lab (tf_module, srgb_tensor) + # + #rgb_tensor = lab_to_rgb (tf_module, lab_tensor) + # + #rgb = tf_session.run(rgb_tensor, feed_dict={srgb_tensor: img}) + #cv2.imshow("", rgb) + #cv2.waitKey(0) + + #from skimage import io, color + #def_lab = color.rgb2lab(img) + # + #t = time.time() + #def_lab = color.rgb2lab(img) + #print ( time.time() - t ) + # + #lab = tf_session.run(lab_tensor, feed_dict={srgb_tensor: img}) + # + #t = time.time() + #lab = tf_session.run(lab_tensor, feed_dict={srgb_tensor: img}) + #print ( time.time() - t ) + + + + + + + #lab_clr = color.rgb2lab(img_bgr) + #lab_bw = color.rgb2lab(out_img) + #tmp_channel, a_channel, b_channel = cv2.split(lab_clr) + #l_channel, tmp2_channel, tmp3_channel = cv2.split(lab_bw) + #img_LAB = cv2.merge((l_channel,a_channel, b_channel)) + #out_img = color.lab2rgb(lab.astype(np.float64)) + # + #cv2.imshow("", out_img) + #cv2.waitKey(0) + + #import code + #code.interact(local=dict(globals(), **locals())) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/doc/doc_build_and_repository_info.md b/doc/doc_build_and_repository_info.md index 6fe4737..6fb36bb 100644 --- a/doc/doc_build_and_repository_info.md +++ b/doc/doc_build_and_repository_info.md @@ -2,6 +2,7 @@ DeepFaceLab officially supports Windows-only. If you want to support Mac/Linux/Docker - create a fork, it will be referenced here. 
+[Linux fork](https://github.com/lbfs/DeepFaceLab_Linux) by @lbfs #### **Installing dlib on Windows** diff --git a/facelib/MTCExtractor.py b/facelib/MTCExtractor.py index 41c779d..a43829b 100644 --- a/facelib/MTCExtractor.py +++ b/facelib/MTCExtractor.py @@ -3,15 +3,11 @@ import os import cv2 from pathlib import Path - -from .mtcnn import * +from nnlib import nnlib class MTCExtractor(object): - def __init__(self, keras, tf, tf_session): + def __init__(self): self.scale_to = 1920 - self.keras = keras - self.tf = tf - self.tf_session = tf_session self.min_face_size = self.scale_to * 0.042 self.thresh1 = 0.7 @@ -19,25 +15,72 @@ class MTCExtractor(object): self.thresh3 = 0.6 self.scale_factor = 0.95 + exec( nnlib.import_all(), locals(), globals() ) + PNet_Input = Input ( (None, None,3) ) + x = PNet_Input + x = Conv2D (10, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="PReLU1" )(x) + x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x) + x = Conv2D (16, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="PReLU2" )(x) + x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="PReLU3" )(x) + prob = Conv2D (2, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv41")(x) + prob = Softmax()(prob) + x = Conv2D (4, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv42")(x) + + PNet_model = Model(PNet_Input, [x,prob] ) + PNet_model.load_weights ( (Path(__file__).parent / 'mtcnn_pnet.h5').__str__() ) + + RNet_Input = Input ( (24, 24, 3) ) + x = RNet_Input + x = Conv2D (28, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="prelu1" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x) + x = Conv2D (48, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="prelu2" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x) + x = Conv2D (64, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="prelu3" )(x) + x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x) + x = Dense (128, name='conv4')(x) + x = PReLU (name="prelu4" )(x) + prob = Dense (2, name='conv51')(x) + prob = Softmax()(prob) + x = Dense (4, name='conv52')(x) + RNet_model = Model(RNet_Input, [x,prob] ) + RNet_model.load_weights ( (Path(__file__).parent / 'mtcnn_rnet.h5').__str__() ) + + ONet_Input = Input ( (48, 48, 3) ) + x = ONet_Input + x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x) + x = PReLU (shared_axes=[1,2], name="prelu1" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x) + x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x) + x = PReLU (shared_axes=[1,2], name="prelu2" )(x) + x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x) + x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x) + x = PReLU (shared_axes=[1,2], name="prelu3" )(x) + x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x) + x = Conv2D (128, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv4")(x) + x = PReLU (shared_axes=[1,2], name="prelu4" )(x) + x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) 
), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x)
+        x = Dense (256, name='conv5')(x)
+        x = PReLU (name="prelu5" )(x)
+        prob = Dense (2, name='conv61')(x)
+        prob = Softmax()(prob)
+        x1 = Dense (4, name='conv62')(x)
+        x2 = Dense (10, name='conv63')(x)
+        ONet_model = Model(ONet_Input, [x1,x2,prob] )
+        ONet_model.load_weights ( (Path(__file__).parent / 'mtcnn_onet.h5').__str__() )
+
+        self.pnet_fun = K.function ( PNet_model.inputs, PNet_model.outputs )
+        self.rnet_fun = K.function ( RNet_model.inputs, RNet_model.outputs )
+        self.onet_fun = K.function ( ONet_model.inputs, ONet_model.outputs )
+
     def __enter__(self):
-        with self.tf.variable_scope('pnet2'):
-            data = self.tf.placeholder(self.tf.float32, (None,None,None,3), 'input')
-            pnet2 = PNet(self.tf, {'data':data})
-            pnet2.load(str(Path(__file__).parent/'det1.npy'), self.tf_session)
-        with self.tf.variable_scope('rnet2'):
-            data = self.tf.placeholder(self.tf.float32, (None,24,24,3), 'input')
-            rnet2 = RNet(self.tf, {'data':data})
-            rnet2.load(str(Path(__file__).parent/'det2.npy'), self.tf_session)
-        with self.tf.variable_scope('onet2'):
-            data = self.tf.placeholder(self.tf.float32, (None,48,48,3), 'input')
-            onet2 = ONet(self.tf, {'data':data})
-            onet2.load(str(Path(__file__).parent/'det3.npy'), self.tf_session)
-
-        self.pnet_fun = self.keras.backend.function([pnet2.layers['data']],[pnet2.layers['conv4-2'], pnet2.layers['prob1']])
-        self.rnet_fun = self.keras.backend.function([rnet2.layers['data']],[rnet2.layers['conv5-2'], rnet2.layers['prob1']])
-        self.onet_fun = self.keras.backend.function([onet2.layers['data']],[onet2.layers['conv6-2'], onet2.layers['conv6-3'], onet2.layers['prob1']])
-
         faces, pnts = detect_face ( np.zeros ( (self.scale_to, self.scale_to, 3)), self.min_face_size, self.pnet_fun, self.rnet_fun, self.onet_fun, [ self.thresh1, self.thresh2, self.thresh3 ], self.scale_factor )
+
         return self
 
     def __exit__(self, exc_type=None, exc_value=None, traceback=None):
@@ -47,7 +90,6 @@ class MTCExtractor(object):
         input_image = input_image[:,:,::-1].copy()
         (h, w, ch) = input_image.shape
-
         input_scale = self.scale_to / (w if w > h else h)
         input_image = cv2.resize (input_image, ( int(w*input_scale), int(h*input_scale) ), interpolation=cv2.INTER_LINEAR)
@@ -56,3 +98,249 @@ class MTCExtractor(object):
         return detected_faces
+
+def detect_face(img, minsize, pnet, rnet, onet, threshold, factor):
+    """Detects faces in an image, and returns bounding boxes and points for them.
+    img: input image
+    minsize: minimum face size
+    pnet, rnet, onet: the three cascade stage networks (P-Net, R-Net, O-Net) as callables
+    threshold: threshold=[th1, th2, th3], the score thresholds for the three stages
+    factor: the factor used to create a scaling pyramid of face sizes to detect in the image.
+    """
+    factor_count=0
+    total_boxes=np.empty((0,9))
+    points=np.empty(0)
+    h=img.shape[0]
+    w=img.shape[1]
+    minl=np.amin([h, w])
+    m=12.0/minsize
+    minl=minl*m
+    # create scale pyramid
+    scales=[]
+    while minl>=12:
+        scales += [m*np.power(factor, factor_count)]
+        minl = minl*factor
+        factor_count += 1
+    # first stage
+    for scale in scales:
+        hs=int(np.ceil(h*scale))
+        ws=int(np.ceil(w*scale))
+        #print ('scale %f %d %d' % (scale, ws,hs))
+        im_data = imresample(img, (hs, ws))
+        im_data = (im_data-127.5)*0.0078125
+        img_x = np.expand_dims(im_data, 0)
+        img_y = np.transpose(img_x, (0,2,1,3))
+        out = pnet([img_y])
+        out0 = np.transpose(out[0], (0,2,1,3))
+        out1 = np.transpose(out[1], (0,2,1,3))
+
+        boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0])
+
+        # inter-scale nms
+        pick = nms(boxes.copy(), 0.5, 'Union')
+        if boxes.size>0 and pick.size>0:
+            boxes = boxes[pick,:]
+            total_boxes = np.append(total_boxes, boxes, axis=0)
+
+    numbox = total_boxes.shape[0]
+    if numbox>0:
+        pick = nms(total_boxes.copy(), 0.7, 'Union')
+        total_boxes = total_boxes[pick,:]
+        regw = total_boxes[:,2]-total_boxes[:,0]
+        regh = total_boxes[:,3]-total_boxes[:,1]
+        qq1 = total_boxes[:,0]+total_boxes[:,5]*regw
+        qq2 = total_boxes[:,1]+total_boxes[:,6]*regh
+        qq3 = total_boxes[:,2]+total_boxes[:,7]*regw
+        qq4 = total_boxes[:,3]+total_boxes[:,8]*regh
+        total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]]))
+        total_boxes = rerec(total_boxes.copy())
+        total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32)
+        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
+
+    numbox = total_boxes.shape[0]
+    if numbox>0:
+        # second stage
+        tempimg = np.zeros((24,24,3,numbox))
+        for k in range(0,numbox):
+            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
+            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
+            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
+                tempimg[:,:,:,k] = imresample(tmp, (24, 24))
+            else:
+                return np.empty(0) # no valid crops: return an empty result
+        tempimg = (tempimg-127.5)*0.0078125
+        tempimg1 = np.transpose(tempimg, (3,1,0,2))
+        out = rnet([tempimg1])
+        out0 = np.transpose(out[0])
+        out1 = np.transpose(out[1])
+        score = out1[1,:]
+        ipass = np.where(score>threshold[1])
+        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
+        mv = out0[:,ipass[0]]
+        if total_boxes.shape[0]>0:
+            pick = nms(total_boxes, 0.7, 'Union')
+            total_boxes = total_boxes[pick,:]
+            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick]))
+            total_boxes = rerec(total_boxes.copy())
+
+    numbox = total_boxes.shape[0]
+    if numbox>0:
+        # third stage
+        total_boxes = np.fix(total_boxes).astype(np.int32)
+        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
+        tempimg = np.zeros((48,48,3,numbox))
+        for k in range(0,numbox):
+            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
+            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
+            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
+                tempimg[:,:,:,k] = imresample(tmp, (48, 48))
+            else:
+                return np.empty(0) # no valid crops: return an empty result
+        tempimg = (tempimg-127.5)*0.0078125
+        tempimg1 = np.transpose(tempimg, (3,1,0,2))
+        out = onet([tempimg1])
+        out0 = np.transpose(out[0])
+        out1 = np.transpose(out[1])
+        out2 = np.transpose(out[2])
+        score = out2[1,:]
+        points = out1
+        ipass = np.where(score>threshold[2])
+        points = points[:,ipass[0]]
+        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
+        mv = out0[:,ipass[0]]
+
+        w = total_boxes[:,2]-total_boxes[:,0]+1
+        h = total_boxes[:,3]-total_boxes[:,1]+1
+        points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1
+        points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1
+        if total_boxes.shape[0]>0:
+            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
+            pick = nms(total_boxes.copy(), 0.7, 'Min')
+            total_boxes = total_boxes[pick,:]
+            points = points[:,pick]
+
+    return total_boxes, points
+
+
+# function [boundingbox] = bbreg(boundingbox,reg)
+def bbreg(boundingbox,reg):
+    """Calibrate bounding boxes"""
+    if reg.shape[1]==1:
+        reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))
+
+    w = boundingbox[:,2]-boundingbox[:,0]+1
+    h = boundingbox[:,3]-boundingbox[:,1]+1
+    b1 = boundingbox[:,0]+reg[:,0]*w
+    b2 = boundingbox[:,1]+reg[:,1]*h
+    b3 = boundingbox[:,2]+reg[:,2]*w
+    b4 = boundingbox[:,3]+reg[:,3]*h
+    boundingbox[:,0:4] = np.transpose(np.vstack([b1, b2, b3, b4 ]))
+    return boundingbox
+
+def generateBoundingBox(imap, reg, scale, t):
+    """Use heatmap to generate bounding boxes"""
+    stride=2
+    cellsize=12
+
+    imap = np.transpose(imap)
+    dx1 = np.transpose(reg[:,:,0])
+    dy1 = np.transpose(reg[:,:,1])
+    dx2 = np.transpose(reg[:,:,2])
+    dy2 = np.transpose(reg[:,:,3])
+    y, x = np.where(imap >= t)
+    if y.shape[0]==1:
+        dx1 = np.flipud(dx1)
+        dy1 = np.flipud(dy1)
+        dx2 = np.flipud(dx2)
+        dy2 = np.flipud(dy2)
+    score = imap[(y,x)]
+    reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ]))
+    if reg.size==0:
+        reg = np.empty((0,3))
+    bb = np.transpose(np.vstack([y,x]))
+    q1 = np.fix((stride*bb+1)/scale)
+    q2 = np.fix((stride*bb+cellsize-1+1)/scale)
+    boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg])
+    return boundingbox, reg
+
+# function pick = nms(boxes,threshold,type)
+def nms(boxes, threshold, method):
+    if boxes.size==0:
+        return np.empty((0,3))
+    x1 = boxes[:,0]
+    y1 = boxes[:,1]
+    x2 = boxes[:,2]
+    y2 = boxes[:,3]
+    s = boxes[:,4]
+    area = (x2-x1+1) * (y2-y1+1)
+    I = np.argsort(s)
+    pick = np.zeros_like(s, dtype=np.int16)
+    counter = 0
+    while I.size>0:
+        i = I[-1]
+        pick[counter] = i
+        counter += 1
+        idx = I[0:-1]
+        xx1 = np.maximum(x1[i], x1[idx])
+        yy1 = np.maximum(y1[i], y1[idx])
+        xx2 = np.minimum(x2[i], x2[idx])
+        yy2 = np.minimum(y2[i], y2[idx])
+        w = np.maximum(0.0, xx2-xx1+1)
+        h = np.maximum(0.0, yy2-yy1+1)
+        inter = w * h
+        if method == 'Min':
+            o = inter / np.minimum(area[i], area[idx])
+        else:
+            o = inter / (area[i] + area[idx] - inter)
+        I = I[np.where(o<=threshold)]
+    pick = pick[0:counter]
+    return pick
+
+# function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
+def pad(total_boxes, w, h):
+    """Compute the padding coordinates (pad the bounding boxes to square)"""
+    tmpw = (total_boxes[:,2]-total_boxes[:,0]+1).astype(np.int32)
+    tmph = (total_boxes[:,3]-total_boxes[:,1]+1).astype(np.int32)
+    numbox = total_boxes.shape[0]
+
+    dx = np.ones((numbox), dtype=np.int32)
+    dy = np.ones((numbox), dtype=np.int32)
+    edx = tmpw.copy().astype(np.int32)
+    edy = tmph.copy().astype(np.int32)
+
+    x = total_boxes[:,0].copy().astype(np.int32)
+    y = total_boxes[:,1].copy().astype(np.int32)
+    ex = total_boxes[:,2].copy().astype(np.int32)
+    ey = total_boxes[:,3].copy().astype(np.int32)
+
+    tmp = np.where(ex>w)
+    edx.flat[tmp] = np.expand_dims(-ex[tmp]+w+tmpw[tmp],1)
+    ex[tmp] = w
+
+    tmp = np.where(ey>h)
+    edy.flat[tmp] = np.expand_dims(-ey[tmp]+h+tmph[tmp],1)
+    ey[tmp] = h
+
+    tmp = np.where(x<1)
+    dx.flat[tmp] = np.expand_dims(2-x[tmp],1)
+    x[tmp] = 1
+
+    tmp = np.where(y<1)
+    dy.flat[tmp] = np.expand_dims(2-y[tmp],1)
+    y[tmp] = 1
+
+    return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph
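# A quick, self-contained sanity check of the greedy NMS above, using
# hypothetical toy boxes (illustration only, not part of the patch).
# 'Union' divides the intersection by the usual IoU denominator, while 'Min'
# divides by the smaller box area, suppressing nested boxes more aggressively.
#
#     boxes = np.array([[ 10.,  10.,  50.,  50., 0.9],   # kept (highest score)
#                       [ 12.,  12.,  52.,  52., 0.8],   # IoU ~0.83 with box 0 -> suppressed
#                       [100., 100., 140., 140., 0.7]])  # disjoint -> kept
#     assert list(nms(boxes, 0.5, 'Union')) == [0, 2]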
+
+# function [bboxA] = rerec(bboxA)
+def rerec(bboxA):
+    """Convert bboxA to square."""
+    h = bboxA[:,3]-bboxA[:,1]
+    w = bboxA[:,2]-bboxA[:,0]
+    l = np.maximum(w, h)
+    bboxA[:,0] = bboxA[:,0]+w*0.5-l*0.5
+    bboxA[:,1] = bboxA[:,1]+h*0.5-l*0.5
+    bboxA[:,2:4] = bboxA[:,0:2] + np.transpose(np.tile(l,(2,1)))
+    return bboxA
+
+def imresample(img, sz):
+    im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_LINEAR) #@UndefinedVariable
+    return im_data
diff --git a/facelib/det1.npy b/facelib/det1.npy
deleted file mode 100644
index 7c05a2c..0000000
Binary files a/facelib/det1.npy and /dev/null differ
diff --git a/facelib/mtcnn.py b/facelib/mtcnn.py
deleted file mode 100644
index 7247954..0000000
--- a/facelib/mtcnn.py
+++ /dev/null
@@ -1,761 +0,0 @@
-# Source: https://github.com/davidsandberg/facenet/blob/master/src/align/
-
-""" Tensorflow implementation of the face detection / alignment algorithm found at
-https://github.com/kpzhang93/MTCNN_face_detection_alignment
-"""
-# MIT License
-#
-# Copyright (c) 2016 David Sandberg
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from six import string_types, iteritems
-
-import numpy as np
-#from math import floor
-import cv2
-import os
-
-def layer(op):
-    """Decorator for composable network layers."""
-
-    def layer_decorated(self, *args, **kwargs):
-        # Automatically set a name if not provided.
-        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
-        # Figure out the layer inputs.
-        if len(self.terminals) == 0:
-            raise RuntimeError('No input variables found for layer %s.' % name)
-        elif len(self.terminals) == 1:
-            layer_input = self.terminals[0]
-        else:
-            layer_input = list(self.terminals)
-        # Perform the operation and get the output.
-        layer_output = op(self, layer_input, *args, **kwargs)
-        # Add to layer LUT.
-        self.layers[name] = layer_output
-        # This output is now the input for the next layer.
-        self.feed(layer_output)
-        # Return self for chained calls.
- return self - - return layer_decorated - -class Network(object): - - def __init__(self, tf, inputs, trainable=True): - # The input nodes for this network - self.tf = tf - self.inputs = inputs - # The current list of terminal nodes - self.terminals = [] - # Mapping from layer names to layers - self.layers = dict(inputs) - # If true, the resulting variables are set as trainable - self.trainable = trainable - - self.setup() - - def setup(self): - """Construct the network. """ - raise NotImplementedError('Must be implemented by the subclass.') - - def load(self, data_path, session, ignore_missing=False): - """Load network weights. - data_path: The path to the numpy-serialized network weights - session: The current TensorFlow session - ignore_missing: If true, serialized weights for missing layers are ignored. - """ - data_dict = np.load(data_path, encoding='latin1').item() #pylint: disable=no-member - - for op_name in data_dict: - with self.tf.variable_scope(op_name, reuse=True): - for param_name, data in iteritems(data_dict[op_name]): - try: - var = self.tf.get_variable(param_name) - session.run(var.assign(data)) - except ValueError: - if not ignore_missing: - raise - - def feed(self, *args): - """Set the input(s) for the next operation by replacing the terminal nodes. - The arguments can be either layer names or the actual layers. - """ - assert len(args) != 0 - self.terminals = [] - for fed_layer in args: - if isinstance(fed_layer, string_types): - try: - fed_layer = self.layers[fed_layer] - except KeyError: - raise KeyError('Unknown layer name fed: %s' % fed_layer) - self.terminals.append(fed_layer) - return self - - def get_output(self): - """Returns the current network output.""" - return self.terminals[-1] - - def get_unique_name(self, prefix): - """Returns an index-suffixed unique name for the given prefix. - This is used for auto-generating layer names based on the type-prefix. - """ - ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 - return '%s_%d' % (prefix, ident) - - def make_var(self, name, shape): - """Creates a new TensorFlow variable.""" - return self.tf.get_variable(name, shape, trainable=self.trainable) - - def validate_padding(self, padding): - """Verifies that the padding is one of the supported ones.""" - assert padding in ('SAME', 'VALID') - - @layer - def conv(self, - inp, - k_h, - k_w, - c_o, - s_h, - s_w, - name, - relu=True, - padding='SAME', - group=1, - biased=True): - # Verify that the padding is acceptable - self.validate_padding(padding) - # Get the number of channels in the input - c_i = int(inp.get_shape()[-1]) - # Verify that the grouping parameter is valid - assert c_i % group == 0 - assert c_o % group == 0 - # Convolution for a given input and kernel - convolve = lambda i, k: self.tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) - with self.tf.variable_scope(name) as scope: - kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o]) - # This is the common-case. Convolve the input without any further complications. 
- output = convolve(inp, kernel) - # Add the biases - if biased: - biases = self.make_var('biases', [c_o]) - output = self.tf.nn.bias_add(output, biases) - if relu: - # ReLU non-linearity - output = self.tf.nn.relu(output, name=scope.name) - return output - - @layer - def prelu(self, inp, name): - with self.tf.variable_scope(name): - i = int(inp.get_shape()[-1]) - alpha = self.make_var('alpha', shape=(i,)) - output = self.tf.nn.relu(inp) + self.tf.multiply(alpha, -self.tf.nn.relu(-inp)) - return output - - @layer - def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'): - self.validate_padding(padding) - return self.tf.nn.max_pool(inp, - ksize=[1, k_h, k_w, 1], - strides=[1, s_h, s_w, 1], - padding=padding, - name=name) - - @layer - def fc(self, inp, num_out, name, relu=True): - with self.tf.variable_scope(name): - input_shape = inp.get_shape() - if input_shape.ndims == 4: - # The input is spatial. Vectorize it first. - dim = 1 - for d in input_shape[1:].as_list(): - dim *= int(d) - feed_in = self.tf.reshape(inp, [-1, dim]) - else: - feed_in, dim = (inp, input_shape[-1].value) - weights = self.make_var('weights', shape=[dim, num_out]) - biases = self.make_var('biases', [num_out]) - op = self.tf.nn.relu_layer if relu else self.tf.nn.xw_plus_b - fc = op(feed_in, weights, biases, name=name) - return fc - - - """ - Multi dimensional softmax, - refer to https://github.com/tensorflow/tensorflow/issues/210 - compute softmax along the dimension of target - the native softmax only supports batch_size x dimension - """ - @layer - def softmax(self, target, axis, name=None): - max_axis = self.tf.reduce_max(target, axis, keepdims=True) - target_exp = self.tf.exp(target-max_axis) - normalize = self.tf.reduce_sum(target_exp, axis, keepdims=True) - softmax = self.tf.div(target_exp, normalize, name) - return softmax - -class PNet(Network): - def setup(self): - (self.feed('data') #pylint: disable=no-value-for-parameter, no-member - .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1') - .prelu(name='PReLU1') - .max_pool(2, 2, 2, 2, name='pool1') - .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2') - .prelu(name='PReLU2') - .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3') - .prelu(name='PReLU3') - .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1') - .softmax(3,name='prob1')) - - (self.feed('PReLU3') #pylint: disable=no-value-for-parameter - .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2')) - -class RNet(Network): - def setup(self): - (self.feed('data') #pylint: disable=no-value-for-parameter, no-member - .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1') - .prelu(name='prelu1') - .max_pool(3, 3, 2, 2, name='pool1') - .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2') - .prelu(name='prelu2') - .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') - .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3') - .prelu(name='prelu3') - .fc(128, relu=False, name='conv4') - .prelu(name='prelu4') - .fc(2, relu=False, name='conv5-1') - .softmax(1,name='prob1')) - - (self.feed('prelu4') #pylint: disable=no-value-for-parameter - .fc(4, relu=False, name='conv5-2')) - -class ONet(Network): - def setup(self): - (self.feed('data') #pylint: disable=no-value-for-parameter, no-member - .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1') - .prelu(name='prelu1') - .max_pool(3, 3, 2, 2, name='pool1') - .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2') - .prelu(name='prelu2') - .max_pool(3, 3, 2, 2, padding='VALID', 
name='pool2') - .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3') - .prelu(name='prelu3') - .max_pool(2, 2, 2, 2, name='pool3') - .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4') - .prelu(name='prelu4') - .fc(256, relu=False, name='conv5') - .prelu(name='prelu5') - .fc(2, relu=False, name='conv6-1') - .softmax(1, name='prob1')) - - (self.feed('prelu5') #pylint: disable=no-value-for-parameter - .fc(4, relu=False, name='conv6-2')) - - (self.feed('prelu5') #pylint: disable=no-value-for-parameter - .fc(10, relu=False, name='conv6-3')) - -def detect_face(img, minsize, pnet, rnet, onet, threshold, factor): - """Detects faces in an image, and returns bounding boxes and points for them. - img: input image - minsize: minimum faces' size - pnet, rnet, onet: caffemodel - threshold: threshold=[th1, th2, th3], th1-3 are three steps's threshold - factor: the factor used to create a scaling pyramid of face sizes to detect in the image. - """ - factor_count=0 - total_boxes=np.empty((0,9)) - points=np.empty(0) - h=img.shape[0] - w=img.shape[1] - minl=np.amin([h, w]) - m=12.0/minsize - minl=minl*m - # create scale pyramid - scales=[] - while minl>=12: - scales += [m*np.power(factor, factor_count)] - minl = minl*factor - factor_count += 1 - # first stage - for scale in scales: - hs=int(np.ceil(h*scale)) - ws=int(np.ceil(w*scale)) - #print ('scale %f %d %d' % (scale, ws,hs)) - im_data = imresample(img, (hs, ws)) - im_data = (im_data-127.5)*0.0078125 - img_x = np.expand_dims(im_data, 0) - img_y = np.transpose(img_x, (0,2,1,3)) - out = pnet([img_y]) - out0 = np.transpose(out[0], (0,2,1,3)) - out1 = np.transpose(out[1], (0,2,1,3)) - - boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0]) - - # inter-scale nms - pick = nms(boxes.copy(), 0.5, 'Union') - if boxes.size>0 and pick.size>0: - boxes = boxes[pick,:] - total_boxes = np.append(total_boxes, boxes, axis=0) - - numbox = total_boxes.shape[0] - if numbox>0: - pick = nms(total_boxes.copy(), 0.7, 'Union') - total_boxes = total_boxes[pick,:] - regw = total_boxes[:,2]-total_boxes[:,0] - regh = total_boxes[:,3]-total_boxes[:,1] - qq1 = total_boxes[:,0]+total_boxes[:,5]*regw - qq2 = total_boxes[:,1]+total_boxes[:,6]*regh - qq3 = total_boxes[:,2]+total_boxes[:,7]*regw - qq4 = total_boxes[:,3]+total_boxes[:,8]*regh - total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]])) - total_boxes = rerec(total_boxes.copy()) - total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32) - dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) - - numbox = total_boxes.shape[0] - if numbox>0: - # second stage - tempimg = np.zeros((24,24,3,numbox)) - for k in range(0,numbox): - tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) - tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] - if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: - tempimg[:,:,:,k] = imresample(tmp, (24, 24)) - else: - return np.empty() - tempimg = (tempimg-127.5)*0.0078125 - tempimg1 = np.transpose(tempimg, (3,1,0,2)) - out = rnet([tempimg1]) - out0 = np.transpose(out[0]) - out1 = np.transpose(out[1]) - score = out1[1,:] - ipass = np.where(score>threshold[1]) - total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) - mv = out0[:,ipass[0]] - if total_boxes.shape[0]>0: - pick = nms(total_boxes, 0.7, 'Union') - total_boxes = total_boxes[pick,:] - total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick])) - 
total_boxes = rerec(total_boxes.copy()) - - numbox = total_boxes.shape[0] - if numbox>0: - # third stage - total_boxes = np.fix(total_boxes).astype(np.int32) - dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) - tempimg = np.zeros((48,48,3,numbox)) - for k in range(0,numbox): - tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) - tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] - if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: - tempimg[:,:,:,k] = imresample(tmp, (48, 48)) - else: - return np.empty() - tempimg = (tempimg-127.5)*0.0078125 - tempimg1 = np.transpose(tempimg, (3,1,0,2)) - out = onet([tempimg1]) - out0 = np.transpose(out[0]) - out1 = np.transpose(out[1]) - out2 = np.transpose(out[2]) - score = out2[1,:] - points = out1 - ipass = np.where(score>threshold[2]) - points = points[:,ipass[0]] - total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) - mv = out0[:,ipass[0]] - - w = total_boxes[:,2]-total_boxes[:,0]+1 - h = total_boxes[:,3]-total_boxes[:,1]+1 - points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1 - points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1 - if total_boxes.shape[0]>0: - total_boxes = bbreg(total_boxes.copy(), np.transpose(mv)) - pick = nms(total_boxes.copy(), 0.7, 'Min') - total_boxes = total_boxes[pick,:] - points = points[:,pick] - - return total_boxes, points - - -def bulk_detect_face(images, detection_window_size_ratio, pnet, rnet, onet, threshold, factor): - """Detects faces in a list of images - images: list containing input images - detection_window_size_ratio: ratio of minimum face size to smallest image dimension - pnet, rnet, onet: caffemodel - threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold [0-1] - factor: the factor used to create a scaling pyramid of face sizes to detect in the image. 
- """ - all_scales = [None] * len(images) - images_with_boxes = [None] * len(images) - - for i in range(len(images)): - images_with_boxes[i] = {'total_boxes': np.empty((0, 9))} - - # create scale pyramid - for index, img in enumerate(images): - all_scales[index] = [] - h = img.shape[0] - w = img.shape[1] - minsize = int(detection_window_size_ratio * np.minimum(w, h)) - factor_count = 0 - minl = np.amin([h, w]) - if minsize <= 12: - minsize = 12 - - m = 12.0 / minsize - minl = minl * m - while minl >= 12: - all_scales[index].append(m * np.power(factor, factor_count)) - minl = minl * factor - factor_count += 1 - - # # # # # # # # # # # # # - # first stage - fast proposal network (pnet) to obtain face candidates - # # # # # # # # # # # # # - - images_obj_per_resolution = {} - - # TODO: use some type of rounding to number module 8 to increase probability that pyramid images will have the same resolution across input images - - for index, scales in enumerate(all_scales): - h = images[index].shape[0] - w = images[index].shape[1] - - for scale in scales: - hs = int(np.ceil(h * scale)) - ws = int(np.ceil(w * scale)) - - if (ws, hs) not in images_obj_per_resolution: - images_obj_per_resolution[(ws, hs)] = [] - - im_data = imresample(images[index], (hs, ws)) - im_data = (im_data - 127.5) * 0.0078125 - img_y = np.transpose(im_data, (1, 0, 2)) # caffe uses different dimensions ordering - images_obj_per_resolution[(ws, hs)].append({'scale': scale, 'image': img_y, 'index': index}) - - for resolution in images_obj_per_resolution: - images_per_resolution = [i['image'] for i in images_obj_per_resolution[resolution]] - outs = pnet(images_per_resolution) - - for index in range(len(outs[0])): - scale = images_obj_per_resolution[resolution][index]['scale'] - image_index = images_obj_per_resolution[resolution][index]['index'] - out0 = np.transpose(outs[0][index], (1, 0, 2)) - out1 = np.transpose(outs[1][index], (1, 0, 2)) - - boxes, _ = generateBoundingBox(out1[:, :, 1].copy(), out0[:, :, :].copy(), scale, threshold[0]) - - # inter-scale nms - pick = nms(boxes.copy(), 0.5, 'Union') - if boxes.size > 0 and pick.size > 0: - boxes = boxes[pick, :] - images_with_boxes[image_index]['total_boxes'] = np.append(images_with_boxes[image_index]['total_boxes'], - boxes, - axis=0) - - for index, image_obj in enumerate(images_with_boxes): - numbox = image_obj['total_boxes'].shape[0] - if numbox > 0: - h = images[index].shape[0] - w = images[index].shape[1] - pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Union') - image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] - regw = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] - regh = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] - qq1 = image_obj['total_boxes'][:, 0] + image_obj['total_boxes'][:, 5] * regw - qq2 = image_obj['total_boxes'][:, 1] + image_obj['total_boxes'][:, 6] * regh - qq3 = image_obj['total_boxes'][:, 2] + image_obj['total_boxes'][:, 7] * regw - qq4 = image_obj['total_boxes'][:, 3] + image_obj['total_boxes'][:, 8] * regh - image_obj['total_boxes'] = np.transpose(np.vstack([qq1, qq2, qq3, qq4, image_obj['total_boxes'][:, 4]])) - image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) - image_obj['total_boxes'][:, 0:4] = np.fix(image_obj['total_boxes'][:, 0:4]).astype(np.int32) - dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) - - numbox = image_obj['total_boxes'].shape[0] - tempimg = np.zeros((24, 24, 3, numbox)) - - if numbox > 0: - for k in range(0, numbox): - tmp = 
np.zeros((int(tmph[k]), int(tmpw[k]), 3)) - tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] - if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: - tempimg[:, :, :, k] = imresample(tmp, (24, 24)) - else: - return np.empty() - - tempimg = (tempimg - 127.5) * 0.0078125 - image_obj['rnet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) - - # # # # # # # # # # # # # - # second stage - refinement of face candidates with rnet - # # # # # # # # # # # # # - - bulk_rnet_input = np.empty((0, 24, 24, 3)) - for index, image_obj in enumerate(images_with_boxes): - if 'rnet_input' in image_obj: - bulk_rnet_input = np.append(bulk_rnet_input, image_obj['rnet_input'], axis=0) - - out = rnet(bulk_rnet_input) - out0 = np.transpose(out[0]) - out1 = np.transpose(out[1]) - score = out1[1, :] - - i = 0 - for index, image_obj in enumerate(images_with_boxes): - if 'rnet_input' not in image_obj: - continue - - rnet_input_count = image_obj['rnet_input'].shape[0] - score_per_image = score[i:i + rnet_input_count] - out0_per_image = out0[:, i:i + rnet_input_count] - - ipass = np.where(score_per_image > threshold[1]) - image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), - np.expand_dims(score_per_image[ipass].copy(), 1)]) - - mv = out0_per_image[:, ipass[0]] - - if image_obj['total_boxes'].shape[0] > 0: - h = images[index].shape[0] - w = images[index].shape[1] - pick = nms(image_obj['total_boxes'], 0.7, 'Union') - image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] - image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv[:, pick])) - image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) - - numbox = image_obj['total_boxes'].shape[0] - - if numbox > 0: - tempimg = np.zeros((48, 48, 3, numbox)) - image_obj['total_boxes'] = np.fix(image_obj['total_boxes']).astype(np.int32) - dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) - - for k in range(0, numbox): - tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) - tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] - if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: - tempimg[:, :, :, k] = imresample(tmp, (48, 48)) - else: - return np.empty() - tempimg = (tempimg - 127.5) * 0.0078125 - image_obj['onet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) - - i += rnet_input_count - - # # # # # # # # # # # # # - # third stage - further refinement and facial landmarks positions with onet - # # # # # # # # # # # # # - - bulk_onet_input = np.empty((0, 48, 48, 3)) - for index, image_obj in enumerate(images_with_boxes): - if 'onet_input' in image_obj: - bulk_onet_input = np.append(bulk_onet_input, image_obj['onet_input'], axis=0) - - out = onet(bulk_onet_input) - - out0 = np.transpose(out[0]) - out1 = np.transpose(out[1]) - out2 = np.transpose(out[2]) - score = out2[1, :] - points = out1 - - i = 0 - ret = [] - for index, image_obj in enumerate(images_with_boxes): - if 'onet_input' not in image_obj: - ret.append(None) - continue - - onet_input_count = image_obj['onet_input'].shape[0] - - out0_per_image = out0[:, i:i + onet_input_count] - score_per_image = score[i:i + onet_input_count] - points_per_image = points[:, i:i + onet_input_count] - - ipass = np.where(score_per_image > threshold[2]) - points_per_image = points_per_image[:, ipass[0]] - - image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), 
- np.expand_dims(score_per_image[ipass].copy(), 1)]) - mv = out0_per_image[:, ipass[0]] - - w = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + 1 - h = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + 1 - points_per_image[0:5, :] = np.tile(w, (5, 1)) * points_per_image[0:5, :] + np.tile( - image_obj['total_boxes'][:, 0], (5, 1)) - 1 - points_per_image[5:10, :] = np.tile(h, (5, 1)) * points_per_image[5:10, :] + np.tile( - image_obj['total_boxes'][:, 1], (5, 1)) - 1 - - if image_obj['total_boxes'].shape[0] > 0: - image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv)) - pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Min') - image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] - points_per_image = points_per_image[:, pick] - - ret.append((image_obj['total_boxes'], points_per_image)) - else: - ret.append(None) - - i += onet_input_count - - return ret - - -# function [boundingbox] = bbreg(boundingbox,reg) -def bbreg(boundingbox,reg): - """Calibrate bounding boxes""" - if reg.shape[1]==1: - reg = np.reshape(reg, (reg.shape[2], reg.shape[3])) - - w = boundingbox[:,2]-boundingbox[:,0]+1 - h = boundingbox[:,3]-boundingbox[:,1]+1 - b1 = boundingbox[:,0]+reg[:,0]*w - b2 = boundingbox[:,1]+reg[:,1]*h - b3 = boundingbox[:,2]+reg[:,2]*w - b4 = boundingbox[:,3]+reg[:,3]*h - boundingbox[:,0:4] = np.transpose(np.vstack([b1, b2, b3, b4 ])) - return boundingbox - -def generateBoundingBox(imap, reg, scale, t): - """Use heatmap to generate bounding boxes""" - stride=2 - cellsize=12 - - imap = np.transpose(imap) - dx1 = np.transpose(reg[:,:,0]) - dy1 = np.transpose(reg[:,:,1]) - dx2 = np.transpose(reg[:,:,2]) - dy2 = np.transpose(reg[:,:,3]) - y, x = np.where(imap >= t) - if y.shape[0]==1: - dx1 = np.flipud(dx1) - dy1 = np.flipud(dy1) - dx2 = np.flipud(dx2) - dy2 = np.flipud(dy2) - score = imap[(y,x)] - reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ])) - if reg.size==0: - reg = np.empty((0,3)) - bb = np.transpose(np.vstack([y,x])) - q1 = np.fix((stride*bb+1)/scale) - q2 = np.fix((stride*bb+cellsize-1+1)/scale) - boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg]) - return boundingbox, reg - -# function pick = nms(boxes,threshold,type) -def nms(boxes, threshold, method): - if boxes.size==0: - return np.empty((0,3)) - x1 = boxes[:,0] - y1 = boxes[:,1] - x2 = boxes[:,2] - y2 = boxes[:,3] - s = boxes[:,4] - area = (x2-x1+1) * (y2-y1+1) - I = np.argsort(s) - pick = np.zeros_like(s, dtype=np.int16) - counter = 0 - while I.size>0: - i = I[-1] - pick[counter] = i - counter += 1 - idx = I[0:-1] - xx1 = np.maximum(x1[i], x1[idx]) - yy1 = np.maximum(y1[i], y1[idx]) - xx2 = np.minimum(x2[i], x2[idx]) - yy2 = np.minimum(y2[i], y2[idx]) - w = np.maximum(0.0, xx2-xx1+1) - h = np.maximum(0.0, yy2-yy1+1) - inter = w * h - if method is 'Min': - o = inter / np.minimum(area[i], area[idx]) - else: - o = inter / (area[i] + area[idx] - inter) - I = I[np.where(o<=threshold)] - pick = pick[0:counter] - return pick - -# function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h) -def pad(total_boxes, w, h): - """Compute the padding coordinates (pad the bounding boxes to square)""" - tmpw = (total_boxes[:,2]-total_boxes[:,0]+1).astype(np.int32) - tmph = (total_boxes[:,3]-total_boxes[:,1]+1).astype(np.int32) - numbox = total_boxes.shape[0] - - dx = np.ones((numbox), dtype=np.int32) - dy = np.ones((numbox), dtype=np.int32) - edx = tmpw.copy().astype(np.int32) - edy = tmph.copy().astype(np.int32) - - x = 
total_boxes[:,0].copy().astype(np.int32) - y = total_boxes[:,1].copy().astype(np.int32) - ex = total_boxes[:,2].copy().astype(np.int32) - ey = total_boxes[:,3].copy().astype(np.int32) - - tmp = np.where(ex>w) - edx.flat[tmp] = np.expand_dims(-ex[tmp]+w+tmpw[tmp],1) - ex[tmp] = w - - tmp = np.where(ey>h) - edy.flat[tmp] = np.expand_dims(-ey[tmp]+h+tmph[tmp],1) - ey[tmp] = h - - tmp = np.where(x<1) - dx.flat[tmp] = np.expand_dims(2-x[tmp],1) - x[tmp] = 1 - - tmp = np.where(y<1) - dy.flat[tmp] = np.expand_dims(2-y[tmp],1) - y[tmp] = 1 - - return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph - -# function [bboxA] = rerec(bboxA) -def rerec(bboxA): - """Convert bboxA to square.""" - h = bboxA[:,3]-bboxA[:,1] - w = bboxA[:,2]-bboxA[:,0] - l = np.maximum(w, h) - bboxA[:,0] = bboxA[:,0]+w*0.5-l*0.5 - bboxA[:,1] = bboxA[:,1]+h*0.5-l*0.5 - bboxA[:,2:4] = bboxA[:,0:2] + np.transpose(np.tile(l,(2,1))) - return bboxA - -def imresample(img, sz): - im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_LINEAR) #@UndefinedVariable - return im_data - - # This method is kept for debugging purpose -# h=img.shape[0] -# w=img.shape[1] -# hs, ws = sz -# dx = float(w) / ws -# dy = float(h) / hs -# im_data = np.zeros((hs,ws,3)) -# for a1 in range(0,hs): -# for a2 in range(0,ws): -# for a3 in range(0,3): -# im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3] -# return im_data - diff --git a/facelib/det3.npy b/facelib/mtcnn_onet.h5 similarity index 95% rename from facelib/det3.npy rename to facelib/mtcnn_onet.h5 index 90d5ba9..bd615de 100644 Binary files a/facelib/det3.npy and b/facelib/mtcnn_onet.h5 differ diff --git a/facelib/mtcnn_pnet.h5 b/facelib/mtcnn_pnet.h5 new file mode 100644 index 0000000..e13f81b Binary files /dev/null and b/facelib/mtcnn_pnet.h5 differ diff --git a/facelib/det2.npy b/facelib/mtcnn_rnet.h5 similarity index 88% rename from facelib/det2.npy rename to facelib/mtcnn_rnet.h5 index 85d5bf0..798a807 100644 Binary files a/facelib/det2.npy and b/facelib/mtcnn_rnet.h5 differ diff --git a/main.py b/main.py index eee0e7f..4735db7 100644 --- a/main.py +++ b/main.py @@ -137,6 +137,8 @@ if __name__ == "__main__": if arguments.tf_suppress_std: os.environ['TF_SUPPRESS_STD'] = '1' + #os.environ['force_plaidML'] = '1' + arguments.func(arguments) print ("Done.") diff --git a/mainscripts/Converter.py b/mainscripts/Converter.py index 0de31bb..8bc74af 100644 --- a/mainscripts/Converter.py +++ b/mainscripts/Converter.py @@ -149,7 +149,8 @@ class ConvertSubprocessor(SubprocessorBase): files_processed = 1 faces_processed = 0 - output_filename_path = self.output_path / filename_path.name + output_filename_path = self.output_path / (filename_path.stem + '.png') + if self.converter.get_mode() == ConverterBase.MODE_FACE and filename_path.stem not in self.alignments.keys(): if not self.debug: print ( 'no faces found for %s, copying without faces' % (filename_path.name) ) diff --git a/mainscripts/Extractor.py b/mainscripts/Extractor.py index ea7a359..8879ba6 100644 --- a/mainscripts/Extractor.py +++ b/mainscripts/Extractor.py @@ -62,23 +62,35 @@ class ExtractSubprocessor(SubprocessorBase): cv2.setMouseCallback(self.wnd_name, onMouse, self.param) - def get_devices_for_type (self, type, multi_gpu): - if (type == 'rects' or type == 'landmarks'): - if multi_gpu: - devices = nnlib.device.getDevicesWithAtLeastTotalMemoryGB(2) - - if not multi_gpu or len(devices) == 0: - devices = [nnlib.device.getBestDeviceIdx()] - - if len(devices) == 0: - devices = [0] + def get_devices_for_type (self, type, multi_gpu, 
cpu_only):
+        if not cpu_only and (type == 'rects' or type == 'landmarks'):
+            if type == 'rects' and self.detector == 'mt' and nnlib.device.backend == "plaidML":
+                cpu_only = True
+            else:
+                if multi_gpu:
+                    devices = nnlib.device.getValidDevicesWithAtLeastTotalMemoryGB(2)
+                if not multi_gpu or len(devices) == 0:
+                    devices = [nnlib.device.getBestValidDeviceIdx()]
+                if len(devices) == 0:
+                    devices = [0]
-            devices = [ (idx, nnlib.device.getDeviceName(idx), nnlib.device.getDeviceVRAMTotalGb(idx) ) for idx in devices]
+                for idx in devices:
+                    dev_name = nnlib.device.getDeviceName(idx)
+                    dev_vram = nnlib.device.getDeviceVRAMTotalGb(idx)
+
+                    if not self.manual and self.type == 'rects' and self.detector == 'mt':
+                        for i in range ( int (max (1, dev_vram / 2) ) ):
+                            yield (idx, 'GPU', '%s #%d' % (dev_name,i) , dev_vram)
+                    else:
+                        yield (idx, 'GPU', dev_name, dev_vram)
-        elif type == 'final':
-            devices = [ (i, 'CPU%d' % (i), 0 ) for i in range(0, multiprocessing.cpu_count()) ]
-
-        return devices
+        if cpu_only and (type == 'rects' or type == 'landmarks'):
+            for i in range( min(8, multiprocessing.cpu_count() // 2) ):
+                yield (i, 'CPU', 'CPU%d' % (i), 0 )
+
+        if type == 'final':
+            for i in range( min(8, multiprocessing.cpu_count()) ):
+                yield (i, 'CPU', 'CPU%d' % (i), 0 )
 
     #override
     def process_info_generator(self):
@@ -89,31 +101,13 @@ class ExtractSubprocessor(SubprocessorBase):
                      'output_dir': str(self.output_path),
                      'detector': self.detector}
 
-        if not self.cpu_only:
-            for (device_idx, device_name, device_total_vram_gb) in self.get_devices_for_type(self.type, self.multi_gpu):
-                num_processes = 1
-                if not self.manual and self.type == 'rects' and self.detector == 'mt':
-                    num_processes = int ( max (1, device_total_vram_gb / 2) )
-
-                for i in range(0, num_processes ):
-                    client_dict = base_dict.copy()
-                    client_dict['device_idx'] = device_idx
-                    client_dict['device_name'] = device_name if num_processes == 1 else '%s #%d' % (device_name,i)
-                    client_dict['device_type'] = 'GPU'
-
-                    yield client_dict['device_name'], {}, client_dict
-        else:
-            num_processes = 1
-            if not self.manual and self.type == 'rects' and self.detector == 'mt':
-                num_processes = int ( max (1, multiprocessing.cpu_count() / 2 ) )
-
-            for i in range(0, num_processes ):
-                client_dict = base_dict.copy()
-                client_dict['device_idx'] = 0
-                client_dict['device_name'] = 'CPU' if num_processes == 1 else 'CPU #%d' % (i),
-                client_dict['device_type'] = 'CPU'
-
-                yield client_dict['device_name'], {}, client_dict
+        for (device_idx, device_type, device_name, device_total_vram_gb) in self.get_devices_for_type(self.type, self.multi_gpu, self.cpu_only):
+            client_dict = base_dict.copy()
+            client_dict['device_idx'] = device_idx
+            client_dict['device_name'] = device_name
+            client_dict['device_type'] = device_type
+            yield client_dict['device_name'], {}, client_dict
+
     #override
     def get_no_process_started_message(self):
@@ -265,18 +259,17 @@ class ExtractSubprocessor(SubprocessorBase):
         self.detector = client_dict['detector']
 
         self.e = None
-        device_config = nnlib.DeviceConfig ( cpu_only=self.cpu_only, force_gpu_idx=self.device_idx, allow_growth=True)
         if self.type == 'rects':
             if self.detector is not None:
                 if self.detector == 'mt':
                     nnlib.import_all (device_config)
-                    self.e = facelib.MTCExtractor(nnlib.keras, nnlib.tf, nnlib.tf_sess)
+                    self.e = facelib.MTCExtractor()
                 elif self.detector == 'dlib':
                     nnlib.import_dlib (device_config)
                     self.e = facelib.DLIBExtractor(nnlib.dlib)
                 self.e.__enter__()
-
+
         elif self.type == 'landmarks':
             nnlib.import_all (device_config)
             self.e = facelib.LandmarksExtractor(nnlib.keras)
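The rewritten get_devices_for_type() above is now a generator yielding (device_idx, device_type, device_name, vram_gb) tuples, and spawns one MTCNN 'rects' worker per ~2 GB of GPU VRAM. A minimal sketch of that heuristic, with hypothetical VRAM values:

    def n_mt_workers(dev_vram_gb):
        # one MTCNN rects worker per ~2 GB of VRAM, at least one
        return int(max(1, dev_vram_gb / 2))

    assert [n_mt_workers(v) for v in (1.5, 4.0, 11.0)] == [1, 2, 5]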
diff --git a/models/ModelBase.py b/models/ModelBase.py
index a721d5e..265c861 100644
--- a/models/ModelBase.py
+++ b/models/ModelBase.py
@@ -22,7 +22,7 @@ class ModelBase(object):
     def __init__(self, model_path, training_data_src_path=None, training_data_dst_path=None, debug = False, force_gpu_idx=-1, **in_options):
         if force_gpu_idx == -1:
-            idxs_names_list = nnlib.device.getAllDevicesIdxsWithNamesList()
+            idxs_names_list = nnlib.device.getValidDevicesIdxsWithNamesList()
             if len(idxs_names_list) > 1:
                 print ("You have multi GPUs in a system: ")
                 for idx, name in idxs_names_list:
diff --git a/models/Model_DF/Model.py b/models/Model_DF/Model.py
index 9d7ac1b..531b3b0 100644
--- a/models/Model_DF/Model.py
+++ b/models/Model_DF/Model.py
@@ -16,14 +16,14 @@ class Model(ModelBase):
     def onInitializeOptions(self, is_first_run, ask_override):
         if is_first_run or ask_override:
             def_pixel_loss = self.options.get('pixel_loss', False)
-            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
+            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="The default DSSIM loss is good for initially learning the structure of faces. Switch to pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
         else:
             self.options['pixel_loss'] = self.options.get('pixel_loss', False)
 
     #override
     def onInitialize(self, **in_options):
         exec(nnlib.import_all(), locals(), globals())
-        self.set_vram_batch_requirements( {4.5:4,5:6,6:8,7:16,8:24,9:24,10:32,11:32,12:32,13:48} )
+        self.set_vram_batch_requirements( {4.5:4} )
 
         ae_input_layer = Input(shape=(128, 128, 3))
         mask_layer = Input(shape=(128, 128, 1)) #same as output
diff --git a/models/Model_H128/Model.py b/models/Model_H128/Model.py
index 98dcf6b..28e0016 100644
--- a/models/Model_H128/Model.py
+++ b/models/Model_H128/Model.py
@@ -24,14 +24,14 @@ class Model(ModelBase):
         if is_first_run or ask_override:
             def_pixel_loss = self.options.get('pixel_loss', False)
-            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
+            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="The default DSSIM loss is good for initially learning the structure of faces. Switch to pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
         else:
             self.options['pixel_loss'] = self.options.get('pixel_loss', False)
 
     #override
     def onInitialize(self, **in_options):
         exec(nnlib.import_all(), locals(), globals())
-        self.set_vram_batch_requirements( {2.5:2,3:2,4:2,4:4,5:8,6:12,7:16,8:16,9:24,10:24,11:32,12:32,13:48} )
+        self.set_vram_batch_requirements( {2.5:4} )
 
         bgr_shape, mask_shape, self.encoder, self.decoder_src, self.decoder_dst = self.Build( self.options['lighter_ae'] )
         if not self.is_first_run():
diff --git a/models/Model_H64/Model.py b/models/Model_H64/Model.py
index c2bb60b..39c7d25 100644
--- a/models/Model_H64/Model.py
+++ b/models/Model_H64/Model.py
@@ -24,15 +24,15 @@ class Model(ModelBase):
         if is_first_run or ask_override:
             def_pixel_loss = self.options.get('pixel_loss', False)
-            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
+            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="The default DSSIM loss is good for initially learning the structure of faces. Switch to pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
         else:
             self.options['pixel_loss'] = self.options.get('pixel_loss', False)
 
     #override
     def onInitialize(self, **in_options):
         exec(nnlib.import_all(), locals(), globals())
-        self.set_vram_batch_requirements( {1.5:2,2:2,3:8,4:16,5:24,6:32,7:40,8:48} )
-
+        self.set_vram_batch_requirements( {1.5:4} )
+
         bgr_shape, mask_shape, self.encoder, self.decoder_src, self.decoder_dst = self.Build(self.options['lighter_ae'])
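The pixel_loss switch that recurs in these models chooses between a structural (DSSIM) reconstruction term and a plain per-pixel term. A minimal sketch of that idea, assuming images scaled to [0, 1] and using tf.image.ssim; this is not the repository's exact DSSIM implementation:

    import tensorflow as tf

    def reconstruction_loss(y_true, y_pred, pixel_loss=False):
        if pixel_loss:
            # plain per-pixel MSE: sharper fine detail, but noisier early in training
            return tf.reduce_mean(tf.square(y_true - y_pred))
        # DSSIM = (1 - SSIM) / 2: rewards structural similarity over exact pixel values
        return tf.reduce_mean((1.0 - tf.image.ssim(y_true, y_pred, max_val=1.0)) / 2.0)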
diff --git a/models/Model_LIAEF128/Model.py b/models/Model_LIAEF128/Model.py
index f885f9b..e5674ea 100644
--- a/models/Model_LIAEF128/Model.py
+++ b/models/Model_LIAEF128/Model.py
@@ -17,14 +17,14 @@ class Model(ModelBase):
     def onInitializeOptions(self, is_first_run, ask_override):
         if is_first_run or ask_override:
             def_pixel_loss = self.options.get('pixel_loss', False)
-            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
+            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="The default DSSIM loss is good for initially learning the structure of faces. Switch to pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
         else:
             self.options['pixel_loss'] = self.options.get('pixel_loss', False)
 
     #override
     def onInitialize(self, **in_options):
         exec(nnlib.import_all(), locals(), globals())
-        self.set_vram_batch_requirements( {4.5:4,5:4,6:8,7:12,8:16,9:20,10:24,11:24,12:32,13:48} )
+        self.set_vram_batch_requirements( {4.5:4} )
 
         ae_input_layer = Input(shape=(128, 128, 3))
         mask_layer = Input(shape=(128, 128, 1)) #same as output
diff --git a/models/Model_SAE/Model.py b/models/Model_SAE/Model.py
index 73569e6..26e8a64 100644
--- a/models/Model_SAE/Model.py
+++ b/models/Model_SAE/Model.py
@@ -29,30 +29,14 @@ class SAEModel(ModelBase):
         if is_first_run:
             self.options['resolution'] = input_int("Resolution (64,128 ?:help skip:128) : ", default_resolution, [64,128], help_message="More resolution requires more VRAM.")
             self.options['face_type'] = input_str ("Half or Full face? (h/f, ?:help skip:f) : ", default_face_type, ['h','f'], help_message="Half face has better resolution, but covers less area of cheeks.").lower()
+            self.options['learn_mask'] = input_bool ("Learn mask? (y/n, ?:help skip:y) : ", True, help_message="Learning the mask can help the model to recognize face directions. Learning without a mask reduces model size; in that case the converter is forced to use a 'not predicted mask', which is not as smooth as a predicted one. A model with style values can be trained without a mask and produce the same quality result.")
             self.options['archi'] = input_str ("AE architecture (df, liae, ?:help skip:%s) : " % (default_archi) , default_archi, ['df','liae'], help_message="DF keeps faces more natural, while LIAE can fix overly different face shapes.").lower()
-            self.options['lighter_encoder'] = input_bool ("Use lightweight encoder? (y/n, ?:help skip:n) : ", False, help_message="Lightweight encoder is 35% faster, requires less VRAM, sacrificing overall quality.")
-            self.options['learn_mask'] = input_bool ("Learn mask? (y/n, ?:help skip:y) : ", True, help_message="Choose NO to reduce model size. In this case converter forced to use 'not predicted mask' that is not smooth as predicted. Styled SAE can learn without mask and produce same quality fake.")
         else:
             self.options['resolution'] = self.options.get('resolution', default_resolution)
             self.options['face_type'] = self.options.get('face_type', default_face_type)
+            self.options['learn_mask'] = self.options.get('learn_mask', True)
             self.options['archi'] = self.options.get('archi', default_archi)
-            self.options['lighter_encoder'] = self.options.get('lighter_encoder', False)
-            self.options['learn_mask'] = self.options.get('learn_mask', True)
-
-        default_face_style_power = 10.0
-        if is_first_run or ask_override:
-            default_face_style_power = default_face_style_power if is_first_run else self.options.get('face_style_power', default_face_style_power)
-            self.options['face_style_power'] = np.clip ( input_number("Face style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_face_style_power), default_face_style_power, help_message="How fast NN will learn dst face style during generalization of src and dst faces. If style is learned good enough, set this value to 0.01 to prevent artifacts appearing."), 0.0, 100.0 )
diff --git a/models/Model_SAE/Model.py b/models/Model_SAE/Model.py
index 73569e6..26e8a64 100644
--- a/models/Model_SAE/Model.py
+++ b/models/Model_SAE/Model.py
@@ -29,30 +29,14 @@ class SAEModel(ModelBase):
         if is_first_run:
             self.options['resolution'] = input_int("Resolution (64,128 ?:help skip:128) : ", default_resolution, [64,128], help_message="More resolution requires more VRAM.")
             self.options['face_type'] = input_str ("Half or Full face? (h/f, ?:help skip:f) : ", default_face_type, ['h','f'], help_message="Half face has better resolution, but covers less area of cheeks.").lower()
+            self.options['learn_mask'] = input_bool ("Learn mask? (y/n, ?:help skip:y) : ", True, help_message="Learning mask can help model to recognize face directions. Learn without mask can reduce model size, in this case converter forced to use 'not predicted mask' that is not smooth as predicted. Model with style values can be learned without mask and produce same quality result.")
             self.options['archi'] = input_str ("AE architecture (df, liae, ?:help skip:%s) : " % (default_archi) , default_archi, ['df','liae'], help_message="DF keeps faces more natural, while LIAE can fix overly different face shapes.").lower()
-            self.options['lighter_encoder'] = input_bool ("Use lightweight encoder? (y/n, ?:help skip:n) : ", False, help_message="Lightweight encoder is 35% faster, requires less VRAM, sacrificing overall quality.")
-            self.options['learn_mask'] = input_bool ("Learn mask? (y/n, ?:help skip:y) : ", True, help_message="Choose NO to reduce model size. In this case converter forced to use 'not predicted mask' that is not smooth as predicted. Styled SAE can learn without mask and produce same quality fake.")
         else:
             self.options['resolution'] = self.options.get('resolution', default_resolution)
             self.options['face_type'] = self.options.get('face_type', default_face_type)
+            self.options['learn_mask'] = self.options.get('learn_mask', True)
             self.options['archi'] = self.options.get('archi', default_archi)
-            self.options['lighter_encoder'] = self.options.get('lighter_encoder', False)
-            self.options['learn_mask'] = self.options.get('learn_mask', True)
-
-        default_face_style_power = 10.0
-        if is_first_run or ask_override:
-            default_face_style_power = default_face_style_power if is_first_run else self.options.get('face_style_power', default_face_style_power)
-            self.options['face_style_power'] = np.clip ( input_number("Face style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_face_style_power), default_face_style_power, help_message="How fast NN will learn dst face style during generalization of src and dst faces. If style is learned good enough, set this value to 0.01 to prevent artifacts appearing."), 0.0, 100.0 )
-        else:
-            self.options['face_style_power'] = self.options.get('face_style_power', default_face_style_power)
-
-        default_bg_style_power = 10.0
-        if is_first_run or ask_override:
-            default_bg_style_power = default_bg_style_power if is_first_run else self.options.get('bg_style_power', default_bg_style_power)
-            self.options['bg_style_power'] = np.clip ( input_number("Background style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_bg_style_power), default_bg_style_power, help_message="How fast NN will learn dst background style during generalization of src and dst faces. If style is learned good enough, set this value to 0.1-0.3 to prevent artifacts appearing."), 0.0, 100.0 )
-        else:
-            self.options['bg_style_power'] = self.options.get('bg_style_power', default_bg_style_power)
-
         default_ae_dims = 256 if self.options['archi'] == 'liae' else 512
         default_ed_ch_dims = 42
         if is_first_run:
@@ -62,13 +46,36 @@ class SAEModel(ModelBase):
             self.options['ae_dims'] = self.options.get('ae_dims', default_ae_dims)
             self.options['ed_ch_dims'] = self.options.get('ed_ch_dims', default_ed_ch_dims)
 
+        if is_first_run:
+            self.options['lighter_encoder'] = input_bool ("Use lightweight encoder? (y/n, ?:help skip:n) : ", False, help_message="Lightweight encoder is 35% faster, requires less VRAM, but sacrificing overall quality.")
+            self.options['multiscale_decoder'] = input_bool ("Use multiscale decoder? (y/n, ?:help skip:y) : ", True, help_message="Multiscale decoder helps to get better details.")
+        else:
+            self.options['lighter_encoder'] = self.options.get('lighter_encoder', False)
+            self.options['multiscale_decoder'] = self.options.get('multiscale_decoder', True)
+
+        default_face_style_power = 0.0
+        default_bg_style_power = 0.0
+        if is_first_run or ask_override:
+            def_pixel_loss = self.options.get('pixel_loss', False)
+            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 15-25k epochs to enhance fine details and decrease face jitter.")
+
+            default_face_style_power = default_face_style_power if is_first_run else self.options.get('face_style_power', default_face_style_power)
+            self.options['face_style_power'] = np.clip ( input_number("Face style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_face_style_power), default_face_style_power,
+                                                         help_message="Learn to transfer face style details such as light and color conditions. Warning: Enable it only after 10k epochs, when predicted face is clear enough to start learn style. Start from 0.1 value and check history changes."), 0.0, 100.0 )
+
+            default_bg_style_power = default_bg_style_power if is_first_run else self.options.get('bg_style_power', default_bg_style_power)
+            self.options['bg_style_power'] = np.clip ( input_number("Background style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_bg_style_power), default_bg_style_power,
+                                                       help_message="Learn to transfer image around face. This can make face more like dst."), 0.0, 100.0 )
+        else:
+            self.options['pixel_loss'] = self.options.get('pixel_loss', False)
+            self.options['face_style_power'] = self.options.get('face_style_power', default_face_style_power)
+            self.options['bg_style_power'] = self.options.get('bg_style_power', default_bg_style_power)
 
     #override
     def onInitialize(self, **in_options):
         exec(nnlib.import_all(), locals(), globals())
-        self.set_vram_batch_requirements({2:1,3:2,4:3,5:6,6:8,7:12,8:16})
+        self.set_vram_batch_requirements({1.5:4})
 
         resolution = self.options['resolution']
         ae_dims = self.options['ae_dims']
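Note: the style powers now default to 0.0 (off), and a later hunk in this file divides them by 100 before passing them to style_loss as its loss_weight, so the prompt's 0.0..100.0 scale maps to an effective weight of 0.0..1.0:

    # sketch: how the prompted value becomes a loss weight (names from the later loss hunk)
    face_style_power = 10.0 / 100.0   # user enters 10.0 -> weight 0.1
    # src_loss += style_loss(gaussian_blur_radius=resolution//16,
    #                        loss_weight=face_style_power, wnd_size=0)(psd, target)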
@@ -76,8 +83,10 @@ class SAEModel(ModelBase):
         adapt_k_size = False
         bgr_shape = (resolution, resolution, 3)
         mask_shape = (resolution, resolution, 1)
-
-        dssim_pixel_alpha = Input( (1,) )
+
+        self.ms_count = ms_count = 3 if self.options['multiscale_decoder'] else 1
+
+        epoch_alpha = Input( (1,) )
         warped_src = Input(bgr_shape)
         target_src = Input(bgr_shape)
         target_srcm = Input(mask_shape)
@@ -85,7 +94,12 @@ class SAEModel(ModelBase):
         warped_dst = Input(bgr_shape)
         target_dst = Input(bgr_shape)
         target_dstm = Input(mask_shape)
-
+
+        target_src_ar = [ Input ( ( bgr_shape[0] // (2**i) ,)*2 + (bgr_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
+        target_srcm_ar = [ Input ( ( mask_shape[0] // (2**i) ,)*2 + (mask_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
+        target_dst_ar = [ Input ( ( bgr_shape[0] // (2**i) ,)*2 + (bgr_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
+        target_dstm_ar = [ Input ( ( mask_shape[0] // (2**i) ,)*2 + (mask_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
+
         if self.options['archi'] == 'liae':
             self.encoder = modelify(SAEModel.LIAEEncFlow(resolution, adapt_k_size, self.options['lighter_encoder'], ed_ch_dims=ed_ch_dims) ) (Input(bgr_shape))
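Note: the target_*_ar lists build one Input per pyramid level, ordered coarse to fine. A quick check of the shape arithmetic, assuming resolution=128 and multiscale_decoder on (ms_count=3):

    # sketch: sizes produced by ( bgr_shape[0] // (2**i) ,)*2 for i in range(2, -1, -1)
    resolution = 128
    sizes = [(resolution // (2**i),)*2 + (3,) for i in range(2, -1, -1)]
    print(sizes)  # [(32, 32, 3), (64, 64, 3), (128, 128, 3)]

The coarse-to-fine order matters later: the decoder emits its multiscale outputs in the same ascending-resolution order.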
@@ -96,10 +110,10 @@ class SAEModel(ModelBase):
 
             inter_output_Inputs = [ Input( np.array(K.int_shape(x)[1:])*(1,1,2) ) for x in self.inter_B.outputs ]
 
-            self.decoder = modelify(SAEModel.LIAEDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_decoder=True)) (inter_output_Inputs)
+            self.decoder = modelify(SAEModel.LIAEDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_count=self.ms_count )) (inter_output_Inputs)
 
             if self.options['learn_mask']:
-                self.decoderm = modelify(SAEModel.LIAEDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5), multiscale_decoder=False )) (inter_output_Inputs)
+                self.decoderm = modelify(SAEModel.LIAEDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5) )) (inter_output_Inputs)
 
             if not self.is_first_run():
                 self.encoder.load_weights (self.get_strpath_storage_for_file(self.encoderH5))
@@ -128,20 +142,18 @@ class SAEModel(ModelBase):
                 pred_src_srcm = self.decoderm(warped_src_inter_code)
                 pred_dst_dstm = self.decoderm(warped_dst_inter_code)
                 pred_src_dstm = self.decoderm(warped_src_dst_inter_code)
-
-
         else:
             self.encoder = modelify(SAEModel.DFEncFlow(resolution, adapt_k_size, self.options['lighter_encoder'], ae_dims=ae_dims, ed_ch_dims=ed_ch_dims) ) (Input(bgr_shape))
 
             dec_Inputs = [ Input(K.int_shape(x)[1:]) for x in self.encoder.outputs ]
 
-            self.decoder_src = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_decoder=True)) (dec_Inputs)
-            self.decoder_dst = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_decoder=True)) (dec_Inputs)
+            self.decoder_src = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_count=self.ms_count )) (dec_Inputs)
+            self.decoder_dst = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_count=self.ms_count )) (dec_Inputs)
 
             if self.options['learn_mask']:
-                self.decoder_srcm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5), multiscale_decoder=False)) (dec_Inputs)
-                self.decoder_dstm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5), multiscale_decoder=False)) (dec_Inputs)
+                self.decoder_srcm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5) )) (dec_Inputs)
+                self.decoder_dstm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5) )) (dec_Inputs)
 
             if not self.is_first_run():
                 self.encoder.load_weights (self.get_strpath_storage_for_file(self.encoderH5))
@@ -166,19 +178,12 @@ class SAEModel(ModelBase):
 
         if self.options['learn_mask']:
             pred_src_srcm, pred_dst_dstm, pred_src_dstm = [ [x] if type(x) != list else x for x in [pred_src_srcm, pred_dst_dstm, pred_src_dstm] ]
-
-        ms_count = len(pred_src_src)
-
-        target_src_ar = [ target_src if i == 0 else tf.image.resize_bicubic( target_src, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
-        target_srcm_ar = [ target_srcm if i == 0 else tf.image.resize_bicubic( target_srcm, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
-        target_dst_ar = [ target_dst if i == 0 else tf.image.resize_bicubic( target_dst, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
-        target_dstm_ar = [ target_dstm if i == 0 else tf.image.resize_bicubic( target_dstm, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
 
-        target_srcm_blurred_ar = [ tf_gaussian_blur( max(1, x.get_shape().as_list()[1] // 32) )(x) for x in target_srcm_ar]
+        target_srcm_blurred_ar = [ gaussian_blur( max(1, K.int_shape(x)[1] // 32) )(x) for x in target_srcm_ar]
         target_srcm_sigm_ar = [ x / 2.0 + 0.5 for x in target_srcm_blurred_ar]
         target_srcm_anti_sigm_ar = [ 1.0 - x for x in target_srcm_sigm_ar]
 
-        target_dstm_blurred_ar = [ tf_gaussian_blur( max(1, x.get_shape().as_list()[1] // 32) )(x) for x in target_dstm_ar]
+        target_dstm_blurred_ar = [ gaussian_blur( max(1, K.int_shape(x)[1] // 32) )(x) for x in target_dstm_ar]
         target_dstm_sigm_ar = [ x / 2.0 + 0.5 for x in target_dstm_blurred_ar]
         target_dstm_anti_sigm_ar = [ 1.0 - x for x in target_dstm_sigm_ar]
@@ -199,9 +204,7 @@ class SAEModel(ModelBase):
 
         if self.is_training_mode:
             def optimizer():
                 return Adam(lr=5e-5, beta_1=0.5, beta_2=0.999)
-
-            dssim_pixel_alpha_value = dssim_pixel_alpha[0][0]
-
+
             if self.options['archi'] == 'liae':
                 src_dst_loss_train_weights = self.encoder.trainable_weights + self.inter_B.trainable_weights + self.inter_AB.trainable_weights + self.decoder.trainable_weights
                 if self.options['learn_mask']:
@@ -210,35 +213,51 @@ class SAEModel(ModelBase):
                 src_dst_loss_train_weights = self.encoder.trainable_weights + self.decoder_src.trainable_weights + self.decoder_dst.trainable_weights
                 if self.options['learn_mask']:
                     src_dst_mask_loss_train_weights = self.encoder.trainable_weights + self.decoder_srcm.trainable_weights + self.decoder_dstm.trainable_weights
-
-            src_dssim_loss_batch = sum([ ( 100*K.square(tf_dssim(2.0)( target_src_masked_ar[i], pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ) )) for i in range(len(target_src_masked_ar)) ])
-            src_pixel_loss_batch = sum([ tf_reduce_mean ( 100*K.square( target_src_masked_ar[i] - pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_src_masked_ar)) ])
-            src_loss_batch = src_dssim_loss_batch*(1.0-dssim_pixel_alpha_value) + src_pixel_loss_batch*dssim_pixel_alpha_value
+            if not self.options['pixel_loss']:
+                src_loss_batch = sum([ ( 100*K.square( dssim(max_value=2.0)( target_src_masked_ar[i], pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ) )) for i in range(len(target_src_masked_ar)) ])
+            else:
+                src_loss_batch = sum([ K.mean ( 100*K.square( target_src_masked_ar[i] - pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_src_masked_ar)) ])
+
             src_loss = K.mean(src_loss_batch)
 
-            if self.options['face_style_power'] != 0:
-                face_style_power = self.options['face_style_power'] / 100.0
-                src_loss += tf_style_loss(gaussian_blur_radius=resolution // 8, loss_weight=0.2*face_style_power)( psd_target_dst_masked_ar[-1], target_dst_masked_ar[-1] )
-
-            if self.options['bg_style_power'] != 0:
-                bg_style_power = self.options['bg_style_power'] / 100.0
-                bg_dssim_loss = K.mean( (100*bg_style_power)*K.square(tf_dssim(2.0)( psd_target_dst_anti_masked_ar[-1], target_dst_anti_masked_ar[-1] )))
-                bg_pixel_loss = K.mean( (100*bg_style_power)*K.square( psd_target_dst_anti_masked_ar[-1] - target_dst_anti_masked_ar[-1] ))
-                src_loss += bg_dssim_loss*(1.0-dssim_pixel_alpha_value) + bg_pixel_loss*dssim_pixel_alpha_value
+            face_style_power = self.options['face_style_power'] / 100.0
+
+            if face_style_power != 0:
+                src_loss += style_loss(gaussian_blur_radius=resolution//16, loss_weight=face_style_power, wnd_size=0)( psd_target_dst_masked_ar[-1], target_dst_masked_ar[-1] )
 
-            dst_dssim_loss_batch = sum([ ( 100*K.square(tf_dssim(2.0)( target_dst_masked_ar[i], pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ) )) for i in range(len(target_dst_masked_ar)) ])
-            dst_pixel_loss_batch = sum([ tf_reduce_mean ( 100*K.square( target_dst_masked_ar[i] - pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_dst_masked_ar)) ])
-            dst_loss_batch = dst_dssim_loss_batch*(1.0-dssim_pixel_alpha_value) + dst_pixel_loss_batch*dssim_pixel_alpha_value
+            bg_style_power = self.options['bg_style_power'] / 100.0
+            if bg_style_power != 0:
+                if not self.options['pixel_loss']:
+                    bg_loss = K.mean( (100*bg_style_power)*K.square(dssim(max_value=2.0)( psd_target_dst_anti_masked_ar[-1], target_dst_anti_masked_ar[-1] )))
+                else:
+                    bg_loss = K.mean( (100*bg_style_power)*K.square( psd_target_dst_anti_masked_ar[-1] - target_dst_anti_masked_ar[-1] ))
+                src_loss += bg_loss
+
+            if not self.options['pixel_loss']:
+                dst_loss_batch = sum([ ( 100*K.square(dssim(max_value=2.0)( target_dst_masked_ar[i], pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ) )) for i in range(len(target_dst_masked_ar)) ])
+            else:
+                dst_loss_batch = sum([ K.mean ( 100*K.square( target_dst_masked_ar[i] - pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_dst_masked_ar)) ])
+
             dst_loss = K.mean(dst_loss_batch)
 
-            self.src_dst_train = K.function ([dssim_pixel_alpha, warped_src, target_src, target_srcm, warped_dst, target_dst, target_dstm ],[src_loss,dst_loss,src_loss_batch,dst_loss_batch], optimizer().get_updates(src_loss+dst_loss, src_dst_loss_train_weights) )
-
-
+            feed = [warped_src, warped_dst]
+            feed += target_src_ar[::-1]
+            feed += target_srcm_ar[::-1]
+            feed += target_dst_ar[::-1]
+            feed += target_dstm_ar[::-1]
+
+            self.src_dst_train = K.function (feed,[src_loss,dst_loss], optimizer().get_updates(src_loss+dst_loss, src_dst_loss_train_weights) )
+
             if self.options['learn_mask']:
                 src_mask_loss = sum([ K.mean(K.square(target_srcm_ar[-1]-pred_src_srcm[-1])) for i in range(len(target_srcm_ar)) ])
                 dst_mask_loss = sum([ K.mean(K.square(target_dstm_ar[-1]-pred_dst_dstm[-1])) for i in range(len(target_dstm_ar)) ])
-                self.src_dst_mask_train = K.function ([warped_src, target_srcm, warped_dst, target_dstm],[src_mask_loss, dst_mask_loss], optimizer().get_updates(src_mask_loss+dst_mask_loss, src_dst_mask_loss_train_weights) )
+
+                feed = [ warped_src, warped_dst]
+                feed += target_srcm_ar[::-1]
+                feed += target_dstm_ar[::-1]
+
+                self.src_dst_mask_train = K.function (feed,[src_mask_loss, dst_mask_loss], optimizer().get_updates(src_mask_loss+dst_mask_loss, src_dst_mask_loss_train_weights) )
 
             if self.options['learn_mask']:
                 self.AE_view = K.function ([warped_src, warped_dst], [pred_src_src[-1], pred_dst_dst[-1], pred_src_dst[-1], pred_src_dstm[-1]])
@@ -257,21 +276,20 @@ class SAEModel(ModelBase):
 
             f = SampleProcessor.TypeFlags
             face_type = f.FACE_ALIGN_FULL if self.options['face_type'] == 'f' else f.FACE_ALIGN_HALF
+
+            output_sample_types=[ [f.WARPED_TRANSFORMED | face_type | f.MODE_BGR, resolution] ]
+            output_sample_types += [ [f.TRANSFORMED | face_type | f.MODE_BGR, resolution // (2**i) ] for i in range(ms_count)]
+            output_sample_types += [ [f.TRANSFORMED | face_type | f.MODE_M | f.FACE_MASK_FULL, resolution // (2**i) ] for i in range(ms_count)]
+
             self.set_training_data_generators ([
                     SampleGeneratorFace(self.training_data_src_path, sort_by_yaw_target_samples_path=self.training_data_dst_path if self.sort_by_yaw else None,
                                         debug=self.is_debug(), batch_size=self.batch_size,
                         sample_process_options=SampleProcessor.Options(random_flip=self.random_flip, normalize_tanh = True, scale_range=np.array([-0.05, 0.05])+self.src_scale_mod / 100.0 ),
-                        output_sample_types=[ [f.WARPED_TRANSFORMED | face_type | f.MODE_BGR, resolution],
-                                              [f.TRANSFORMED | face_type | f.MODE_BGR, resolution],
-                                              [f.TRANSFORMED | face_type | f.MODE_M | f.FACE_MASK_FULL, resolution]
-                                            ], add_sample_idx=True ),
+                        output_sample_types=output_sample_types ),
 
                     SampleGeneratorFace(self.training_data_dst_path, debug=self.is_debug(), batch_size=self.batch_size,
                         sample_process_options=SampleProcessor.Options(random_flip=self.random_flip, normalize_tanh = True),
-                        output_sample_types=[ [f.WARPED_TRANSFORMED | face_type | f.MODE_BGR, resolution],
-                                              [f.TRANSFORMED | face_type | f.MODE_BGR, resolution],
-                                              [f.TRANSFORMED | face_type | f.MODE_M | f.FACE_MASK_FULL, resolution]
-                                            ], add_sample_idx=True )
+                        output_sample_types=output_sample_types )
                 ])
 
     #override
     def onSave(self):
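Note: with ms_count=3 and resolution=128, output_sample_types expands to seven entries per sample: the warped input at 128, TRANSFORMED BGR targets at 128/64/32, then full-face masks at 128/64/32. That descending order is why the ascending Input lists are reversed with [::-1] when the K.function feeds are assembled:

    # sketch: the resolutions the two list comprehensions generate
    resolution, ms_count = 128, 3
    print([resolution // (2**i) for i in range(ms_count)])  # [128, 64, 32]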
@@ -297,17 +315,20 @@ class SAEModel(ModelBase):
 
     #override
     def onTrainOneEpoch(self, generators_samples, generators_list):
-        warped_src, target_src, target_src_mask, src_sample_idxs = generators_samples[0]
-        warped_dst, target_dst, target_dst_mask, dst_sample_idxs = generators_samples[1]
+        src_samples = generators_samples[0]
+        dst_samples = generators_samples[1]
 
-        dssim_pixel_alpha = np.clip ( (self.epoch - 5000) / 15000.0, 0.0, 1.0 ) #smooth transition between DSSIM and MSE in 5-20k epochs
-        dssim_pixel_alpha = np.repeat( dssim_pixel_alpha, (self.batch_size,) )
-        dssim_pixel_alpha = np.expand_dims(dssim_pixel_alpha,-1)
-
-        src_loss, dst_loss, src_sample_losses, dst_sample_losses = self.src_dst_train ([dssim_pixel_alpha, warped_src, target_src, target_src_mask, warped_dst, target_dst, target_dst_mask])
+        feed = [src_samples[0], dst_samples[0] ] + \
+               src_samples[1:1+self.ms_count*2] + \
+               dst_samples[1:1+self.ms_count*2]
+
+        src_loss, dst_loss, = self.src_dst_train (feed)
 
         if self.options['learn_mask']:
-            src_mask_loss, dst_mask_loss, = self.src_dst_mask_train ([warped_src, target_src_mask, warped_dst, target_dst_mask])
+            feed = [ src_samples[0], dst_samples[0] ] + \
+                   src_samples[1+self.ms_count:1+self.ms_count*2] + \
+                   dst_samples[1+self.ms_count:1+self.ms_count*2]
+            src_mask_loss, dst_mask_loss, = self.src_dst_mask_train (feed)
 
         return ( ('src_loss', src_loss), ('dst_loss', dst_loss) )
@@ -430,7 +451,7 @@ class SAEModel(ModelBase):
         return func
 
     @staticmethod
-    def LIAEDecFlow(output_nc,ed_ch_dims=21, multiscale_decoder=True):
+    def LIAEDecFlow(output_nc,ed_ch_dims=21, multiscale_count=1):
         exec (nnlib.import_all(), locals(), globals())
         ed_dims = output_nc * ed_ch_dims
@@ -449,12 +470,12 @@ class SAEModel(ModelBase):
             outputs = []
             x1 = upscale(ed_dims*8)( x )
 
-            if multiscale_decoder:
+            if multiscale_count >= 3:
                 outputs += [ to_bgr() ( x1 ) ]
 
             x2 = upscale(ed_dims*4)( x1 )
 
-            if multiscale_decoder:
+            if multiscale_count >= 2:
                 outputs += [ to_bgr() ( x2 ) ]
 
             x3 = upscale(ed_dims*2)( x2 )
@@ -513,7 +534,7 @@ class SAEModel(ModelBase):
         return func
 
     @staticmethod
-    def DFDecFlow(output_nc, ed_ch_dims=21, multiscale_decoder=True):
+    def DFDecFlow(output_nc, ed_ch_dims=21, multiscale_count=1):
         exec (nnlib.import_all(), locals(), globals())
         ed_dims = output_nc * ed_ch_dims
@@ -535,12 +556,12 @@ class SAEModel(ModelBase):
             outputs = []
             x1 = upscale(ed_dims*8)( x )
 
-            if multiscale_decoder:
+            if multiscale_count >= 3:
                 outputs += [ to_bgr() ( x1 ) ]
 
             x2 = upscale(ed_dims*4)( x1 )
 
-            if multiscale_decoder:
+            if multiscale_count >= 2:
                 outputs += [ to_bgr() ( x2 ) ]
 
             x3 = upscale(ed_dims*2)( x2 )
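Note: onTrainOneEpoch now slices generator output by position: index 0 is the warped input, indices 1..ms_count the BGR pyramid, indices 1+ms_count..2*ms_count the mask pyramid. The decoder hunks keep this consistent: with multiscale_count=3 every to_bgr() branch fires, so the decoder returns quarter-, half- and full-resolution outputs in the same ascending order as the target Input lists. The slice arithmetic:

    # sketch: slice bounds for ms_count=3 (mirrors the feeds built above)
    ms_count = 3
    print(1, 1 + ms_count*2)             # BGR + mask targets: samples[1:7]
    print(1 + ms_count, 1 + ms_count*2)  # mask targets only:  samples[4:7]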
diff --git a/nnlib/device.py b/nnlib/device.py
new file mode 100644
index 0000000..fb25278
--- /dev/null
+++ b/nnlib/device.py
@@ -0,0 +1,333 @@
+import os
+import json
+import numpy as np
+from .pynvml import *
+
+tf_min_req_cap = 37 #min req compute capability for tensorflow-gpu==1.11.0
+
+class device:
+    backend = None
+    class Config():
+        force_gpu_idx = -1
+        multi_gpu = False
+        force_gpu_idxs = None
+        choose_worst_gpu = False
+        gpu_idxs = []
+        gpu_names = []
+        gpu_compute_caps = []
+        gpu_vram_gb = []
+        allow_growth = True
+        use_fp16 = False
+        cpu_only = False
+        backend = None
+
+        def __init__ (self, force_gpu_idx = -1,
+                            multi_gpu = False,
+                            force_gpu_idxs = None,
+                            choose_worst_gpu = False,
+                            allow_growth = True,
+                            use_fp16 = False,
+                            cpu_only = False,
+                            **in_options):
+
+            self.backend = device.backend
+            self.use_fp16 = use_fp16
+            self.cpu_only = cpu_only
+
+            if not self.cpu_only:
+                self.cpu_only = (self.backend == "tensorflow-cpu")
+
+            if not self.cpu_only:
+                self.force_gpu_idx = force_gpu_idx
+                self.multi_gpu = multi_gpu
+                self.force_gpu_idxs = force_gpu_idxs
+                self.choose_worst_gpu = choose_worst_gpu
+                self.allow_growth = allow_growth
+
+                self.gpu_idxs = []
+
+                if force_gpu_idxs is not None:
+                    for idx in force_gpu_idxs.split(','):
+                        idx = int(idx)
+                        if device.isValidDeviceIdx(idx):
+                            self.gpu_idxs.append(idx)
+                else:
+                    gpu_idx = force_gpu_idx if (force_gpu_idx >= 0 and device.isValidDeviceIdx(force_gpu_idx)) else device.getBestValidDeviceIdx() if not choose_worst_gpu else device.getWorstValidDeviceIdx()
+                    if gpu_idx != -1:
+                        if self.multi_gpu:
+                            self.gpu_idxs = device.getDeviceIdxsEqualModel( gpu_idx )
+                            if len(self.gpu_idxs) <= 1:
+                                self.multi_gpu = False
+                        else:
+                            self.gpu_idxs = [gpu_idx]
+
+                self.cpu_only = (len(self.gpu_idxs) == 0)
+
+            if not self.cpu_only:
+                self.gpu_names = []
+                self.gpu_compute_caps = []
+                self.gpu_vram_gb = []
+                for gpu_idx in self.gpu_idxs:
+                    self.gpu_names += [device.getDeviceName(gpu_idx)]
+                    self.gpu_compute_caps += [ device.getDeviceComputeCapability(gpu_idx) ]
+                    self.gpu_vram_gb += [ device.getDeviceVRAMTotalGb(gpu_idx) ]
+                self.cpu_only = (len(self.gpu_idxs) == 0)
+
+            if self.cpu_only:
+                self.backend = "tensorflow-cpu"
+
+    @staticmethod
+    def getValidDeviceIdxsEnumerator():
+        if device.backend == "plaidML":
+            for i in range(plaidML_devices_count):
+                yield i
+        elif device.backend == "tensorflow":
+            for gpu_idx in range(nvmlDeviceGetCount()):
+                cap = device.getDeviceComputeCapability (gpu_idx)
+                if cap >= tf_min_req_cap:
+                    yield gpu_idx
+        elif device.backend == "tensorflow-generic":
+            yield 0
+
+    @staticmethod
+    def getValidDevicesWithAtLeastTotalMemoryGB(totalmemsize_gb):
+        result = []
+        if device.backend == "plaidML":
+            for i in device.getValidDeviceIdxsEnumerator():
+                if plaidML_devices[i]['globalMemSize'] >= totalmemsize_gb*1024*1024*1024:
+                    result.append (i)
+        elif device.backend == "tensorflow":
+            for i in device.getValidDeviceIdxsEnumerator():
+                handle = nvmlDeviceGetHandleByIndex(i)
+                memInfo = nvmlDeviceGetMemoryInfo( handle )
+                if (memInfo.total) >= totalmemsize_gb*1024*1024*1024:
+                    result.append (i)
+        elif device.backend == "tensorflow-generic":
+            return [0]
+
+        return result
+
+    @staticmethod
+    def getAllDevicesIdxsList():
+        if device.backend == "plaidML":
+            return [ *range(plaidML_devices_count) ]
+        elif device.backend == "tensorflow":
+            return [ *range(nvmlDeviceGetCount() ) ]
+        elif device.backend == "tensorflow-generic":
+            return [0]
+
+    @staticmethod
+    def getValidDevicesIdxsWithNamesList():
+        if device.backend == "plaidML":
+            return [ (i, plaidML_devices[i]['description'] ) for i in device.getValidDeviceIdxsEnumerator() ]
+        elif device.backend == "tensorflow":
+            return [ (i, nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() ) for i in device.getValidDeviceIdxsEnumerator() ]
+        elif device.backend == "tensorflow-cpu":
+            return [ (0, 'CPU') ]
+        elif device.backend == "tensorflow-generic":
+            return [ (0, device.getDeviceName(0) ) ]
+
+    @staticmethod
+    def getDeviceVRAMTotalGb (idx):
+        if device.backend == "plaidML":
+            if idx < plaidML_devices_count:
+                return plaidML_devices[idx]['globalMemSize'] / (1024*1024*1024)
+        elif device.backend == "tensorflow":
+            if idx < nvmlDeviceGetCount():
+                memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(idx) )
+                return round ( memInfo.total / (1024*1024*1024) )
+
+            return 0
+        elif device.backend == "tensorflow-generic":
+            return 2
+
+    @staticmethod
+    def getBestValidDeviceIdx():
+        if device.backend == "plaidML":
+            idx = -1
+            idx_mem = 0
+            for i in device.getValidDeviceIdxsEnumerator():
+                total = plaidML_devices[i]['globalMemSize']
+                if total > idx_mem:
+                    idx = i
+                    idx_mem = total
+
+            return idx
+        elif device.backend == "tensorflow":
+            idx = -1
+            idx_mem = 0
+            for i in device.getValidDeviceIdxsEnumerator():
+                memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
+                if memInfo.total > idx_mem:
+                    idx = i
+                    idx_mem = memInfo.total
+
+            return idx
+        elif device.backend == "tensorflow-generic":
+            return 0
+
+    @staticmethod
+    def getWorstValidDeviceIdx():
+        if device.backend == "plaidML":
+            idx = -1
+            idx_mem = sys.maxsize
+            for i in device.getValidDeviceIdxsEnumerator():
+                total = plaidML_devices[i]['globalMemSize']
+                if total < idx_mem:
+                    idx = i
+                    idx_mem = total
+
+            return idx
+        elif device.backend == "tensorflow":
+            idx = -1
+            idx_mem = sys.maxsize
+            for i in device.getValidDeviceIdxsEnumerator():
+                memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
+                if memInfo.total < idx_mem:
+                    idx = i
+                    idx_mem = memInfo.total
+
+            return idx
+        elif device.backend == "tensorflow-generic":
+            return 0
+
+    @staticmethod
+    def isValidDeviceIdx(idx):
+        if device.backend == "plaidML":
+            return idx in [*device.getValidDeviceIdxsEnumerator()]
+        elif device.backend == "tensorflow":
+            return idx in [*device.getValidDeviceIdxsEnumerator()]
+        elif device.backend == "tensorflow-generic":
+            return (idx == 0)
+
+    @staticmethod
+    def getDeviceIdxsEqualModel(idx):
+        if device.backend == "plaidML":
+            result = []
+            idx_name = plaidML_devices[idx]['description']
+            for i in device.getValidDeviceIdxsEnumerator():
+                if plaidML_devices[i]['description'] == idx_name:
+                    result.append (i)
+
+            return result
+        elif device.backend == "tensorflow":
+            result = []
+            idx_name = nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
+            for i in device.getValidDeviceIdxsEnumerator():
+                if nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() == idx_name:
+                    result.append (i)
+
+            return result
+        elif device.backend == "tensorflow-generic":
+            return [0] if idx == 0 else []
+
+    @staticmethod
+    def getDeviceName (idx):
+        if device.backend == "plaidML":
+            if idx < plaidML_devices_count:
+                return plaidML_devices[idx]['description']
+        elif device.backend == "tensorflow":
+            if idx < nvmlDeviceGetCount():
+                return nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
+        elif device.backend == "tensorflow-generic":
+            if idx == 0:
+                return "Generic GeForce GPU"
+
+        return None
+
+    @staticmethod
+    def getDeviceID (idx):
+        if device.backend == "plaidML":
+            if idx < plaidML_devices_count:
+                return plaidML_devices[idx]['id'].decode()
+
+        return None
+
+    @staticmethod
+    def getDeviceComputeCapability(idx):
+        result = 0
+        if device.backend == "plaidML":
+            return 99
+        elif device.backend == "tensorflow":
+            if idx < nvmlDeviceGetCount():
+                result = nvmlDeviceGetCudaComputeCapability(nvmlDeviceGetHandleByIndex(idx))
+        elif device.backend == "tensorflow-generic":
+            return 99 if idx == 0 else 0
+
+        return result[0] * 10 + result[1]
+
+
+force_plaidML = os.environ.get("force_plaidML", "0") == "1"
+has_nvml = False
+has_nvml_cap = False
+has_nvidia_device = False
+plaidML_devices = []
+
+# Using plaidML OpenCL backend to determine system devices and has_nvidia_device
+try:
+    os.environ['PLAIDML_EXPERIMENTAL'] = 'false' #this lets plaidML work without running 'plaidml-setup' first
+    import plaidml
+    ctx = plaidml.Context()
+    for d in plaidml.devices(ctx, return_all=True)[0]:
+        details = json.loads(d.details)
+        if 'nvidia' in details['vendor'].lower():
+            has_nvidia_device = True
+        plaidML_devices += [ {'id':d.id,
+                              'globalMemSize' : int(details['globalMemSize']),
+                              'description' : d.description.decode()
+                              }]
+    ctx.shutdown()
+except:
+    pass
+
+plaidML_devices_count = len(plaidML_devices)
+
+#choosing backend
+
+if device.backend is None:
+    #first try to load NVSMI and detect CUDA devices for the tensorflow backend,
+    #even if force_plaidML is chosen, because if plaidML fails we can fall back to tensorflow
+    try:
+        nvmlInit()
+        has_nvml = True
+        device.backend = "tensorflow" #set tensorflow backend in order to use device.*device() functions
+
+        gpu_idxs = device.getAllDevicesIdxsList()
+        gpu_caps = np.array ( [ device.getDeviceComputeCapability(gpu_idx) for gpu_idx in gpu_idxs ] )
+
+        if len ( np.ndarray.flatten ( np.argwhere (gpu_caps >= tf_min_req_cap) ) ) == 0:
+            if not force_plaidML:
+                print ("No CUDA devices found with minimum required compute capability: %d.%d. Falling back to OpenCL mode." % (tf_min_req_cap // 10, tf_min_req_cap % 10) )
+                device.backend = None
+                nvmlShutdown()
+        else:
+            has_nvml_cap = True
+    except:
+        #if NVSMI is not installed an exception will occur
+        device.backend = None
+        has_nvml = False
+
+if device.backend is None or force_plaidML:
+    #the tensorflow backend failed or plaidML is forced, so try the plaidML backend
+    if plaidML_devices_count == 0:
+        print ("plaidML: No capable OpenCL devices found. Falling back to tensorflow backend.")
+        device.backend = None
+    else:
+        device.backend = "plaidML"
+
+if device.backend is None:
+    if not has_nvml:
+        if has_nvidia_device:
+            #some notebook systems have an NVIDIA card without NVSMI in the official drivers
+            #in that case assume one capable GPU and let tensorflow choose the best GPU itself
+            device.backend = "tensorflow-generic"
+        else:
+            #no NVSMI and no NVIDIA cards, and plaidML also failed, so CPU only
+            device.backend = "tensorflow-cpu"
+    else:
+        if has_nvml_cap:
+            #NVSMI and capable CUDA devices are present, but force_plaidML failed, so choose tensorflow
+            device.backend = "tensorflow"
+        else:
+            #NVSMI is present but there are no capable CUDA devices, and plaidML also failed, so CPU only
+            device.backend = "tensorflow-cpu"
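Note: nnlib/device.py resolves device.backend once at import time: a CUDA-capable card with NVSMI yields "tensorflow", force_plaidML=1 or no capable CUDA device yields "plaidML", an NVIDIA card without NVSMI yields "tensorflow-generic", otherwise "tensorflow-cpu". A sketch of inspecting the result, assuming the package is importable from the repo root:

    # sketch: see which backend was chosen on this machine
    import os
    os.environ["force_plaidML"] = "1"        # optional: force the OpenCL path
    from nnlib.device import device
    print(device.backend)                    # e.g. "plaidML"
    print(device.getValidDevicesIdxsWithNamesList())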
diff --git a/nnlib/devicelib.py b/nnlib/devicelib.py
deleted file mode 100644
index 1d516fe..0000000
--- a/nnlib/devicelib.py
+++ /dev/null
@@ -1,186 +0,0 @@
-from .pynvml import *
-
-try:
-    nvmlInit()
-    hasNVML = True
-except:
-    hasNVML = False
-
-class devicelib:
-    class Config():
-        force_gpu_idx = -1
-        multi_gpu = False
-        force_gpu_idxs = None
-        choose_worst_gpu = False
-        gpu_idxs = []
-        gpu_names = []
-        gpu_compute_caps = []
-        gpu_vram_gb = []
-        allow_growth = True
-        use_fp16 = False
-        cpu_only = False
-
-        def __init__ (self, force_gpu_idx = -1,
-                            multi_gpu = False,
-                            force_gpu_idxs = None,
-                            choose_worst_gpu = False,
-                            allow_growth = True,
-                            use_fp16 = False,
-                            cpu_only = False,
-                            **in_options):
-
-            self.use_fp16 = use_fp16
-            if cpu_only:
-                self.cpu_only = True
-            else:
-                self.force_gpu_idx = force_gpu_idx
-                self.multi_gpu = multi_gpu
-                self.force_gpu_idxs = force_gpu_idxs
-                self.choose_worst_gpu = choose_worst_gpu
-                self.allow_growth = allow_growth
-
-                self.gpu_idxs = []
-
-                if force_gpu_idxs is not None:
-                    for idx in force_gpu_idxs.split(','):
-                        idx = int(idx)
-                        if devicelib.isValidDeviceIdx(idx):
-                            self.gpu_idxs.append(idx)
-                else:
-                    gpu_idx = force_gpu_idx if (force_gpu_idx >= 0 and devicelib.isValidDeviceIdx(force_gpu_idx)) else devicelib.getBestDeviceIdx() if not choose_worst_gpu else devicelib.getWorstDeviceIdx()
-                    if gpu_idx != -1:
-                        if self.multi_gpu:
-                            self.gpu_idxs = devicelib.getDeviceIdxsEqualModel( gpu_idx )
-                            if len(self.gpu_idxs) <= 1:
-                                self.multi_gpu = False
-                        else:
-                            self.gpu_idxs = [gpu_idx]
-
-                self.cpu_only = (len(self.gpu_idxs) == 0)
-
-                if not self.cpu_only:
-                    self.gpu_names = []
-                    self.gpu_compute_caps = []
-                    for gpu_idx in self.gpu_idxs:
-                        self.gpu_names += [devicelib.getDeviceName(gpu_idx)]
-                        self.gpu_compute_caps += [ devicelib.getDeviceComputeCapability ( gpu_idx ) ]
-                        self.gpu_vram_gb += [ devicelib.getDeviceVRAMTotalGb ( gpu_idx ) ]
-
-    @staticmethod
-    def getDevicesWithAtLeastTotalMemoryGB(totalmemsize_gb):
-        if not hasNVML:
-            return [0]
-
-        result = []
-        for i in range(nvmlDeviceGetCount()):
-            handle = nvmlDeviceGetHandleByIndex(i)
-            memInfo = nvmlDeviceGetMemoryInfo( handle )
-            if (memInfo.total) >= totalmemsize_gb*1024*1024*1024:
-                result.append (i)
-        return result
-
-    @staticmethod
-    def getAllDevicesIdxsList():
-        if not hasNVML:
-            return [0]
-
-        return [ i for i in range(0, nvmlDeviceGetCount() ) ]
-
-    @staticmethod
-    def getAllDevicesIdxsWithNamesList():
-        if not hasNVML:
-            return [ (0, devicelib.getDeviceName(0) ) ]
-
-        return [ (i, nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() ) for i in range(nvmlDeviceGetCount() ) ]
-
-    @staticmethod
-    def getDeviceVRAMFree (idx):
-        if not hasNVML:
-            return 2
-
-        if idx < nvmlDeviceGetCount():
-            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(idx) )
-            return memInfo.total - memInfo.used
-
-        return 0
-
-    @staticmethod
-    def getDeviceVRAMTotalGb (idx):
-        if not hasNVML:
-            return 2
-
-        if idx < nvmlDeviceGetCount():
-            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(idx) )
-            return round ( memInfo.total / (1024*1024*1024) )
-
-        return 0
-
-    @staticmethod
-    def getBestDeviceIdx():
-        if not hasNVML:
-            return 0
-
-        idx = -1
-        idx_mem = 0
-        for i in range( nvmlDeviceGetCount() ):
-            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
-            if memInfo.total > idx_mem:
-                idx = i
-                idx_mem = memInfo.total
-
-        return idx
-
-    @staticmethod
-    def getWorstDeviceIdx():
-        if not hasNVML:
-            return 0
-
-        idx = -1
-        idx_mem = sys.maxsize
-        for i in range( nvmlDeviceGetCount() ):
-            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
-            if memInfo.total < idx_mem:
-                idx = i
-                idx_mem = memInfo.total
-
-        return idx
-
-    @staticmethod
-    def isValidDeviceIdx(idx):
-        if not hasNVML:
-            return (idx == 0)
-
-        return (idx < nvmlDeviceGetCount())
-
-    @staticmethod
-    def getDeviceIdxsEqualModel(idx):
-        if not hasNVML:
-            return [0] if idx == 0 else []
-
-        result = []
-        idx_name = nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
-        for i in range( nvmlDeviceGetCount() ):
-            if nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() == idx_name:
-                result.append (i)
-
-        return result
-
-    @staticmethod
-    def getDeviceName (idx):
-        if not hasNVML:
-            return 'Generic GeForce GPU'
-
-        if idx < nvmlDeviceGetCount():
-            return nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
-
-        return None
-
-    @staticmethod
-    def getDeviceComputeCapability(idx):
-        if not hasNVML:
-            return 99 if idx == 0 else 0
-
-        result = 0
-        if idx < nvmlDeviceGetCount():
-            result = nvmlDeviceGetCudaComputeCapability(nvmlDeviceGetHandleByIndex(idx))
-        return result[0] * 10 + result[1]
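Note: getDeviceComputeCapability packs NVML's (major, minor) tuple into a single integer as major*10 + minor, which is what tf_min_req_cap = 37 (compute capability 3.7, the minimum for tensorflow-gpu 1.11.0) is compared against:

    # sketch: the encoding used for the capability check
    major, minor = 6, 1          # e.g. a GTX 1080
    cap = major * 10 + minor     # -> 61
    print(cap >= 37)             # True: the card passes the tensorflow filter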
diff --git a/nnlib/nnlib.py b/nnlib/nnlib.py
index 7a92169..b6f10e9 100644
--- a/nnlib/nnlib.py
+++ b/nnlib/nnlib.py
@@ -4,66 +4,37 @@ import contextlib
 import numpy as np
 
 from utils import std_utils
-from .devicelib import devicelib
+from .device import device
 
 class nnlib(object):
-    device = devicelib #forwards nnlib.devicelib to device in order to use nnlib as standalone lib
-    DeviceConfig = devicelib.Config
+    device = device #forwards nnlib.devicelib to device in order to use nnlib as standalone lib
+    DeviceConfig = device.Config
     active_DeviceConfig = DeviceConfig() #default is one best GPU
 
     dlib = None
+
     keras = None
     keras_contrib = None
+
     tf = None
     tf_sess = None
-    code_import_tf = None
+
+    PML = None
+    PMLK = None
+    PMLTile= None
+
     code_import_keras = None
     code_import_keras_contrib = None
     code_import_all = None
 
     code_import_dlib = None
 
-    tf_dssim = None
-    tf_ssim = None
-    tf_resize_like = None
-    tf_image_histogram = None
-    tf_rgb_to_lab = None
-    tf_lab_to_rgb = None
-    tf_adain = None
-    tf_gaussian_blur = None
-    tf_style_loss = None
-
-    modelify = None
-    ReflectionPadding2D = None
-    DSSIMLoss = None
-    DSSIMMSEMaskLoss = None
-    PixelShuffler = None
-    SubpixelUpscaler = None
-    AddUniformNoise = None
-
     ResNet = None
     UNet = None
     UNetTemporalPredictor = None
     NLayerDiscriminator = None
-
-    code_import_tf_string = \
-"""
-tf = nnlib.tf
-tf_sess = nnlib.tf_sess
-tf_reduce_mean = tf.reduce_mean # todo tf 12+ = tf.math.reduce_mean
-tf_total_variation = tf.image.total_variation
-tf_dssim = nnlib.tf_dssim
-tf_ssim = nnlib.tf_ssim
-tf_resize_like = nnlib.tf_resize_like
-tf_image_histogram = nnlib.tf_image_histogram
-tf_rgb_to_lab = nnlib.tf_rgb_to_lab
-tf_lab_to_rgb = nnlib.tf_lab_to_rgb
-tf_adain = nnlib.tf_adain
-tf_gaussian_blur = nnlib.tf_gaussian_blur
-tf_style_loss = nnlib.tf_style_loss
-"""
 
     code_import_keras_string = \
 """
 keras = nnlib.keras
@@ -81,9 +52,11 @@ BatchNormalization = keras.layers.BatchNormalization
 
 LeakyReLU = keras.layers.LeakyReLU
 ReLU = keras.layers.ReLU
+PReLU = keras.layers.PReLU
 tanh = keras.layers.Activation('tanh')
 sigmoid = keras.layers.Activation('sigmoid')
 Dropout = keras.layers.Dropout
+Softmax = keras.layers.Softmax
 
 Lambda = keras.layers.Lambda
 Add = keras.layers.Add
@@ -100,12 +73,14 @@ Model = keras.models.Model
 Adam = keras.optimizers.Adam
 
 modelify = nnlib.modelify
-ReflectionPadding2D = nnlib.ReflectionPadding2D
-DSSIMLoss = nnlib.DSSIMLoss
-DSSIMMSEMaskLoss = nnlib.DSSIMMSEMaskLoss
+gaussian_blur = nnlib.gaussian_blur
+style_loss = nnlib.style_loss
+dssim = nnlib.dssim
+
+#ReflectionPadding2D = nnlib.ReflectionPadding2D
 PixelShuffler = nnlib.PixelShuffler
 SubpixelUpscaler = nnlib.SubpixelUpscaler
-AddUniformNoise = nnlib.AddUniformNoise
+#AddUniformNoise = nnlib.AddUniformNoise
 """
 
     code_import_keras_contrib_string = \
 """
@@ -113,7 +88,6 @@ keras_contrib = nnlib.keras_contrib
 GroupNormalization = keras_contrib.layers.GroupNormalization
 InstanceNormalization = keras_contrib.layers.InstanceNormalization
 Padam = keras_contrib.optimizers.Padam
-PELU = keras_contrib.layers.advanced_activations.PELU
 """
 
     code_import_dlib_string = \
 """
@@ -122,6 +96,7 @@ dlib = nnlib.dlib
 
     code_import_all_string = \
 """
+DSSIMMSEMaskLoss = nnlib.DSSIMMSEMaskLoss
 ResNet = nnlib.ResNet
 UNet = nnlib.UNet
 UNetTemporalPredictor = nnlib.UNetTemporalPredictor
@@ -130,7 +105,7 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
 
     @staticmethod
-    def import_tf(device_config = None):
+    def _import_tf(device_config):
         if nnlib.tf is not None:
             return nnlib.code_import_tf
@@ -147,263 +122,63 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
 
         import tensorflow as tf
         nnlib.tf = tf
 
-        if device_config is None:
-            device_config = nnlib.active_DeviceConfig
-
-        tf_ver = [int(x) for x in tf.VERSION.split('.')]
-        req_cap = 35
-        if tf_ver[0] > 1 or (tf_ver[0] == 1 and tf_ver[1] >= 11):
-            req_cap = 37
-
-        if not device_config.cpu_only and device_config.gpu_compute_caps[0] < req_cap:
-            if suppressor is not None:
-                suppressor.__exit__()
-
-            print ("%s does not meet minimum required compute capability: %d.%d. Falling back to CPU mode." % ( device_config.gpu_names[0], req_cap // 10, req_cap % 10 ) )
-            device_config = nnlib.DeviceConfig(cpu_only=True)
-
-            if suppressor is not None:
-                suppressor.__enter__()
-
-        nnlib.active_DeviceConfig = device_config
-
         if device_config.cpu_only:
-            config = tf.ConfigProto( device_count = {'GPU': 0} )
-        else:
+            config = tf.ConfigProto(device_count={'GPU': 0})
+        else:
             config = tf.ConfigProto()
-            visible_device_list = ''
-            for idx in device_config.gpu_idxs:
-                visible_device_list += str(idx) + ','
-            config.gpu_options.visible_device_list=visible_device_list[:-1]
+
+            if device_config.backend != "tensorflow-generic":
+                #tensorflow-generic is a system with an NVIDIA card but without NVSMI,
+                #so don't hide devices and let tensorflow choose the best card itself
+                visible_device_list = ''
+                for idx in device_config.gpu_idxs:
+                    visible_device_list += str(idx) + ','
+                config.gpu_options.visible_device_list=visible_device_list[:-1]
 
         config.gpu_options.force_gpu_compatible = True
         config.gpu_options.allow_growth = device_config.allow_growth
-
+
         nnlib.tf_sess = tf.Session(config=config)
 
         if suppressor is not None:
             suppressor.__exit__()
-
-        nnlib.__initialize_tf_functions()
-        nnlib.code_import_tf = compile (nnlib.code_import_tf_string,'','exec')
-        return nnlib.code_import_tf
-
-    @staticmethod
-    def __initialize_tf_functions():
-        tf = nnlib.tf
-
-        def tf_dssim_(max_value=1.0):
-            def func(t1,t2):
-                return (1.0 - tf.image.ssim (t1, t2, max_value)) / 2.0
-            return func
-        nnlib.tf_dssim = tf_dssim_
-
-        def tf_ssim_(max_value=1.0):
-            def func(t1,t2):
-                return tf.image.ssim (t1, t2, max_value)
-            return func
-        nnlib.tf_ssim = tf_ssim_
-
-        def tf_resize_like_(ref_tensor):
-            def func(input_tensor):
-                H, W = ref_tensor.get_shape()[1], ref_tensor.get_shape()[2]
-                return tf.image.resize_bilinear(input_tensor, [H.value, W.value])
-            return func
-        nnlib.tf_resize_like = tf_resize_like_
-
-        def tf_rgb_to_lab():
-            def func(rgb_input):
-                with tf.name_scope("rgb_to_lab"):
-                    srgb_pixels = tf.reshape(rgb_input, [-1, 3])
-
-                    with tf.name_scope("srgb_to_xyz"):
-                        linear_mask = tf.cast(srgb_pixels <= 0.04045, dtype=tf.float32)
-                        exponential_mask = tf.cast(srgb_pixels > 0.04045, dtype=tf.float32)
-                        rgb_pixels = (srgb_pixels / 12.92 * linear_mask) + (((srgb_pixels + 0.055) / 1.055) ** 2.4) * exponential_mask
-                        rgb_to_xyz = tf.constant([
-                            #    X        Y          Z
-                            [0.412453, 0.212671, 0.019334], # R
-                            [0.357580, 0.715160, 0.119193], # G
-                            [0.180423, 0.072169, 0.950227], # B
-                        ])
-                        xyz_pixels = tf.matmul(rgb_pixels, rgb_to_xyz)
-
-                    # https://en.wikipedia.org/wiki/Lab_color_space#CIELAB-CIEXYZ_conversions
-                    with tf.name_scope("xyz_to_cielab"):
-                        # convert to fx = f(X/Xn), fy = f(Y/Yn), fz = f(Z/Zn)
-
-                        # normalize for D65 white point
-                        xyz_normalized_pixels = tf.multiply(xyz_pixels, [1/0.950456, 1.0, 1/1.088754])
-
-                        epsilon = 6/29
-                        linear_mask = tf.cast(xyz_normalized_pixels <= (epsilon**3), dtype=tf.float32)
-                        exponential_mask = tf.cast(xyz_normalized_pixels > (epsilon**3), dtype=tf.float32)
-                        fxfyfz_pixels = (xyz_normalized_pixels / (3 * epsilon**2) + 4/29) * linear_mask + (xyz_normalized_pixels ** (1/3)) * exponential_mask
-
-                        # convert to lab
-                        fxfyfz_to_lab = tf.constant([
-                            #  l       a       b
-                            [  0.0,  500.0,    0.0], # fx
-                            [116.0, -500.0,  200.0], # fy
-                            [  0.0,    0.0, -200.0], # fz
-                        ])
-                        lab_pixels = tf.matmul(fxfyfz_pixels, fxfyfz_to_lab) + tf.constant([-16.0, 0.0, 0.0])
-                    return tf.reshape(lab_pixels, tf.shape(rgb_input))
-            return func
-        nnlib.tf_rgb_to_lab = tf_rgb_to_lab
-        def tf_lab_to_rgb():
-            def func(lab):
-                with tf.name_scope("lab_to_rgb"):
-                    lab_pixels = tf.reshape(lab, [-1, 3])
-
-                    # https://en.wikipedia.org/wiki/Lab_color_space#CIELAB-CIEXYZ_conversions
-                    with tf.name_scope("cielab_to_xyz"):
-                        # convert to fxfyfz
-                        lab_to_fxfyfz = tf.constant([
-                            #   fx       fy        fz
-                            [1/116.0, 1/116.0,  1/116.0], # l
-                            [1/500.0,     0.0,      0.0], # a
-                            [    0.0,     0.0, -1/200.0], # b
-                        ])
-                        fxfyfz_pixels = tf.matmul(lab_pixels + tf.constant([16.0, 0.0, 0.0]), lab_to_fxfyfz)
-
-                        # convert to xyz
-                        epsilon = 6/29
-                        linear_mask = tf.cast(fxfyfz_pixels <= epsilon, dtype=tf.float32)
-                        exponential_mask = tf.cast(fxfyfz_pixels > epsilon, dtype=tf.float32)
-                        xyz_pixels = (3 * epsilon**2 * (fxfyfz_pixels - 4/29)) * linear_mask + (fxfyfz_pixels ** 3) * exponential_mask
-
-                        # denormalize for D65 white point
-                        xyz_pixels = tf.multiply(xyz_pixels, [0.950456, 1.0, 1.088754])
-
-                    with tf.name_scope("xyz_to_srgb"):
-                        xyz_to_rgb = tf.constant([
-                            #     r           g          b
-                            [ 3.2404542, -0.9692660,  0.0556434], # x
-                            [-1.5371385,  1.8760108, -0.2040259], # y
-                            [-0.4985314,  0.0415560,  1.0572252], # z
-                        ])
-                        rgb_pixels = tf.matmul(xyz_pixels, xyz_to_rgb)
-                        # avoid a slightly negative number messing up the conversion
-                        rgb_pixels = tf.clip_by_value(rgb_pixels, 0.0, 1.0)
-                        linear_mask = tf.cast(rgb_pixels <= 0.0031308, dtype=tf.float32)
-                        exponential_mask = tf.cast(rgb_pixels > 0.0031308, dtype=tf.float32)
-                        srgb_pixels = (rgb_pixels * 12.92 * linear_mask) + ((rgb_pixels ** (1/2.4) * 1.055) - 0.055) * exponential_mask
-
-                    return tf.reshape(srgb_pixels, tf.shape(lab))
-            return func
-        nnlib.tf_lab_to_rgb = tf_lab_to_rgb
-
-        def tf_image_histogram():
-            def func(input):
-                x = input
-                x += 1 / 255.0
-
-                output = []
-                for i in range(256, 0, -1):
-                    v = i / 255.0
-                    y = (x - v) * 1000
-
-                    y = tf.clip_by_value (y, -1.0, 0.0) + 1
-
-                    output.append ( tf.reduce_sum (y) )
-                    x -= y*v
-
-                return tf.stack ( output[::-1] )
-            return func
-        nnlib.tf_image_histogram = tf_image_histogram
-
-        def tf_adain(epsilon=1e-5):
-            def func(content, style):
-                axes = [1,2]
-                c_mean, c_var = tf.nn.moments(content, axes=axes, keep_dims=True)
-                s_mean, s_var = tf.nn.moments(style, axes=axes, keep_dims=True)
-                c_std, s_std = tf.sqrt(c_var + epsilon), tf.sqrt(s_var + epsilon)
-                return s_std * (content - c_mean) / c_std + s_mean
-            return func
-        nnlib.tf_adain = tf_adain
-
-        def tf_gaussian_blur(radius=2.0):
-            def gaussian_kernel(size,mean,std):
-                d = tf.distributions.Normal( float(mean), float(std) )
-
-                vals = d.prob(tf.range(start = -int(size), limit = int(size) + 1, dtype = tf.float32))
-
-                gauss_kernel = tf.einsum('i,j->ij',
-                                          vals,
-                                          vals)
-
-                return gauss_kernel / tf.reduce_sum(gauss_kernel)
-
-            gauss_kernel = gaussian_kernel(radius, 1.0, radius )
-            gauss_kernel = gauss_kernel[:, :, tf.newaxis, tf.newaxis]
-
-            def func(input):
-                input_nc = input.get_shape().as_list()[-1]
-                inputs = tf.split(input, input_nc, -1)
-
-                outputs = []
-                for i in range(len(inputs)):
-                    outputs += [ tf.nn.conv2d( inputs[i] , gauss_kernel, strides=[1, 1, 1, 1], padding="SAME") ]
-
-                return tf.concat (outputs, axis=-1)
-            return func
-        nnlib.tf_gaussian_blur = tf_gaussian_blur
-        #any channel count style diff
-        #outputs 0.0 .. 1.0 style difference*loss_weight , 0.0 - no diff
-        def tf_style_loss(gaussian_blur_radius=0.0, loss_weight=1.0, batch_normalize=False, epsilon=1e-5):
-            gblur = tf_gaussian_blur(gaussian_blur_radius)
-
-            def sd(content, style):
-                content_nc = content.get_shape().as_list()[-1]
-                style_nc = style.get_shape().as_list()[-1]
-                if content_nc != style_nc:
-                    raise Exception("tf_style_loss() content_nc != style_nc")
-
-                axes = [1,2]
-                c_mean, c_var = tf.nn.moments(content, axes=axes, keep_dims=True)
-                s_mean, s_var = tf.nn.moments(style, axes=axes, keep_dims=True)
-                c_std, s_std = tf.sqrt(c_var + epsilon), tf.sqrt(s_var + epsilon)
-
-                mean_loss = tf.reduce_sum(tf.squared_difference(c_mean, s_mean))
-                std_loss = tf.reduce_sum(tf.squared_difference(c_std, s_std))
-
-                if batch_normalize:
-                    #normalize w.r.t batch size
-                    n = tf.cast(tf.shape(content)[0], dtype=tf.float32)
-                    mean_loss /= n
-                    std_loss /= n
-
-                return (mean_loss + std_loss) * loss_weight
-
-            def func(target, style):
-                if gaussian_blur_radius > 0.0:
-                    return sd( gblur(target), gblur(style))
-                else:
-                    return sd( target, style )
-            return func
-
-        nnlib.tf_style_loss = tf_style_loss
-
     @staticmethod
     def import_keras(device_config = None):
         if nnlib.keras is not None:
             return nnlib.code_import_keras
 
-        nnlib.import_tf(device_config)
-        device_config = nnlib.active_DeviceConfig
+        if device_config is None:
+            device_config = nnlib.active_DeviceConfig
+
+        nnlib.active_DeviceConfig = device_config
+
+        if "tensorflow" in device_config.backend:
+            nnlib._import_tf(device_config)
+            device_config = nnlib.active_DeviceConfig
+        elif device_config.backend == "plaidML":
+            os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
+            os.environ["PLAIDML_DEVICE_IDS"] = ",".join ( [ nnlib.device.getDeviceID(idx) for idx in device_config.gpu_idxs] )
+
         if 'TF_SUPPRESS_STD' in os.environ.keys() and os.environ['TF_SUPPRESS_STD'] == '1':
             suppressor = std_utils.suppress_stdout_stderr().__enter__()
-
+
         import keras as keras_
         nnlib.keras = keras_
 
+        if device_config.backend == "plaidML":
+            import plaidml
+            import plaidml.tile
+            nnlib.PML = plaidml
+            nnlib.PMLK = plaidml.keras.backend
+            nnlib.PMLTile = plaidml.tile
+
         if device_config.use_fp16:
             nnlib.keras.backend.set_floatx('float16')
 
-        nnlib.keras.backend.set_session(nnlib.tf_sess)
+        if "tensorflow" in device_config.backend:
+            nnlib.keras.backend.set_session(nnlib.tf_sess)
+
         nnlib.keras.backend.set_image_data_format('channels_last')
 
         if 'TF_SUPPRESS_STD' in os.environ.keys() and os.environ['TF_SUPPRESS_STD'] == '1':
@@ -411,14 +186,12 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
 
         nnlib.__initialize_keras_functions()
         nnlib.code_import_keras = compile (nnlib.code_import_keras_string,'','exec')
-
+
         return nnlib.code_import_keras
 
     @staticmethod
     def __initialize_keras_functions():
-        tf = nnlib.tf
         keras = nnlib.keras
         K = keras.backend
-        exec (nnlib.code_import_tf, locals(), globals())
 
         def modelify(model_functor):
            def func(tensor):
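Note: on the plaidML path, import_keras swaps the Keras backend purely through environment variables set before keras is imported; nothing in the training code changes. A sketch of the same wiring done by hand, assuming plaidml-keras 0.5.0 is installed and with a hypothetical device id:

    # sketch: select the plaidML Keras backend manually
    import os
    os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
    os.environ["PLAIDML_DEVICE_IDS"] = "opencl_nvidia_geforce.0"  # hypothetical id
    import keras                                                  # now backed by plaidML
    print(keras.backend.backend())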
@@ -427,68 +200,172 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
 
         nnlib.modelify = modelify
 
-        class ReflectionPadding2D(keras.layers.Layer):
-            def __init__(self, padding=(1, 1), **kwargs):
-                self.padding = tuple(padding)
-                self.input_spec = [keras.layers.InputSpec(ndim=4)]
-                super(ReflectionPadding2D, self).__init__(**kwargs)
+        def gaussian_blur(radius=2.0):
+            def gaussian(x, mu, sigma):
+                return np.exp(-(float(x) - float(mu)) ** 2 / (2 * sigma ** 2))
 
-            def compute_output_shape(self, s):
-                """ If you are using "channels_last" configuration"""
-                return (s[0], s[1] + 2 * self.padding[0], s[2] + 2 * self.padding[1], s[3])
+            def make_kernel(sigma):
+                kernel_size = max(3, int(2 * 2 * sigma + 1))
+                mean = np.floor(0.5 * kernel_size)
+                kernel_1d = np.array([gaussian(x, mean, sigma) for x in range(kernel_size)])
+                np_kernel = np.outer(kernel_1d, kernel_1d).astype(dtype=K.floatx())
+                kernel = np_kernel / np.sum(np_kernel)
+                return kernel
+
+            gauss_kernel = make_kernel(radius)
+            gauss_kernel = gauss_kernel[:, :,np.newaxis, np.newaxis]
 
-            def call(self, x, mask=None):
-                w_pad,h_pad = self.padding
-                return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0] ], 'REFLECT')
-        nnlib.ReflectionPadding2D = ReflectionPadding2D
+            def func(input):
+                inputs = [ input[:,:,:,i:i+1] for i in range( K.int_shape( input )[-1] ) ]
 
-        class DSSIMLoss(object):
-            def __init__(self, is_tanh=False):
-                self.is_tanh = is_tanh
-
-            def __call__(self,y_true, y_pred):
-                if not self.is_tanh:
-                    return (1.0 - tf.image.ssim (y_true, y_pred, 1.0)) / 2.0
-                else:
-                    return (1.0 - tf.image.ssim ((y_true/2+0.5), (y_pred/2+0.5), 1.0)) / 2.0
-        nnlib.DSSIMLoss = DSSIMLoss
+                outputs = []
+                for i in range(len(inputs)):
+                    outputs += [ K.conv2d( inputs[i] , K.constant(gauss_kernel) , strides=(1,1), padding="same") ]
+
+                return K.concatenate (outputs, axis=-1)
+            return func
+        nnlib.gaussian_blur = gaussian_blur
 
-        class DSSIMMSEMaskLoss(object):
-            def __init__(self, mask, is_mse=False):
-                self.mask = mask
-                self.is_mse = is_mse
+        def style_loss(gaussian_blur_radius=0.0, loss_weight=1.0, wnd_size=0, step_size=1):
+            if gaussian_blur_radius > 0.0:
+                gblur = gaussian_blur(gaussian_blur_radius)
+
+            def sd(content, style, loss_weight):
+                content_nc = K.int_shape(content)[-1]
+                style_nc = K.int_shape(style)[-1]
+                if content_nc != style_nc:
+                    raise Exception("style_loss() content_nc != style_nc")
+
+                axes = [1,2]
+                c_mean, c_var = K.mean(content, axis=axes, keepdims=True), K.var(content, axis=axes, keepdims=True)
+                s_mean, s_var = K.mean(style, axis=axes, keepdims=True), K.var(style, axis=axes, keepdims=True)
+                c_std, s_std = K.sqrt(c_var + 1e-5), K.sqrt(s_var + 1e-5)
+
+                mean_loss = K.sum(K.square(c_mean-s_mean))
+                std_loss = K.sum(K.square(c_std-s_std))
 
-            def __call__(self,y_true, y_pred):
-                total_loss = None
-
-                mask = self.mask
-                if self.is_mse:
-                    blur_mask = tf_gaussian_blur(max(1, mask.get_shape().as_list()[1] // 32))(mask)
-                    return K.mean ( 100*K.square( y_true*blur_mask - y_pred*blur_mask ) )
+                return (mean_loss + std_loss) * ( loss_weight / float(content_nc) )
+
+            def func(target, style):
+                if wnd_size == 0:
+                    if gaussian_blur_radius > 0.0:
+                        return sd( gblur(target), gblur(style), loss_weight=loss_weight)
+                    else:
+                        return sd( target, style, loss_weight=loss_weight )
                 else:
-                    return (1.0 - (tf.image.ssim (y_true*mask, y_pred*mask, 1.0))) / 2.0
-        nnlib.DSSIMMSEMaskLoss = DSSIMMSEMaskLoss
+                    #currently unused
+                    if nnlib.tf is not None:
+                        sh = K.int_shape(target)[1]
+                        k = (sh-wnd_size) // step_size + 1
+                        if gaussian_blur_radius > 0.0:
+                            target, style = gblur(target), gblur(style)
+                        target = nnlib.tf.image.extract_image_patches(target, [1,k,k,1], [1,1,1,1], [1,step_size,step_size,1], 'VALID')
+                        style = nnlib.tf.image.extract_image_patches(style, [1,k,k,1], [1,1,1,1], [1,step_size,step_size,1], 'VALID')
+                        return sd( target, style, loss_weight )
+                    if nnlib.PML is not None:
+                        print ("Sorry, plaidML backend does not support style_loss")
+                        return 0
+            return func
+        nnlib.style_loss = style_loss
+        def dssim(k1=0.01, k2=0.03, max_value=1.0):
+            # port of tf.image.ssim to pure keras in order to work on plaidML backend.
+
+            def func(y_true, y_pred):
+                ch = K.int_shape(y_pred)[-1]
+
+                def softmax(x, axis=-1): #from K numpy backend
+                    y = np.exp(x - np.max(x, axis, keepdims=True))
+                    return y / np.sum(y, axis, keepdims=True)
+
+                def gauss_kernel(size, sigma):
+                    coords = np.arange(0,size, dtype=K.floatx() )
+                    coords -= (size - 1 ) / 2.0
+                    g = coords**2
+                    g *= ( -0.5 / (sigma**2) )
+                    g = np.reshape (g, (1,-1)) + np.reshape(g, (-1,1) )
+                    g = np.reshape (g, (1,-1))
+                    g = softmax(g)
+                    g = np.reshape (g, (size, size, 1, 1))
+                    g = np.tile (g, (1,1,ch,1))
+                    return K.constant(g, dtype=K.floatx() )
+
+                kernel = gauss_kernel(11,1.5)
+
+                def reducer(x):
+                    shape = K.shape(x)
+                    x = K.reshape(x, (-1, shape[-3] , shape[-2], shape[-1]) )
+                    y = K.depthwise_conv2d(x, kernel, strides=(1, 1), padding='valid')
+                    y_shape = K.shape(y)
+                    return K.reshape(y, (shape[0], y_shape[1], y_shape[2], y_shape[3] ) )
+
+                def _ssim_helper(x, y, reducer, compensation=1.0):
+                    c1 = (k1 * max_value) ** 2
+                    c2 = (k2 * max_value) ** 2
+
+                    mean0 = reducer(x)
+                    mean1 = reducer(y)
+                    num0 = mean0 * mean1 * 2.0
+                    den0 = K.square(mean0) + K.square(mean1)
+                    luminance = (num0 + c1) / (den0 + c1)
+
+                    num1 = reducer(x * y) * 2.0
+                    den1 = reducer(K.square(x) + K.square(y))
+                    c2 *= compensation
+                    cs = (num1 - num0 + c2) / (den1 - den0 + c2)
+
+                    return luminance, cs
+
+                luminance, cs = _ssim_helper(y_true, y_pred, reducer)
+                ssim_val = K.mean(luminance * cs, axis=(-3, -2) )
+                return K.mean( (1.0 - ssim_val ) / 2.0 )
+
+            return func
+        nnlib.dssim = dssim
 
         class PixelShuffler(keras.layers.Layer):
             def __init__(self, size=(2, 2), data_format=None, **kwargs):
                 super(PixelShuffler, self).__init__(**kwargs)
-                self.data_format = keras.backend.common.normalize_data_format(data_format)
+                self.data_format = K.normalize_data_format(data_format)
                 self.size = keras.utils.conv_utils.normalize_tuple(size, 2, 'size')
 
             def call(self, inputs):
-                input_shape = keras.backend.int_shape(inputs)
+
+                input_shape = K.int_shape(inputs)
                 if len(input_shape) != 4:
                     raise ValueError('Inputs should have rank ' +
                                      str(4) +
                                      '; Received input shape:', str(input_shape))
 
                 if self.data_format == 'channels_first':
-                    return tf.depth_to_space(inputs, self.size[0], 'NCHW')
+                    batch_size, c, h, w = input_shape
+                    if batch_size is None:
+                        batch_size = -1
+                    rh, rw = self.size
+                    oh, ow = h * rh, w * rw
+                    oc = c // (rh * rw)
+
+                    out = K.reshape(inputs, (batch_size, rh, rw, oc, h, w))
+                    out = K.permute_dimensions(out, (0, 3, 4, 1, 5, 2))
+                    out = K.reshape(out, (batch_size, oc, oh, ow))
+                    return out
 
                 elif self.data_format == 'channels_last':
-                    return tf.depth_to_space(inputs, self.size[0], 'NHWC')
+                    batch_size, h, w, c = input_shape
+                    if batch_size is None:
+                        batch_size = -1
+                    rh, rw = self.size
+                    oh, ow = h * rh, w * rw
+                    oc = c // (rh * rw)
+
+                    out = K.reshape(inputs, (batch_size, h, w, rh, rw, oc))
+                    out = K.permute_dimensions(out, (0, 1, 3, 2, 4, 5))
+                    out = K.reshape(out, (batch_size, oh, ow, oc))
+                    return out
 
             def compute_output_shape(self, input_shape):
+
                 if len(input_shape) != 4:
                     raise ValueError('Inputs should have rank ' +
                                      str(4) +
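Note: gaussian_blur and dssim are closures over backend-neutral Keras ops (K.conv2d, K.depthwise_conv2d), so the same graph builds on tensorflow and on plaidML. A usage sketch with the repo's own import mechanism, assuming channels_last float32 images:

    # sketch: blur a batch with the new helper
    import numpy as np
    from nnlib import nnlib
    exec(nnlib.import_all(), locals(), globals())   # exposes Input, Model, Lambda, gaussian_blur, dssim, ...

    inp = Input((64, 64, 3))
    model = Model(inp, Lambda(gaussian_blur(2.0))(inp))
    blurred = model.predict(np.random.uniform(size=(4, 64, 64, 3)).astype('float32'))

dssim is used the same way: dssim(max_value=2.0)(y_true, y_pred) yields a scalar in [0, 1], 0 meaning identical images, since it computes (1 - SSIM) / 2.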
@@ -525,11 +402,28 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
                          'data_format': self.data_format}
                 base_config = super(PixelShuffler, self).get_config()
 
-                return dict(list(base_config.items()) + list(config.items()))
-
-        nnlib.PixelShuffler = PixelShuffler
-        nnlib.SubpixelUpscaler = PixelShuffler
+                return dict(list(base_config.items()) + list(config.items()))
+        nnlib.PixelShuffler = PixelShuffler
+        nnlib.SubpixelUpscaler = PixelShuffler
+
+        '''
+
+        class ReflectionPadding2D(keras.layers.Layer):
+            def __init__(self, padding=(1, 1), **kwargs):
+                self.padding = tuple(padding)
+                self.input_spec = [keras.layers.InputSpec(ndim=4)]
+                super(ReflectionPadding2D, self).__init__(**kwargs)
+
+            def compute_output_shape(self, s):
+                """ If you are using "channels_last" configuration"""
+                return (s[0], s[1] + 2 * self.padding[0], s[2] + 2 * self.padding[1], s[3])
+
+            def call(self, x, mask=None):
+                w_pad,h_pad = self.padding
+                return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0] ], 'REFLECT')
+        nnlib.ReflectionPadding2D = ReflectionPadding2D
+
+
         class AddUniformNoise(keras.layers.Layer):
             def __init__(self, power=1.0, minval=-1.0, maxval=1.0, **kwargs):
                 super(AddUniformNoise, self).__init__(**kwargs)
@@ -548,7 +442,7 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
                 base_config = super(AddUniformNoise, self).get_config()
                 return dict(list(base_config.items()) + list(config.items()))
         nnlib.AddUniformNoise = AddUniformNoise
-
+        '''
 
     @staticmethod
     def import_keras_contrib(device_config = None):
         if nnlib.keras_contrib is not None:
@@ -570,20 +464,17 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
 
         import dlib as dlib_
         nnlib.dlib = dlib_
-        if not device_config.cpu_only and len(device_config.gpu_idxs) > 0:
-            nnlib.dlib.cuda.set_device(device_config.gpu_idxs[0])
-
+
+        if not device_config.cpu_only and "tensorflow" in device_config.backend and len(device_config.gpu_idxs) > 0:
+            nnlib.dlib.cuda.set_device(device_config.gpu_idxs[0])
+
         nnlib.code_import_dlib = compile (nnlib.code_import_dlib_string,'','exec')
 
     @staticmethod
     def import_all(device_config = None):
-        if nnlib.code_import_all is None:
-            nnlib.import_tf(device_config)
+        if nnlib.code_import_all is None:
             nnlib.import_keras(device_config)
-            nnlib.import_keras_contrib(device_config)
-            nnlib.code_import_all = compile (nnlib.code_import_tf_string + '\n'
-                                            + nnlib.code_import_keras_string + '\n'
+            nnlib.import_keras_contrib(device_config)
+            nnlib.code_import_all = compile (nnlib.code_import_keras_string + '\n'
                                             + nnlib.code_import_keras_contrib_string
                                             + nnlib.code_import_all_string,'','exec')
             nnlib.__initialize_all_functions()
@@ -592,6 +483,24 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
 
     @staticmethod
     def __initialize_all_functions():
+        exec (nnlib.import_keras(), locals(), globals())
+        exec (nnlib.import_keras_contrib(), locals(), globals())
+
+        class DSSIMMSEMaskLoss(object):
+            def __init__(self, mask, is_mse=False):
+                self.mask = mask
+                self.is_mse = is_mse
+            def __call__(self,y_true, y_pred):
+                total_loss = None
+                mask = self.mask
+                if self.is_mse:
+                    blur_mask = gaussian_blur(max(1, K.int_shape(mask)[1] // 64))(mask)
+                    return K.mean ( 50*K.square( y_true*blur_mask - y_pred*blur_mask ) )
+                else:
+                    return 10*dssim() (y_true*mask, y_pred*mask)
+        nnlib.DSSIMMSEMaskLoss = DSSIMMSEMaskLoss
+
+        '''
         def ResNet(output_nc, use_batch_norm, ngf=64, n_blocks=6, use_dropout=False):
             exec (nnlib.import_all(), locals(), globals())
@@ -775,7 +684,7 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
                 return Conv2D( 1, 4, 1, 'valid')(x)
             return func
         nnlib.NLayerDiscriminator = NLayerDiscriminator
-
+        '''
 
     @staticmethod
     def finalize_all():
         if nnlib.keras_contrib is not None:
@@ -786,7 +695,6 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
             nnlib.keras = None
 
         if nnlib.tf is not None:
-            nnlib.tf_sess.close()
            nnlib.tf_sess = None
            nnlib.tf = None
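Note: the PixelShuffler rewrite replaces tf.depth_to_space with reshape/permute ops so the layer also runs on plaidML. The channels_last branch is exactly this numpy shuffle:

    # sketch: depth_to_space r=2 via reshape/transpose, mirroring PixelShuffler.call
    import numpy as np
    b, h, w, c, r = 1, 2, 2, 4, 2
    x = np.arange(b*h*w*c).reshape(b, h, w, c)
    out = (x.reshape(b, h, w, r, r, c // (r*r))
            .transpose(0, 1, 3, 2, 4, 5)
            .reshape(b, h*r, w*r, c // (r*r)))
    print(out.shape)  # (1, 4, 4, 1)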
diff --git a/requirements-gpu-cuda9-cudnn7.txt b/requirements-gpu-opencl-cuda9-cudnn7.txt
similarity index 90%
rename from requirements-gpu-cuda9-cudnn7.txt
rename to requirements-gpu-opencl-cuda9-cudnn7.txt
index 4d2d071..f2ee595 100644
--- a/requirements-gpu-cuda9-cudnn7.txt
+++ b/requirements-gpu-opencl-cuda9-cudnn7.txt
@@ -5,6 +5,7 @@ h5py==2.7.1
 Keras==2.2.4
 opencv-python==4.0.0.21
 tensorflow-gpu==1.11.0
+plaidml-keras==0.5.0
 scikit-image
 dlib==19.10.0
 tqdm
diff --git a/samples/SampleProcessor.py b/samples/SampleProcessor.py
index 7f7ec49..93fe1c6 100644
--- a/samples/SampleProcessor.py
+++ b/samples/SampleProcessor.py
@@ -8,29 +8,28 @@ from facelib import FaceType
 
 class SampleProcessor(object):
     class TypeFlags(IntEnum):
-        SOURCE               = 0x00000001,
-        WARPED               = 0x00000002,
-        WARPED_TRANSFORMED   = 0x00000004,
-        TRANSFORMED          = 0x00000008,
-        LANDMARKS_ARRAY      = 0x00000010, #currently unused
-
-        RANDOM_CLOSE         = 0x00000020,
-        MORPH_TO_RANDOM_CLOSE \
-                             = 0x00000040,
-
-        FACE_ALIGN_HALF      = 0x00000100,
-        FACE_ALIGN_FULL      = 0x00000200,
-        FACE_ALIGN_HEAD      = 0x00000400,
-        FACE_ALIGN_AVATAR    = 0x00000800,
-
-        FACE_MASK_FULL       = 0x00001000,
-        FACE_MASK_EYES       = 0x00002000,
-
-        MODE_BGR             = 0x01000000, #BGR
-        MODE_G               = 0x02000000, #Grayscale
-        MODE_GGG             = 0x04000000, #3xGrayscale
-        MODE_M               = 0x08000000, #mask only
-        MODE_BGR_SHUFFLE     = 0x10000000, #BGR shuffle
+        SOURCE                  = 0x00000001,
+        WARPED                  = 0x00000002,
+        WARPED_TRANSFORMED      = 0x00000004,
+        TRANSFORMED             = 0x00000008,
+        LANDMARKS_ARRAY         = 0x00000010, #currently unused
+
+        RANDOM_CLOSE            = 0x00000020,
+        MORPH_TO_RANDOM_CLOSE   = 0x00000040,
+
+        FACE_ALIGN_HALF         = 0x00000100,
+        FACE_ALIGN_FULL         = 0x00000200,
+        FACE_ALIGN_HEAD         = 0x00000400,
+        FACE_ALIGN_AVATAR       = 0x00000800,
+
+        FACE_MASK_FULL          = 0x00001000,
+        FACE_MASK_EYES          = 0x00002000,
+
+        MODE_BGR                = 0x01000000, #BGR
+        MODE_G                  = 0x02000000, #Grayscale
+        MODE_GGG                = 0x04000000, #3xGrayscale
+        MODE_M                  = 0x08000000, #mask only
+        MODE_BGR_SHUFFLE        = 0x10000000, #BGR shuffle
 
     class Options(object):
         def __init__(self, random_flip = True, normalize_tanh = False, rotation_range=[-10,10], scale_range=[-0.05, 0.05], tx_range=[-0.05, 0.05], ty_range=[-0.05, 0.05]):
diff --git a/utils/image_utils.py b/utils/image_utils.py
index 0670a16..98e6e4a 100644
--- a/utils/image_utils.py
+++ b/utils/image_utils.py
@@ -5,7 +5,6 @@ import cv2
 import localization
 from scipy.spatial import Delaunay
 from PIL import Image, ImageDraw, ImageFont
-from nnlib import nnlib
 
 def reinhard_color_transfer(target, source, clip=False, preserve_paper=False, source_mask=None, target_mask=None):
     """
@@ -423,24 +422,4 @@ def reduce_colors (img_bgr, n_colors):
     img_bgr = cv2.cvtColor( np.array(img_rgb_p, dtype=np.float32) / 255.0, cv2.COLOR_RGB2BGR )
 
     return img_bgr
-
-
-class TFLabConverter():
-    def __init__(self):
-        exec (nnlib.import_tf(), locals(), globals())
-        self.tf_sess = tf_sess
-
-        self.bgr_input_tensor = tf.placeholder("float", [None, None, 3])
-        self.lab_input_tensor = tf.placeholder("float", [None, None, 3])
-
-        self.lab_output_tensor = tf_rgb_to_lab()(self.bgr_input_tensor)
-        self.bgr_output_tensor = tf_lab_to_rgb()(self.lab_input_tensor)
-
-    def bgr2lab(self, bgr):
-        return self.tf_sess.run(self.lab_output_tensor, feed_dict={self.bgr_input_tensor: bgr})
-
-    def lab2bgr(self, lab):
-        return self.tf_sess.run(self.bgr_output_tensor, feed_dict={self.lab_input_tensor: lab})
-
-    
\ No newline at end of file
+    
\ No newline at end of file
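Note: SampleProcessor.TypeFlags is a bitmask IntEnum, so a sample spec ORs together one source flag, one alignment flag, and one mode flag, exactly as the SAE model does when building output_sample_types:

    # sketch: composing and testing a sample-type bitmask
    f = SampleProcessor.TypeFlags
    spec = f.WARPED_TRANSFORMED | f.FACE_ALIGN_FULL | f.MODE_BGR
    print(bool(spec & f.MODE_BGR))        # True
    print(bool(spec & f.FACE_MASK_FULL))  # False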