Mirror of https://github.com/iperov/DeepFaceLab.git, synced 2025-07-06 13:02:15 -07:00
added support of AMD videocards
added Intel's plaidML backend to use the OpenCL engine. Check the new requirements.
smart choice of backend in device.py; the env var 'force_plaidML' can be set to force the use of plaidML.
all tf functions transferred to pure keras.
MTCNN transferred to pure keras, but it works slowly on plaidML (forced to CPU in this case).
default batch size for all models and VRAM sizes is now 4; feel free to adjust it on your own.
SAE: default style options are now ZERO, because there are no best values for all scenes; set them on your own.
SAE: brought back the pixel_loss option; feel free to enable it on your own.
SAE: added option multiscale_decoder; default is true, but you can disable it to get 100% the same behaviour as the H, DF, and LIAEF models.
fixed converter output to .png.
added Linux fork reference to doc/doc_build_and_repository_info.md.
This commit is contained in:
parent
3a9d450281
commit
72ba6b103c
24 changed files with 2694 additions and 1489 deletions
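Note: the commit message above mentions the 'force_plaidML' environment variable for backend selection. A minimal sketch of how it could be used, assuming device.py reads the variable at import time (the exact value it checks for is an assumption, not confirmed by this diff):

    import os

    # Hypothetical usage: request the plaidML/OpenCL backend before DeepFaceLab
    # imports device.py; '1' is an assumed truthy value.
    os.environ['force_plaidML'] = '1'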
344  __dev/port.py  Normal file
@@ -0,0 +1,344 @@
#import FaceLandmarksExtractor

import numpy as np
import dlib
import torch
import keras
from keras import backend as K
from keras import layers as KL
import math
import os
import time
import code

class TorchBatchNorm2D(keras.engine.topology.Layer):
    def __init__(self, axis=-1, momentum=0.99, epsilon=1e-3, **kwargs):
        super(TorchBatchNorm2D, self).__init__(**kwargs)
        self.supports_masking = True
        self.axis = axis
        self.momentum = momentum
        self.epsilon = epsilon

    def build(self, input_shape):
        dim = input_shape[self.axis]
        if dim is None:
            raise ValueError('Axis ' + str(self.axis) + ' of '
                             'input tensor should have a defined dimension '
                             'but the layer received an input with shape ' +
                             str(input_shape) + '.')
        shape = (dim,)
        self.gamma = self.add_weight(shape=shape, name='gamma', initializer='ones', regularizer=None, constraint=None)
        self.beta = self.add_weight(shape=shape, name='beta', initializer='zeros', regularizer=None, constraint=None)
        self.moving_mean = self.add_weight(shape=shape, name='moving_mean', initializer='zeros', trainable=False)
        self.moving_variance = self.add_weight(shape=shape, name='moving_variance', initializer='ones', trainable=False)
        self.built = True

    def call(self, inputs, training=None):
        input_shape = K.int_shape(inputs)

        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis]

        broadcast_moving_mean = K.reshape(self.moving_mean, broadcast_shape)
        broadcast_moving_variance = K.reshape(self.moving_variance, broadcast_shape)
        broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
        broadcast_beta = K.reshape(self.beta, broadcast_shape)
        invstd = K.ones (shape=broadcast_shape, dtype='float32') / K.sqrt(broadcast_moving_variance + K.constant(self.epsilon, dtype='float32'))

        return (inputs - broadcast_moving_mean) * invstd * broadcast_gamma + broadcast_beta

    def get_config(self):
        config = { 'axis': self.axis, 'momentum': self.momentum, 'epsilon': self.epsilon }
        base_config = super(TorchBatchNorm2D, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


def t2kw_conv2d (src):
    if src.bias is not None:
        return [ np.moveaxis(src.weight.data.cpu().numpy(), [0,1,2,3], [3,2,0,1]), src.bias.data.cpu().numpy() ]
    else:
        return [ np.moveaxis(src.weight.data.cpu().numpy(), [0,1,2,3], [3,2,0,1])]


def t2kw_bn2d(src):
    return [ src.weight.data.cpu().numpy(), src.bias.data.cpu().numpy(), src.running_mean.cpu().numpy(), src.running_var.cpu().numpy() ]


import face_alignment
fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D,enable_cuda=False,enable_cudnn=False,use_cnn_face_detector=True).face_alignemnt_net
fa.eval()


def KerasConvBlock(in_planes, out_planes, input, srctorch):
    out1 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.bn1) )(input)
    out1 = KL.Activation( keras.backend.relu ) (out1)
    out1 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(out1)
    out1 = KL.convolutional.Conv2D( int(out_planes/2), kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.conv1) ) (out1)

    out2 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.bn2) )(out1)
    out2 = KL.Activation( keras.backend.relu ) (out2)
    out2 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(out2)
    out2 = KL.convolutional.Conv2D( int(out_planes/4), kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.conv2) ) (out2)

    out3 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.bn3) )(out2)
    out3 = KL.Activation( keras.backend.relu ) (out3)
    out3 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(out3)
    out3 = KL.convolutional.Conv2D( int(out_planes/4), kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.conv3) ) (out3)

    out3 = KL.Concatenate(axis=1)([out1, out2, out3])

    if in_planes != out_planes:
        downsample = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(srctorch.downsample[0]) )(input)
        downsample = KL.Activation( keras.backend.relu ) (downsample)
        downsample = KL.convolutional.Conv2D( out_planes, kernel_size=1, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(srctorch.downsample[2]) ) (downsample)
        out3 = KL.add ( [out3, downsample] )
    else:
        out3 = KL.add ( [out3, input] )

    return out3

def KerasHourGlass (depth, input, srctorch):

    up1 = KerasConvBlock(256, 256, input, srctorch._modules['b1_%d' % (depth)])

    low1 = KL.AveragePooling2D (pool_size=2, strides=2, data_format='channels_first', padding='valid' )(input)
    low1 = KerasConvBlock (256, 256, low1, srctorch._modules['b2_%d' % (depth)])

    if depth > 1:
        low2 = KerasHourGlass (depth-1, low1, srctorch)
    else:
        low2 = KerasConvBlock(256, 256, low1, srctorch._modules['b2_plus_%d' % (depth)])

    low3 = KerasConvBlock(256, 256, low2, srctorch._modules['b3_%d' % (depth)])

    up2 = KL.UpSampling2D(size=2, data_format='channels_first') (low3)
    return KL.add ( [up1, up2] )

model_path = os.path.join( os.path.dirname(__file__) , "2DFAN-4.h5" )
if os.path.exists (model_path):
    t = time.time()
    model = keras.models.load_model (model_path, custom_objects={'TorchBatchNorm2D': TorchBatchNorm2D} )
    print ('load takes = %f' %( time.time() - t ) )
else:
    _input = keras.layers.Input ( shape=(3, 256,256) )
    x = KL.ZeroPadding2D(padding=(3, 3), data_format='channels_first')(_input)
    x = KL.convolutional.Conv2D( 64, kernel_size=7, strides=2, data_format='channels_first', padding='valid', weights=t2kw_conv2d(fa.conv1) ) (x)

    x = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(fa.bn1) )(x)
    x = KL.Activation( keras.backend.relu ) (x)

    x = KerasConvBlock (64, 128, x, fa.conv2)
    x = KL.AveragePooling2D (pool_size=2, strides=2, data_format='channels_first', padding='valid' ) (x)
    x = KerasConvBlock (128, 128, x, fa.conv3)
    x = KerasConvBlock (128, 256, x, fa.conv4)

    outputs = []
    previous = x
    for i in range(4):
        ll = KerasHourGlass (4, previous, fa._modules['m%d' % (i) ])
        ll = KerasConvBlock (256,256, ll, fa._modules['top_m_%d' % (i)])

        ll = KL.convolutional.Conv2D(256, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['conv_last%d' % (i)] ) ) (ll)
        ll = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d( fa._modules['bn_end%d' % (i)] ) )(ll)
        ll = KL.Activation( keras.backend.relu ) (ll)

        tmp_out = KL.convolutional.Conv2D(68, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['l%d' % (i)] ) ) (ll)
        outputs.append(tmp_out)
        if i < 4 - 1:
            ll = KL.convolutional.Conv2D(256, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['bl%d' % (i)] ) ) (ll)
            previous = KL.add ( [previous, ll, KL.convolutional.Conv2D(256, kernel_size=1, strides=1, data_format='channels_first', padding='valid', weights=t2kw_conv2d( fa._modules['al%d' % (i)] ) ) (tmp_out) ] )

    model = keras.models.Model (_input, outputs)
    model.compile ( loss='mse', optimizer='adam' )
    model.save (model_path)
    model.save_weights ( os.path.join( os.path.dirname(__file__) , 'weights.h5') )

def transform(point, center, scale, resolution, invert=False):
    _pt = torch.ones(3)
    _pt[0] = point[0]
    _pt[1] = point[1]

    h = 200.0 * scale
    t = torch.eye(3)
    t[0, 0] = resolution / h
    t[1, 1] = resolution / h
    t[0, 2] = resolution * (-center[0] / h + 0.5)
    t[1, 2] = resolution * (-center[1] / h + 0.5)

    if invert:
        t = torch.inverse(t)

    new_point = (torch.matmul(t, _pt))[0:2]

    return new_point.int()

def get_preds_fromhm(hm, center=None, scale=None):
    max, idx = torch.max( hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2)
    idx += 1
    preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float()
    preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1)
    preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1)

    for i in range(preds.size(0)):
        for j in range(preds.size(1)):
            hm_ = hm[i, j, :]
            pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1
            if pX > 0 and pX < 63 and pY > 0 and pY < 63:
                diff = torch.FloatTensor(
                    [hm_[pY, pX + 1] - hm_[pY, pX - 1],
                     hm_[pY + 1, pX] - hm_[pY - 1, pX]])
                preds[i, j].add_(diff.sign_().mul_(.25))

    preds.add_(-.5)

    preds_orig = torch.zeros(preds.size())
    if center is not None and scale is not None:
        for i in range(hm.size(0)):
            for j in range(hm.size(1)):
                preds_orig[i, j] = transform(
                    preds[i, j], center, scale, hm.size(2), True)

    return preds, preds_orig


def get_preds_fromhm2(a, center=None, scale=None):
    b = a.reshape ( (a.shape[0], a.shape[1]*a.shape[2]) )
    c = b.argmax(1).reshape ( (a.shape[0], 1) ).repeat(2, axis=1).astype(np.float)
    c[:,0] %= a.shape[2]
    c[:,1] = np.apply_along_axis ( lambda x: np.floor(x / a.shape[2]), 0, c[:,1] )

    for i in range(a.shape[0]):
        pX, pY = int(c[i,0]), int(c[i,1])
        if pX > 0 and pX < 63 and pY > 0 and pY < 63:
            diff = np.array ( [a[i,pY,pX+1]-a[i,pY,pX-1], a[i,pY+1,pX]-a[i,pY-1,pX]] )
            c[i] += np.sign(diff)*0.25

    c += 0.5
    result = np.empty ( (a.shape[0],2), dtype=np.int )
    if center is not None and scale is not None:
        for i in range(a.shape[0]):
            pt = np.array ( [c[i][0], c[i][1], 1.0] )
            h = 200.0 * scale
            m = np.eye(3)
            m[0,0] = a.shape[2] / h
            m[1,1] = a.shape[2] / h
            m[0,2] = a.shape[2] * ( -center[0] / h + 0.5 )
            m[1,2] = a.shape[2] * ( -center[1] / h + 0.5 )
            m = np.linalg.inv(m)
            result[i] = np.matmul (m, pt)[0:2].astype( np.int )
    return result


rnd_data = np.random.rand (3, 256,256).astype(np.float32)
#rnd_data = np.random.random_integers (2, size=(3, 256,256)).astype(np.float32)
#rnd_data = np.array ( [[[1]*256]*256]*3 , dtype=np.float32 )
input_data = np.array ([rnd_data])

fa_out_tensor = fa( torch.autograd.Variable( torch.from_numpy(input_data), volatile=True) )[-1].data.cpu()
fa_out = fa_out_tensor.numpy()

t = time.time()
m_out = model.predict ( input_data )[-1]
print ('predict takes = %f' %( time.time() - t ) )
t = time.time()

#fa_base_out = fa_base(torch.autograd.Variable( torch.from_numpy(input_data), volatile=True))[0].data.cpu().numpy()

print ( 'shapes = %s , %s , equal == %s ' % (fa_out.shape, m_out.shape, (fa_out.shape == m_out.shape) ) )
print ( 'allclose == %s' % ( np.allclose(fa_out, m_out) ) )
print ( 'total abs diff outputs = %f' % ( np.sum ( np.abs(np.ndarray.flatten(fa_out-m_out))) ))

###
d = dlib.rectangle(156,364,424,765)

center = torch.FloatTensor(
    [d.right() - (d.right() - d.left()) / 2.0, d.bottom() -
     (d.bottom() - d.top()) / 2.0])
center[1] = center[1] - (d.bottom() - d.top()) * 0.12
scale = (d.right() - d.left() + d.bottom() - d.top()) / 195.0
pts, pts_img = get_preds_fromhm (fa_out_tensor, center, scale)
pts_img = pts_img.view(68, 2).numpy()

###

m_pts_img = get_preds_fromhm2 (m_out[0], center, scale)

print ('pts1 == pts2 == %s' % ( np.array_equal(pts_img, m_pts_img) ) )

code.interact(local=dict(globals(), **locals()))

#print ( np.array_equal (fa_out, m_out) ) #>>> False
#code.interact(local=dict(globals(), **locals()))

#code.interact(local=locals())

#code.interact(local=locals())

###
#fa.conv1.weight = torch.nn.Parameter( torch.from_numpy ( np.array( [[[[1.0]*7]*7]*3]*64, dtype=np.float32) ) )
#fa.conv1.bias = torch.nn.Parameter( torch.from_numpy ( np.array( [1.0]*64, dtype=np.float32 ) ) )
#model.layers[2].set_weights( [ np.array( [[[[1.0]*64]*3]*7]*7, dtype=np.float32), np.array( [1.0]*64, dtype=np.float32 ) ] )

#b = np.array( [1.0]*64, dtype=np.float32 )
#b = np.random.rand (64).astype(np.float32)
#w = np.array( [[[[1.0]*7]*7]*3]*64, dtype=np.float32)
#w = np.random.rand (64, 3, 7, 7).astype(np.float32)
#s = w #fa_base.conv1.weight.data.cpu().numpy() #64x3x7x7
#d = np.moveaxis(s, [0,1,2,3], [3,2,0,1] )


#fa.conv1.weight = torch.nn.Parameter( torch.from_numpy ( w ) )
#fa.conv1.bias = torch.nn.Parameter( torch.from_numpy ( b ) )
#model.layers[2].set_weights( [np.transpose(w), b] )
#model.layers[2].set_weights( [d, b] )
'''
for i in range(0,64):
    for j in range(0,128):
        b = np.array_equal (fa_out[i,j], m_out[i,j])
        if b == False:
            print ( '%d %d == False' %(i,j) ) #>>> False
'''


'''
input = -2.7966828
gamma = 0.7640695571899414
beta = 0.22801123559474945
moving_mean = 0.12693816423416138
moving_variance = 0.10409101098775864
epsilon = 0.0 #0.00001

print ( gamma * (input - moving_mean) / math.sqrt(moving_variance + epsilon) + beta )
print ( (input - moving_mean) * (1.0 / math.sqrt(moving_variance) + epsilon)*gamma + beta )
'''
#code.interact(local=dict(globals(), **locals()))
'''
conv_64_128 = x
conv_64_128 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(fa.conv2.bn1) )(conv_64_128)
conv_64_128 = KL.Activation( keras.backend.relu ) (conv_64_128)
conv_64_128 = KL.ZeroPadding2D(padding=(1, 1), data_format='channels_first')(conv_64_128)
conv_64_128 = KL.convolutional.Conv2D( 64, kernel_size=3, strides=1, data_format='channels_first', padding='valid', use_bias = False, weights=t2kw_conv2d(fa.conv2.conv1) ) (conv_64_128)
conv_64_128 = TorchBatchNorm2D(axis=1, momentum=0.1, epsilon=1e-05, weights=t2kw_bn2d(fa.conv2.bn2) )(conv_64_128)
conv_64_128 = KL.Activation( keras.backend.relu ) (conv_64_128)
'''
#
#keras result = gamma * (input - moving_mean) / sqrt(moving_variance + epsilon) + beta
#
# (input - mean / scale_factor) / sqrt(var / scale_factor + eps)
#
#input = -3.0322433
#
#gamma = 0.1859646
#beta = -0.17041835
#moving_mean = -3.0345056
#moving_variance = 8.773307
#epsilon = 0.00001
#
#result = - 0.17027631
#
# fa result = 1.930317
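The t2kw_conv2d helper in port.py above relies on np.moveaxis to convert PyTorch's OIHW convolution-kernel layout into the HWIO layout Keras expects. A small standalone check of that axis mapping (variable names here are illustrative only, not from the repo):

    import numpy as np

    # PyTorch Conv2d kernels: (out_ch, in_ch, kH, kW) = OIHW.
    # Keras Conv2D kernels:   (kH, kW, in_ch, out_ch) = HWIO.
    w_torch = np.random.rand(64, 3, 7, 7).astype(np.float32)    # OIHW
    w_keras = np.moveaxis(w_torch, [0, 1, 2, 3], [3, 2, 0, 1])  # -> HWIO

    assert w_keras.shape == (7, 7, 3, 64)
    # Spot-check one element: filter 5, input channel 2, kernel position (1, 3).
    assert w_keras[1, 3, 2, 5] == w_torch[5, 2, 1, 3]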
1282  __dev/test.py  Normal file
File diff suppressed because it is too large
doc/doc_build_and_repository_info.md

@@ -2,6 +2,7 @@

DeepFaceLab officially supports Windows only. If you want to support Mac/Linux/Docker, create a fork and it will be referenced here.

[Linux fork](https://github.com/lbfs/DeepFaceLab_Linux) by @lbfs

#### **Installing dlib on Windows**
facelib/MTCExtractor.py

@@ -3,15 +3,11 @@ import os
import cv2

from pathlib import Path

from .mtcnn import *
from nnlib import nnlib

class MTCExtractor(object):
    def __init__(self, keras, tf, tf_session):
    def __init__(self):
        self.scale_to = 1920
        self.keras = keras
        self.tf = tf
        self.tf_session = tf_session

        self.min_face_size = self.scale_to * 0.042
        self.thresh1 = 0.7

@@ -19,25 +15,72 @@ class MTCExtractor(object):
        self.thresh3 = 0.6
        self.scale_factor = 0.95

        exec( nnlib.import_all(), locals(), globals() )
        PNet_Input = Input ( (None, None,3) )
        x = PNet_Input
        x = Conv2D (10, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x)
        x = PReLU (shared_axes=[1,2], name="PReLU1" )(x)
        x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x)
        x = Conv2D (16, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x)
        x = PReLU (shared_axes=[1,2], name="PReLU2" )(x)
        x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x)
        x = PReLU (shared_axes=[1,2], name="PReLU3" )(x)
        prob = Conv2D (2, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv41")(x)
        prob = Softmax()(prob)
        x = Conv2D (4, kernel_size=(1,1), strides=(1,1), padding='valid', name="conv42")(x)

        PNet_model = Model(PNet_Input, [x,prob] )
        PNet_model.load_weights ( (Path(__file__).parent / 'mtcnn_pnet.h5').__str__() )

        RNet_Input = Input ( (24, 24, 3) )
        x = RNet_Input
        x = Conv2D (28, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x)
        x = PReLU (shared_axes=[1,2], name="prelu1" )(x)
        x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x)
        x = Conv2D (48, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x)
        x = PReLU (shared_axes=[1,2], name="prelu2" )(x)
        x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x)
        x = Conv2D (64, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv3")(x)
        x = PReLU (shared_axes=[1,2], name="prelu3" )(x)
        x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x)
        x = Dense (128, name='conv4')(x)
        x = PReLU (name="prelu4" )(x)
        prob = Dense (2, name='conv51')(x)
        prob = Softmax()(prob)
        x = Dense (4, name='conv52')(x)
        RNet_model = Model(RNet_Input, [x,prob] )
        RNet_model.load_weights ( (Path(__file__).parent / 'mtcnn_rnet.h5').__str__() )

        ONet_Input = Input ( (48, 48, 3) )
        x = ONet_Input
        x = Conv2D (32, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv1")(x)
        x = PReLU (shared_axes=[1,2], name="prelu1" )(x)
        x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='same' ) (x)
        x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv2")(x)
        x = PReLU (shared_axes=[1,2], name="prelu2" )(x)
        x = MaxPooling2D( pool_size=(3,3), strides=(2,2), padding='valid' ) (x)
        x = Conv2D (64, kernel_size=(3,3), strides=(1,1), padding='valid', name="conv3")(x)
        x = PReLU (shared_axes=[1,2], name="prelu3" )(x)
        x = MaxPooling2D( pool_size=(2,2), strides=(2,2), padding='same' ) (x)
        x = Conv2D (128, kernel_size=(2,2), strides=(1,1), padding='valid', name="conv4")(x)
        x = PReLU (shared_axes=[1,2], name="prelu4" )(x)
        x = Lambda ( lambda x: K.reshape (x, (-1, np.prod(K.int_shape(x)[1:]),) ), output_shape=(np.prod(K.int_shape(x)[1:]),) ) (x)
        x = Dense (256, name='conv5')(x)
        x = PReLU (name="prelu5" )(x)
        prob = Dense (2, name='conv61')(x)
        prob = Softmax()(prob)
        x1 = Dense (4, name='conv62')(x)
        x2 = Dense (10, name='conv63')(x)
        ONet_model = Model(ONet_Input, [x1,x2,prob] )
        ONet_model.load_weights ( (Path(__file__).parent / 'mtcnn_onet.h5').__str__() )

        self.pnet_fun = K.function ( PNet_model.inputs, PNet_model.outputs )
        self.rnet_fun = K.function ( RNet_model.inputs, RNet_model.outputs )
        self.onet_fun = K.function ( ONet_model.inputs, ONet_model.outputs )

    def __enter__(self):
        with self.tf.variable_scope('pnet2'):
            data = self.tf.placeholder(self.tf.float32, (None,None,None,3), 'input')
            pnet2 = PNet(self.tf, {'data':data})
            pnet2.load(str(Path(__file__).parent/'det1.npy'), self.tf_session)
        with self.tf.variable_scope('rnet2'):
            data = self.tf.placeholder(self.tf.float32, (None,24,24,3), 'input')
            rnet2 = RNet(self.tf, {'data':data})
            rnet2.load(str(Path(__file__).parent/'det2.npy'), self.tf_session)
        with self.tf.variable_scope('onet2'):
            data = self.tf.placeholder(self.tf.float32, (None,48,48,3), 'input')
            onet2 = ONet(self.tf, {'data':data})
            onet2.load(str(Path(__file__).parent/'det3.npy'), self.tf_session)

        self.pnet_fun = self.keras.backend.function([pnet2.layers['data']],[pnet2.layers['conv4-2'], pnet2.layers['prob1']])
        self.rnet_fun = self.keras.backend.function([rnet2.layers['data']],[rnet2.layers['conv5-2'], rnet2.layers['prob1']])
        self.onet_fun = self.keras.backend.function([onet2.layers['data']],[onet2.layers['conv6-2'], onet2.layers['conv6-3'], onet2.layers['prob1']])

        faces, pnts = detect_face ( np.zeros ( (self.scale_to, self.scale_to, 3)), self.min_face_size, self.pnet_fun, self.rnet_fun, self.onet_fun, [ self.thresh1, self.thresh2, self.thresh3 ], self.scale_factor )

        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):

@@ -47,7 +90,6 @@ class MTCExtractor(object):
        input_image = input_image[:,:,::-1].copy()
        (h, w, ch) = input_image.shape

        input_scale = self.scale_to / (w if w > h else h)
        input_image = cv2.resize (input_image, ( int(w*input_scale), int(h*input_scale) ), interpolation=cv2.INTER_LINEAR)

@@ -56,3 +98,249 @@ class MTCExtractor(object):

        return detected_faces

def detect_face(img, minsize, pnet, rnet, onet, threshold, factor):
    """Detects faces in an image, and returns bounding boxes and points for them.
    img: input image
    minsize: minimum faces' size
    pnet, rnet, onet: caffemodel
    threshold: threshold=[th1, th2, th3], th1-3 are three steps's threshold
    factor: the factor used to create a scaling pyramid of face sizes to detect in the image.
    """
    factor_count=0
    total_boxes=np.empty((0,9))
    points=np.empty(0)
    h=img.shape[0]
    w=img.shape[1]
    minl=np.amin([h, w])
    m=12.0/minsize
    minl=minl*m
    # create scale pyramid
    scales=[]
    while minl>=12:
        scales += [m*np.power(factor, factor_count)]
        minl = minl*factor
        factor_count += 1
    # first stage
    for scale in scales:
        hs=int(np.ceil(h*scale))
        ws=int(np.ceil(w*scale))
        #print ('scale %f %d %d' % (scale, ws,hs))
        im_data = imresample(img, (hs, ws))
        im_data = (im_data-127.5)*0.0078125
        img_x = np.expand_dims(im_data, 0)
        img_y = np.transpose(img_x, (0,2,1,3))
        out = pnet([img_y])
        out0 = np.transpose(out[0], (0,2,1,3))
        out1 = np.transpose(out[1], (0,2,1,3))

        boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0])

        # inter-scale nms
        pick = nms(boxes.copy(), 0.5, 'Union')
        if boxes.size>0 and pick.size>0:
            boxes = boxes[pick,:]
            total_boxes = np.append(total_boxes, boxes, axis=0)

    numbox = total_boxes.shape[0]
    if numbox>0:
        pick = nms(total_boxes.copy(), 0.7, 'Union')
        total_boxes = total_boxes[pick,:]
        regw = total_boxes[:,2]-total_boxes[:,0]
        regh = total_boxes[:,3]-total_boxes[:,1]
        qq1 = total_boxes[:,0]+total_boxes[:,5]*regw
        qq2 = total_boxes[:,1]+total_boxes[:,6]*regh
        qq3 = total_boxes[:,2]+total_boxes[:,7]*regw
        qq4 = total_boxes[:,3]+total_boxes[:,8]*regh
        total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]]))
        total_boxes = rerec(total_boxes.copy())
        total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)

    numbox = total_boxes.shape[0]
    if numbox>0:
        # second stage
        tempimg = np.zeros((24,24,3,numbox))
        for k in range(0,numbox):
            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
                tempimg[:,:,:,k] = imresample(tmp, (24, 24))
            else:
                return np.empty()
        tempimg = (tempimg-127.5)*0.0078125
        tempimg1 = np.transpose(tempimg, (3,1,0,2))
        out = rnet([tempimg1])
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        score = out1[1,:]
        ipass = np.where(score>threshold[1])
        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
        mv = out0[:,ipass[0]]
        if total_boxes.shape[0]>0:
            pick = nms(total_boxes, 0.7, 'Union')
            total_boxes = total_boxes[pick,:]
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick]))
            total_boxes = rerec(total_boxes.copy())

    numbox = total_boxes.shape[0]
    if numbox>0:
        # third stage
        total_boxes = np.fix(total_boxes).astype(np.int32)
        dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
        tempimg = np.zeros((48,48,3,numbox))
        for k in range(0,numbox):
            tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
            tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
            if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
                tempimg[:,:,:,k] = imresample(tmp, (48, 48))
            else:
                return np.empty()
        tempimg = (tempimg-127.5)*0.0078125
        tempimg1 = np.transpose(tempimg, (3,1,0,2))
        out = onet([tempimg1])
        out0 = np.transpose(out[0])
        out1 = np.transpose(out[1])
        out2 = np.transpose(out[2])
        score = out2[1,:]
        points = out1
        ipass = np.where(score>threshold[2])
        points = points[:,ipass[0]]
        total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
        mv = out0[:,ipass[0]]

        w = total_boxes[:,2]-total_boxes[:,0]+1
        h = total_boxes[:,3]-total_boxes[:,1]+1
        points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1
        points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1
        if total_boxes.shape[0]>0:
            total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
            pick = nms(total_boxes.copy(), 0.7, 'Min')
            total_boxes = total_boxes[pick,:]
            points = points[:,pick]

    return total_boxes, points


# function [boundingbox] = bbreg(boundingbox,reg)
def bbreg(boundingbox,reg):
    """Calibrate bounding boxes"""
    if reg.shape[1]==1:
        reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))

    w = boundingbox[:,2]-boundingbox[:,0]+1
    h = boundingbox[:,3]-boundingbox[:,1]+1
    b1 = boundingbox[:,0]+reg[:,0]*w
    b2 = boundingbox[:,1]+reg[:,1]*h
    b3 = boundingbox[:,2]+reg[:,2]*w
    b4 = boundingbox[:,3]+reg[:,3]*h
    boundingbox[:,0:4] = np.transpose(np.vstack([b1, b2, b3, b4 ]))
    return boundingbox

def generateBoundingBox(imap, reg, scale, t):
    """Use heatmap to generate bounding boxes"""
    stride=2
    cellsize=12

    imap = np.transpose(imap)
    dx1 = np.transpose(reg[:,:,0])
    dy1 = np.transpose(reg[:,:,1])
    dx2 = np.transpose(reg[:,:,2])
    dy2 = np.transpose(reg[:,:,3])
    y, x = np.where(imap >= t)
    if y.shape[0]==1:
        dx1 = np.flipud(dx1)
        dy1 = np.flipud(dy1)
        dx2 = np.flipud(dx2)
        dy2 = np.flipud(dy2)
    score = imap[(y,x)]
    reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ]))
    if reg.size==0:
        reg = np.empty((0,3))
    bb = np.transpose(np.vstack([y,x]))
    q1 = np.fix((stride*bb+1)/scale)
    q2 = np.fix((stride*bb+cellsize-1+1)/scale)
    boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg])
    return boundingbox, reg

# function pick = nms(boxes,threshold,type)
def nms(boxes, threshold, method):
    if boxes.size==0:
        return np.empty((0,3))
    x1 = boxes[:,0]
    y1 = boxes[:,1]
    x2 = boxes[:,2]
    y2 = boxes[:,3]
    s = boxes[:,4]
    area = (x2-x1+1) * (y2-y1+1)
    I = np.argsort(s)
    pick = np.zeros_like(s, dtype=np.int16)
    counter = 0
    while I.size>0:
        i = I[-1]
        pick[counter] = i
        counter += 1
        idx = I[0:-1]
        xx1 = np.maximum(x1[i], x1[idx])
        yy1 = np.maximum(y1[i], y1[idx])
        xx2 = np.minimum(x2[i], x2[idx])
        yy2 = np.minimum(y2[i], y2[idx])
        w = np.maximum(0.0, xx2-xx1+1)
        h = np.maximum(0.0, yy2-yy1+1)
        inter = w * h
        if method is 'Min':
            o = inter / np.minimum(area[i], area[idx])
        else:
            o = inter / (area[i] + area[idx] - inter)
        I = I[np.where(o<=threshold)]
    pick = pick[0:counter]
    return pick

# function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
def pad(total_boxes, w, h):
    """Compute the padding coordinates (pad the bounding boxes to square)"""
    tmpw = (total_boxes[:,2]-total_boxes[:,0]+1).astype(np.int32)
    tmph = (total_boxes[:,3]-total_boxes[:,1]+1).astype(np.int32)
    numbox = total_boxes.shape[0]

    dx = np.ones((numbox), dtype=np.int32)
    dy = np.ones((numbox), dtype=np.int32)
    edx = tmpw.copy().astype(np.int32)
    edy = tmph.copy().astype(np.int32)

    x = total_boxes[:,0].copy().astype(np.int32)
    y = total_boxes[:,1].copy().astype(np.int32)
    ex = total_boxes[:,2].copy().astype(np.int32)
    ey = total_boxes[:,3].copy().astype(np.int32)

    tmp = np.where(ex>w)
    edx.flat[tmp] = np.expand_dims(-ex[tmp]+w+tmpw[tmp],1)
    ex[tmp] = w

    tmp = np.where(ey>h)
    edy.flat[tmp] = np.expand_dims(-ey[tmp]+h+tmph[tmp],1)
    ey[tmp] = h

    tmp = np.where(x<1)
    dx.flat[tmp] = np.expand_dims(2-x[tmp],1)
    x[tmp] = 1

    tmp = np.where(y<1)
    dy.flat[tmp] = np.expand_dims(2-y[tmp],1)
    y[tmp] = 1

    return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph

# function [bboxA] = rerec(bboxA)
def rerec(bboxA):
    """Convert bboxA to square."""
    h = bboxA[:,3]-bboxA[:,1]
    w = bboxA[:,2]-bboxA[:,0]
    l = np.maximum(w, h)
    bboxA[:,0] = bboxA[:,0]+w*0.5-l*0.5
    bboxA[:,1] = bboxA[:,1]+h*0.5-l*0.5
    bboxA[:,2:4] = bboxA[:,0:2] + np.transpose(np.tile(l,(2,1)))
    return bboxA

def imresample(img, sz):
    im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_LINEAR) #@UndefinedVariable
    return im_data
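The detect_face function added above builds its image pyramid from minsize and factor: the scales are m * factor^k with m = 12/minsize, and the loop stops once the rescaled smaller image side would drop below PNet's 12-px window. A standalone sketch of that same loop using MTCExtractor's defaults (scale_to = 1920, min_face_size = 1920 * 0.042, scale_factor = 0.95); the function name here is illustrative only:

    import numpy as np

    def scale_pyramid(minsize, min_side, factor):
        # Mirrors the pyramid loop in detect_face above.
        m = 12.0 / minsize
        minl = min_side * m
        scales = []
        factor_count = 0
        while minl >= 12:
            scales.append(m * np.power(factor, factor_count))
            minl = minl * factor
            factor_count += 1
        return scales

    scales = scale_pyramid(1920 * 0.042, 1920, 0.95)
    print(len(scales), scales[0], scales[-1])  # 62 scales, from ~0.149 downward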
BIN  facelib/det1.npy
Binary file not shown.
761  facelib/mtcnn.py
@ -1,761 +0,0 @@
|
|||
# Source: https://github.com/davidsandberg/facenet/blob/master/src/align/
|
||||
|
||||
""" Tensorflow implementation of the face detection / alignment algorithm found at
|
||||
https://github.com/kpzhang93/MTCNN_face_detection_alignment
|
||||
"""
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2016 David Sandberg
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from six import string_types, iteritems
|
||||
|
||||
import numpy as np
|
||||
#from math import floor
|
||||
import cv2
|
||||
import os
|
||||
|
||||
def layer(op):
|
||||
"""Decorator for composable network layers."""
|
||||
|
||||
def layer_decorated(self, *args, **kwargs):
|
||||
# Automatically set a name if not provided.
|
||||
name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
|
||||
# Figure out the layer inputs.
|
||||
if len(self.terminals) == 0:
|
||||
raise RuntimeError('No input variables found for layer %s.' % name)
|
||||
elif len(self.terminals) == 1:
|
||||
layer_input = self.terminals[0]
|
||||
else:
|
||||
layer_input = list(self.terminals)
|
||||
# Perform the operation and get the output.
|
||||
layer_output = op(self, layer_input, *args, **kwargs)
|
||||
# Add to layer LUT.
|
||||
self.layers[name] = layer_output
|
||||
# This output is now the input for the next layer.
|
||||
self.feed(layer_output)
|
||||
# Return self for chained calls.
|
||||
return self
|
||||
|
||||
return layer_decorated
|
||||
|
||||
class Network(object):
|
||||
|
||||
def __init__(self, tf, inputs, trainable=True):
|
||||
# The input nodes for this network
|
||||
self.tf = tf
|
||||
self.inputs = inputs
|
||||
# The current list of terminal nodes
|
||||
self.terminals = []
|
||||
# Mapping from layer names to layers
|
||||
self.layers = dict(inputs)
|
||||
# If true, the resulting variables are set as trainable
|
||||
self.trainable = trainable
|
||||
|
||||
self.setup()
|
||||
|
||||
def setup(self):
|
||||
"""Construct the network. """
|
||||
raise NotImplementedError('Must be implemented by the subclass.')
|
||||
|
||||
def load(self, data_path, session, ignore_missing=False):
|
||||
"""Load network weights.
|
||||
data_path: The path to the numpy-serialized network weights
|
||||
session: The current TensorFlow session
|
||||
ignore_missing: If true, serialized weights for missing layers are ignored.
|
||||
"""
|
||||
data_dict = np.load(data_path, encoding='latin1').item() #pylint: disable=no-member
|
||||
|
||||
for op_name in data_dict:
|
||||
with self.tf.variable_scope(op_name, reuse=True):
|
||||
for param_name, data in iteritems(data_dict[op_name]):
|
||||
try:
|
||||
var = self.tf.get_variable(param_name)
|
||||
session.run(var.assign(data))
|
||||
except ValueError:
|
||||
if not ignore_missing:
|
||||
raise
|
||||
|
||||
def feed(self, *args):
|
||||
"""Set the input(s) for the next operation by replacing the terminal nodes.
|
||||
The arguments can be either layer names or the actual layers.
|
||||
"""
|
||||
assert len(args) != 0
|
||||
self.terminals = []
|
||||
for fed_layer in args:
|
||||
if isinstance(fed_layer, string_types):
|
||||
try:
|
||||
fed_layer = self.layers[fed_layer]
|
||||
except KeyError:
|
||||
raise KeyError('Unknown layer name fed: %s' % fed_layer)
|
||||
self.terminals.append(fed_layer)
|
||||
return self
|
||||
|
||||
def get_output(self):
|
||||
"""Returns the current network output."""
|
||||
return self.terminals[-1]
|
||||
|
||||
def get_unique_name(self, prefix):
|
||||
"""Returns an index-suffixed unique name for the given prefix.
|
||||
This is used for auto-generating layer names based on the type-prefix.
|
||||
"""
|
||||
ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
|
||||
return '%s_%d' % (prefix, ident)
|
||||
|
||||
def make_var(self, name, shape):
|
||||
"""Creates a new TensorFlow variable."""
|
||||
return self.tf.get_variable(name, shape, trainable=self.trainable)
|
||||
|
||||
def validate_padding(self, padding):
|
||||
"""Verifies that the padding is one of the supported ones."""
|
||||
assert padding in ('SAME', 'VALID')
|
||||
|
||||
@layer
|
||||
def conv(self,
|
||||
inp,
|
||||
k_h,
|
||||
k_w,
|
||||
c_o,
|
||||
s_h,
|
||||
s_w,
|
||||
name,
|
||||
relu=True,
|
||||
padding='SAME',
|
||||
group=1,
|
||||
biased=True):
|
||||
# Verify that the padding is acceptable
|
||||
self.validate_padding(padding)
|
||||
# Get the number of channels in the input
|
||||
c_i = int(inp.get_shape()[-1])
|
||||
# Verify that the grouping parameter is valid
|
||||
assert c_i % group == 0
|
||||
assert c_o % group == 0
|
||||
# Convolution for a given input and kernel
|
||||
convolve = lambda i, k: self.tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
|
||||
with self.tf.variable_scope(name) as scope:
|
||||
kernel = self.make_var('weights', shape=[k_h, k_w, c_i // group, c_o])
|
||||
# This is the common-case. Convolve the input without any further complications.
|
||||
output = convolve(inp, kernel)
|
||||
# Add the biases
|
||||
if biased:
|
||||
biases = self.make_var('biases', [c_o])
|
||||
output = self.tf.nn.bias_add(output, biases)
|
||||
if relu:
|
||||
# ReLU non-linearity
|
||||
output = self.tf.nn.relu(output, name=scope.name)
|
||||
return output
|
||||
|
||||
@layer
|
||||
def prelu(self, inp, name):
|
||||
with self.tf.variable_scope(name):
|
||||
i = int(inp.get_shape()[-1])
|
||||
alpha = self.make_var('alpha', shape=(i,))
|
||||
output = self.tf.nn.relu(inp) + self.tf.multiply(alpha, -self.tf.nn.relu(-inp))
|
||||
return output
|
||||
|
||||
@layer
|
||||
def max_pool(self, inp, k_h, k_w, s_h, s_w, name, padding='SAME'):
|
||||
self.validate_padding(padding)
|
||||
return self.tf.nn.max_pool(inp,
|
||||
ksize=[1, k_h, k_w, 1],
|
||||
strides=[1, s_h, s_w, 1],
|
||||
padding=padding,
|
||||
name=name)
|
||||
|
||||
@layer
|
||||
def fc(self, inp, num_out, name, relu=True):
|
||||
with self.tf.variable_scope(name):
|
||||
input_shape = inp.get_shape()
|
||||
if input_shape.ndims == 4:
|
||||
# The input is spatial. Vectorize it first.
|
||||
dim = 1
|
||||
for d in input_shape[1:].as_list():
|
||||
dim *= int(d)
|
||||
feed_in = self.tf.reshape(inp, [-1, dim])
|
||||
else:
|
||||
feed_in, dim = (inp, input_shape[-1].value)
|
||||
weights = self.make_var('weights', shape=[dim, num_out])
|
||||
biases = self.make_var('biases', [num_out])
|
||||
op = self.tf.nn.relu_layer if relu else self.tf.nn.xw_plus_b
|
||||
fc = op(feed_in, weights, biases, name=name)
|
||||
return fc
|
||||
|
||||
|
||||
"""
|
||||
Multi dimensional softmax,
|
||||
refer to https://github.com/tensorflow/tensorflow/issues/210
|
||||
compute softmax along the dimension of target
|
||||
the native softmax only supports batch_size x dimension
|
||||
"""
|
||||
@layer
|
||||
def softmax(self, target, axis, name=None):
|
||||
max_axis = self.tf.reduce_max(target, axis, keepdims=True)
|
||||
target_exp = self.tf.exp(target-max_axis)
|
||||
normalize = self.tf.reduce_sum(target_exp, axis, keepdims=True)
|
||||
softmax = self.tf.div(target_exp, normalize, name)
|
||||
return softmax
|
||||
|
||||
class PNet(Network):
|
||||
def setup(self):
|
||||
(self.feed('data') #pylint: disable=no-value-for-parameter, no-member
|
||||
.conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1')
|
||||
.prelu(name='PReLU1')
|
||||
.max_pool(2, 2, 2, 2, name='pool1')
|
||||
.conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2')
|
||||
.prelu(name='PReLU2')
|
||||
.conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3')
|
||||
.prelu(name='PReLU3')
|
||||
.conv(1, 1, 2, 1, 1, relu=False, name='conv4-1')
|
||||
.softmax(3,name='prob1'))
|
||||
|
||||
(self.feed('PReLU3') #pylint: disable=no-value-for-parameter
|
||||
.conv(1, 1, 4, 1, 1, relu=False, name='conv4-2'))
|
||||
|
||||
class RNet(Network):
|
||||
def setup(self):
|
||||
(self.feed('data') #pylint: disable=no-value-for-parameter, no-member
|
||||
.conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1')
|
||||
.prelu(name='prelu1')
|
||||
.max_pool(3, 3, 2, 2, name='pool1')
|
||||
.conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2')
|
||||
.prelu(name='prelu2')
|
||||
.max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
|
||||
.conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3')
|
||||
.prelu(name='prelu3')
|
||||
.fc(128, relu=False, name='conv4')
|
||||
.prelu(name='prelu4')
|
||||
.fc(2, relu=False, name='conv5-1')
|
||||
.softmax(1,name='prob1'))
|
||||
|
||||
(self.feed('prelu4') #pylint: disable=no-value-for-parameter
|
||||
.fc(4, relu=False, name='conv5-2'))
|
||||
|
||||
class ONet(Network):
|
||||
def setup(self):
|
||||
(self.feed('data') #pylint: disable=no-value-for-parameter, no-member
|
||||
.conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1')
|
||||
.prelu(name='prelu1')
|
||||
.max_pool(3, 3, 2, 2, name='pool1')
|
||||
.conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2')
|
||||
.prelu(name='prelu2')
|
||||
.max_pool(3, 3, 2, 2, padding='VALID', name='pool2')
|
||||
.conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3')
|
||||
.prelu(name='prelu3')
|
||||
.max_pool(2, 2, 2, 2, name='pool3')
|
||||
.conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4')
|
||||
.prelu(name='prelu4')
|
||||
.fc(256, relu=False, name='conv5')
|
||||
.prelu(name='prelu5')
|
||||
.fc(2, relu=False, name='conv6-1')
|
||||
.softmax(1, name='prob1'))
|
||||
|
||||
(self.feed('prelu5') #pylint: disable=no-value-for-parameter
|
||||
.fc(4, relu=False, name='conv6-2'))
|
||||
|
||||
(self.feed('prelu5') #pylint: disable=no-value-for-parameter
|
||||
.fc(10, relu=False, name='conv6-3'))
|
||||
|
||||
def detect_face(img, minsize, pnet, rnet, onet, threshold, factor):
|
||||
"""Detects faces in an image, and returns bounding boxes and points for them.
|
||||
img: input image
|
||||
minsize: minimum faces' size
|
||||
pnet, rnet, onet: caffemodel
|
||||
threshold: threshold=[th1, th2, th3], th1-3 are three steps's threshold
|
||||
factor: the factor used to create a scaling pyramid of face sizes to detect in the image.
|
||||
"""
|
||||
factor_count=0
|
||||
total_boxes=np.empty((0,9))
|
||||
points=np.empty(0)
|
||||
h=img.shape[0]
|
||||
w=img.shape[1]
|
||||
minl=np.amin([h, w])
|
||||
m=12.0/minsize
|
||||
minl=minl*m
|
||||
# create scale pyramid
|
||||
scales=[]
|
||||
while minl>=12:
|
||||
scales += [m*np.power(factor, factor_count)]
|
||||
minl = minl*factor
|
||||
factor_count += 1
|
||||
# first stage
|
||||
for scale in scales:
|
||||
hs=int(np.ceil(h*scale))
|
||||
ws=int(np.ceil(w*scale))
|
||||
#print ('scale %f %d %d' % (scale, ws,hs))
|
||||
im_data = imresample(img, (hs, ws))
|
||||
im_data = (im_data-127.5)*0.0078125
|
||||
img_x = np.expand_dims(im_data, 0)
|
||||
img_y = np.transpose(img_x, (0,2,1,3))
|
||||
out = pnet([img_y])
|
||||
out0 = np.transpose(out[0], (0,2,1,3))
|
||||
out1 = np.transpose(out[1], (0,2,1,3))
|
||||
|
||||
boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0])
|
||||
|
||||
# inter-scale nms
|
||||
pick = nms(boxes.copy(), 0.5, 'Union')
|
||||
if boxes.size>0 and pick.size>0:
|
||||
boxes = boxes[pick,:]
|
||||
total_boxes = np.append(total_boxes, boxes, axis=0)
|
||||
|
||||
numbox = total_boxes.shape[0]
|
||||
if numbox>0:
|
||||
pick = nms(total_boxes.copy(), 0.7, 'Union')
|
||||
total_boxes = total_boxes[pick,:]
|
||||
regw = total_boxes[:,2]-total_boxes[:,0]
|
||||
regh = total_boxes[:,3]-total_boxes[:,1]
|
||||
qq1 = total_boxes[:,0]+total_boxes[:,5]*regw
|
||||
qq2 = total_boxes[:,1]+total_boxes[:,6]*regh
|
||||
qq3 = total_boxes[:,2]+total_boxes[:,7]*regw
|
||||
qq4 = total_boxes[:,3]+total_boxes[:,8]*regh
|
||||
total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:,4]]))
|
||||
total_boxes = rerec(total_boxes.copy())
|
||||
total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32)
|
||||
dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
|
||||
|
||||
numbox = total_boxes.shape[0]
|
||||
if numbox>0:
|
||||
# second stage
|
||||
tempimg = np.zeros((24,24,3,numbox))
|
||||
for k in range(0,numbox):
|
||||
tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
|
||||
tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
|
||||
if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
|
||||
tempimg[:,:,:,k] = imresample(tmp, (24, 24))
|
||||
else:
|
||||
return np.empty()
|
||||
tempimg = (tempimg-127.5)*0.0078125
|
||||
tempimg1 = np.transpose(tempimg, (3,1,0,2))
|
||||
out = rnet([tempimg1])
|
||||
out0 = np.transpose(out[0])
|
||||
out1 = np.transpose(out[1])
|
||||
score = out1[1,:]
|
||||
ipass = np.where(score>threshold[1])
|
||||
total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
|
||||
mv = out0[:,ipass[0]]
|
||||
if total_boxes.shape[0]>0:
|
||||
pick = nms(total_boxes, 0.7, 'Union')
|
||||
total_boxes = total_boxes[pick,:]
|
||||
total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick]))
|
||||
total_boxes = rerec(total_boxes.copy())
|
||||
|
||||
numbox = total_boxes.shape[0]
|
||||
if numbox>0:
|
||||
# third stage
|
||||
total_boxes = np.fix(total_boxes).astype(np.int32)
|
||||
dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h)
|
||||
tempimg = np.zeros((48,48,3,numbox))
|
||||
for k in range(0,numbox):
|
||||
tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3))
|
||||
tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:]
|
||||
if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0:
|
||||
tempimg[:,:,:,k] = imresample(tmp, (48, 48))
|
||||
else:
|
||||
return np.empty()
|
||||
tempimg = (tempimg-127.5)*0.0078125
|
||||
tempimg1 = np.transpose(tempimg, (3,1,0,2))
|
||||
out = onet([tempimg1])
|
||||
out0 = np.transpose(out[0])
|
||||
out1 = np.transpose(out[1])
|
||||
out2 = np.transpose(out[2])
|
||||
score = out2[1,:]
|
||||
points = out1
|
||||
ipass = np.where(score>threshold[2])
|
||||
points = points[:,ipass[0]]
|
||||
total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)])
|
||||
mv = out0[:,ipass[0]]
|
||||
|
||||
w = total_boxes[:,2]-total_boxes[:,0]+1
|
||||
h = total_boxes[:,3]-total_boxes[:,1]+1
|
||||
points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1
|
||||
points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1
|
||||
if total_boxes.shape[0]>0:
|
||||
total_boxes = bbreg(total_boxes.copy(), np.transpose(mv))
|
||||
pick = nms(total_boxes.copy(), 0.7, 'Min')
|
||||
total_boxes = total_boxes[pick,:]
|
||||
points = points[:,pick]
|
||||
|
||||
return total_boxes, points
|
||||
|
||||
|
||||
def bulk_detect_face(images, detection_window_size_ratio, pnet, rnet, onet, threshold, factor):
|
||||
"""Detects faces in a list of images
|
||||
images: list containing input images
|
||||
detection_window_size_ratio: ratio of minimum face size to smallest image dimension
|
||||
pnet, rnet, onet: caffemodel
|
||||
threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold [0-1]
|
||||
factor: the factor used to create a scaling pyramid of face sizes to detect in the image.
|
||||
"""
|
||||
all_scales = [None] * len(images)
|
||||
images_with_boxes = [None] * len(images)
|
||||
|
||||
for i in range(len(images)):
|
||||
images_with_boxes[i] = {'total_boxes': np.empty((0, 9))}
|
||||
|
||||
# create scale pyramid
|
||||
for index, img in enumerate(images):
|
||||
all_scales[index] = []
|
||||
h = img.shape[0]
|
||||
w = img.shape[1]
|
||||
minsize = int(detection_window_size_ratio * np.minimum(w, h))
|
||||
factor_count = 0
|
||||
minl = np.amin([h, w])
|
||||
if minsize <= 12:
|
||||
minsize = 12
|
||||
|
||||
m = 12.0 / minsize
|
||||
minl = minl * m
|
||||
while minl >= 12:
|
||||
all_scales[index].append(m * np.power(factor, factor_count))
|
||||
minl = minl * factor
|
||||
factor_count += 1
|
||||
|
||||
# # # # # # # # # # # # #
|
||||
# first stage - fast proposal network (pnet) to obtain face candidates
|
||||
# # # # # # # # # # # # #
|
||||
|
||||
images_obj_per_resolution = {}
|
||||
|
||||
# TODO: use some type of rounding to number module 8 to increase probability that pyramid images will have the same resolution across input images
|
||||
|
||||
for index, scales in enumerate(all_scales):
|
||||
h = images[index].shape[0]
|
||||
w = images[index].shape[1]
|
||||
|
||||
for scale in scales:
|
||||
hs = int(np.ceil(h * scale))
|
||||
ws = int(np.ceil(w * scale))
|
||||
|
||||
if (ws, hs) not in images_obj_per_resolution:
|
||||
images_obj_per_resolution[(ws, hs)] = []
|
||||
|
||||
im_data = imresample(images[index], (hs, ws))
|
||||
im_data = (im_data - 127.5) * 0.0078125
|
||||
img_y = np.transpose(im_data, (1, 0, 2)) # caffe uses different dimensions ordering
|
||||
images_obj_per_resolution[(ws, hs)].append({'scale': scale, 'image': img_y, 'index': index})
|
||||
|
||||
for resolution in images_obj_per_resolution:
|
||||
images_per_resolution = [i['image'] for i in images_obj_per_resolution[resolution]]
|
||||
outs = pnet(images_per_resolution)
|
||||
|
||||
for index in range(len(outs[0])):
|
||||
scale = images_obj_per_resolution[resolution][index]['scale']
|
||||
image_index = images_obj_per_resolution[resolution][index]['index']
|
||||
out0 = np.transpose(outs[0][index], (1, 0, 2))
|
||||
out1 = np.transpose(outs[1][index], (1, 0, 2))
|
||||
|
||||
boxes, _ = generateBoundingBox(out1[:, :, 1].copy(), out0[:, :, :].copy(), scale, threshold[0])
|
||||
|
||||
# inter-scale nms
|
||||
pick = nms(boxes.copy(), 0.5, 'Union')
|
||||
if boxes.size > 0 and pick.size > 0:
|
||||
boxes = boxes[pick, :]
|
||||
images_with_boxes[image_index]['total_boxes'] = np.append(images_with_boxes[image_index]['total_boxes'],
|
||||
boxes,
|
||||
axis=0)
|
||||
|
||||
for index, image_obj in enumerate(images_with_boxes):
|
||||
numbox = image_obj['total_boxes'].shape[0]
|
||||
if numbox > 0:
|
||||
h = images[index].shape[0]
|
||||
w = images[index].shape[1]
|
||||
pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Union')
|
||||
image_obj['total_boxes'] = image_obj['total_boxes'][pick, :]
|
||||
regw = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0]
|
||||
regh = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1]
|
||||
qq1 = image_obj['total_boxes'][:, 0] + image_obj['total_boxes'][:, 5] * regw
|
||||
qq2 = image_obj['total_boxes'][:, 1] + image_obj['total_boxes'][:, 6] * regh
|
||||
qq3 = image_obj['total_boxes'][:, 2] + image_obj['total_boxes'][:, 7] * regw
|
||||
qq4 = image_obj['total_boxes'][:, 3] + image_obj['total_boxes'][:, 8] * regh
|
||||
image_obj['total_boxes'] = np.transpose(np.vstack([qq1, qq2, qq3, qq4, image_obj['total_boxes'][:, 4]]))
|
||||
image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy())
|
||||
image_obj['total_boxes'][:, 0:4] = np.fix(image_obj['total_boxes'][:, 0:4]).astype(np.int32)
|
||||
dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h)
|
||||
|
||||
numbox = image_obj['total_boxes'].shape[0]
|
||||
tempimg = np.zeros((24, 24, 3, numbox))
|
||||
|
||||
if numbox > 0:
|
||||
for k in range(0, numbox):
|
||||
tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
|
||||
tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :]
|
||||
if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
|
||||
tempimg[:, :, :, k] = imresample(tmp, (24, 24))
|
||||
else:
|
||||
return np.empty()
|
||||
|
||||
tempimg = (tempimg - 127.5) * 0.0078125
|
||||
image_obj['rnet_input'] = np.transpose(tempimg, (3, 1, 0, 2))
|
||||
|
||||
# # # # # # # # # # # # #
|
||||
# second stage - refinement of face candidates with rnet
|
||||
# # # # # # # # # # # # #
|
||||
|
||||
bulk_rnet_input = np.empty((0, 24, 24, 3))
|
||||
for index, image_obj in enumerate(images_with_boxes):
|
||||
if 'rnet_input' in image_obj:
|
||||
bulk_rnet_input = np.append(bulk_rnet_input, image_obj['rnet_input'], axis=0)
|
||||
|
||||
out = rnet(bulk_rnet_input)
|
||||
out0 = np.transpose(out[0])
|
||||
out1 = np.transpose(out[1])
|
||||
score = out1[1, :]
|
||||
|
||||
i = 0
|
||||
for index, image_obj in enumerate(images_with_boxes):
|
||||
if 'rnet_input' not in image_obj:
|
||||
continue
|
||||
|
||||
rnet_input_count = image_obj['rnet_input'].shape[0]
|
||||
score_per_image = score[i:i + rnet_input_count]
|
||||
out0_per_image = out0[:, i:i + rnet_input_count]
|
||||
|
||||
ipass = np.where(score_per_image > threshold[1])
|
||||
image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(),
|
||||
np.expand_dims(score_per_image[ipass].copy(), 1)])
|
||||
|
||||
mv = out0_per_image[:, ipass[0]]
|
||||
|
||||
if image_obj['total_boxes'].shape[0] > 0:
|
||||
h = images[index].shape[0]
|
||||
w = images[index].shape[1]
|
||||
pick = nms(image_obj['total_boxes'], 0.7, 'Union')
|
||||
image_obj['total_boxes'] = image_obj['total_boxes'][pick, :]
|
||||
image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv[:, pick]))
|
||||
image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy())
|
||||
|
||||
numbox = image_obj['total_boxes'].shape[0]
|
||||
|
||||
if numbox > 0:
|
||||
tempimg = np.zeros((48, 48, 3, numbox))
|
||||
image_obj['total_boxes'] = np.fix(image_obj['total_boxes']).astype(np.int32)
|
||||
dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h)
|
||||
|
||||
for k in range(0, numbox):
|
||||
tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
|
||||
tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :]
|
||||
if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
|
||||
tempimg[:, :, :, k] = imresample(tmp, (48, 48))
|
||||
else:
|
||||
return np.empty()
|
||||
tempimg = (tempimg - 127.5) * 0.0078125
|
||||
image_obj['onet_input'] = np.transpose(tempimg, (3, 1, 0, 2))
|
||||
|
||||
i += rnet_input_count
|
||||
|
||||
    # # # # # # # # # # # # #
    # third stage - further refinement and facial landmarks positions with onet
    # # # # # # # # # # # # #

    bulk_onet_input = np.empty((0, 48, 48, 3))
    for index, image_obj in enumerate(images_with_boxes):
        if 'onet_input' in image_obj:
            bulk_onet_input = np.append(bulk_onet_input, image_obj['onet_input'], axis=0)

    out = onet(bulk_onet_input)

    out0 = np.transpose(out[0])
    out1 = np.transpose(out[1])
    out2 = np.transpose(out[2])
    score = out2[1, :]
    points = out1

    i = 0
    ret = []
    for index, image_obj in enumerate(images_with_boxes):
        if 'onet_input' not in image_obj:
            ret.append(None)
            continue

        onet_input_count = image_obj['onet_input'].shape[0]

        out0_per_image = out0[:, i:i + onet_input_count]
        score_per_image = score[i:i + onet_input_count]
        points_per_image = points[:, i:i + onet_input_count]

        ipass = np.where(score_per_image > threshold[2])
        points_per_image = points_per_image[:, ipass[0]]

        image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(),
                                              np.expand_dims(score_per_image[ipass].copy(), 1)])
        mv = out0_per_image[:, ipass[0]]

        w = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + 1
        h = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + 1
        points_per_image[0:5, :] = np.tile(w, (5, 1)) * points_per_image[0:5, :] + np.tile(
            image_obj['total_boxes'][:, 0], (5, 1)) - 1
        points_per_image[5:10, :] = np.tile(h, (5, 1)) * points_per_image[5:10, :] + np.tile(
            image_obj['total_boxes'][:, 1], (5, 1)) - 1

        if image_obj['total_boxes'].shape[0] > 0:
            image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv))
            pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Min')
            image_obj['total_boxes'] = image_obj['total_boxes'][pick, :]
            points_per_image = points_per_image[:, pick]

            ret.append((image_obj['total_boxes'], points_per_image))
        else:
            ret.append(None)

        i += onet_input_count

    return ret

# function [boundingbox] = bbreg(boundingbox,reg)
def bbreg(boundingbox, reg):
    """Calibrate bounding boxes"""
    if reg.shape[1] == 1:
        reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))

    w = boundingbox[:, 2] - boundingbox[:, 0] + 1
    h = boundingbox[:, 3] - boundingbox[:, 1] + 1
    b1 = boundingbox[:, 0] + reg[:, 0] * w
    b2 = boundingbox[:, 1] + reg[:, 1] * h
    b3 = boundingbox[:, 2] + reg[:, 2] * w
    b4 = boundingbox[:, 3] + reg[:, 3] * h
    boundingbox[:, 0:4] = np.transpose(np.vstack([b1, b2, b3, b4]))
    return boundingbox

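For intuition, a minimal sketch of the calibration arithmetic on one box (values invented for illustration; the score column is preserved because only columns 0:4 are rewritten):

    import numpy as np
    # one box (x1, y1, x2, y2, score) and one regression vector (dx1, dy1, dx2, dy2)
    box = np.array([[10., 10., 109., 109., 0.9]])   # w = h = 100
    reg = np.array([[0.1, 0.1, -0.1, -0.1]])        # shift each edge inward by 10% of w/h
    # bbreg moves every edge by offset * box size: x1 -> 10 + 0.1*100 = 20, etc.
    print(bbreg(box.copy(), reg.copy()))            # [[20. 20. 99. 99.  0.9]]
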
def generateBoundingBox(imap, reg, scale, t):
    """Use heatmap to generate bounding boxes"""
    stride = 2
    cellsize = 12

    imap = np.transpose(imap)
    dx1 = np.transpose(reg[:, :, 0])
    dy1 = np.transpose(reg[:, :, 1])
    dx2 = np.transpose(reg[:, :, 2])
    dy2 = np.transpose(reg[:, :, 3])
    y, x = np.where(imap >= t)
    if y.shape[0] == 1:
        dx1 = np.flipud(dx1)
        dy1 = np.flipud(dy1)
        dx2 = np.flipud(dx2)
        dy2 = np.flipud(dy2)
    score = imap[(y, x)]
    reg = np.transpose(np.vstack([dx1[(y, x)], dy1[(y, x)], dx2[(y, x)], dy2[(y, x)]]))
    if reg.size == 0:
        reg = np.empty((0, 3))
    bb = np.transpose(np.vstack([y, x]))
    q1 = np.fix((stride * bb + 1) / scale)
    q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale)
    boundingbox = np.hstack([q1, q2, np.expand_dims(score, 1), reg])
    return boundingbox, reg

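Each heatmap cell corresponds to a 12x12 window placed with stride 2 in the scaled image, and q1/q2 map the cell back to original-image coordinates. A quick check of the arithmetic with invented values:

    import numpy as np
    stride, cellsize, scale = 2, 12, 0.5
    bb = np.array([5, 7])                                  # heatmap cell (y, x)
    q1 = np.fix((stride * bb + 1) / scale)                 # top-left in image coords -> [22., 30.]
    q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale)  # bottom-right            -> [44., 52.]
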
# function pick = nms(boxes,threshold,type)
def nms(boxes, threshold, method):
    if boxes.size == 0:
        return np.empty((0, 3))
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    s = boxes[:, 4]
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    I = np.argsort(s)
    pick = np.zeros_like(s, dtype=np.int16)
    counter = 0
    while I.size > 0:
        i = I[-1]
        pick[counter] = i
        counter += 1
        idx = I[0:-1]
        xx1 = np.maximum(x1[i], x1[idx])
        yy1 = np.maximum(y1[i], y1[idx])
        xx2 = np.minimum(x2[i], x2[idx])
        yy2 = np.minimum(y2[i], y2[idx])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        if method == 'Min':  # was "method is 'Min'": 'is' tests identity, not equality; '==' is the correct comparison
            o = inter / np.minimum(area[i], area[idx])
        else:
            o = inter / (area[i] + area[idx] - inter)
        I = I[np.where(o <= threshold)]
    pick = pick[0:counter]
    return pick

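A minimal usage sketch with invented boxes — the middle box overlaps the top-scoring one beyond the threshold and is suppressed:

    import numpy as np
    boxes = np.array([[ 0.,  0., 10., 10., 0.9],    # kept: highest score
                      [ 1.,  1., 11., 11., 0.8],    # IoU with the first ~0.70 > 0.5 -> suppressed
                      [20., 20., 30., 30., 0.7]])   # disjoint -> kept
    print(nms(boxes, 0.5, 'Union'))                 # [0 2]
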
# function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
def pad(total_boxes, w, h):
    """Compute the padding coordinates (pad the bounding boxes to square)"""
    tmpw = (total_boxes[:, 2] - total_boxes[:, 0] + 1).astype(np.int32)
    tmph = (total_boxes[:, 3] - total_boxes[:, 1] + 1).astype(np.int32)
    numbox = total_boxes.shape[0]

    dx = np.ones((numbox), dtype=np.int32)
    dy = np.ones((numbox), dtype=np.int32)
    edx = tmpw.copy().astype(np.int32)
    edy = tmph.copy().astype(np.int32)

    x = total_boxes[:, 0].copy().astype(np.int32)
    y = total_boxes[:, 1].copy().astype(np.int32)
    ex = total_boxes[:, 2].copy().astype(np.int32)
    ey = total_boxes[:, 3].copy().astype(np.int32)

    tmp = np.where(ex > w)
    edx.flat[tmp] = np.expand_dims(-ex[tmp] + w + tmpw[tmp], 1)
    ex[tmp] = w

    tmp = np.where(ey > h)
    edy.flat[tmp] = np.expand_dims(-ey[tmp] + h + tmph[tmp], 1)
    ey[tmp] = h

    tmp = np.where(x < 1)
    dx.flat[tmp] = np.expand_dims(2 - x[tmp], 1)
    x[tmp] = 1

    tmp = np.where(y < 1)
    dy.flat[tmp] = np.expand_dims(2 - y[tmp], 1)
    y[tmp] = 1

    return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph

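A quick sketch of the clamping with an invented box hanging off the left edge of a 100x100 image — the source region is clipped to the image, and dx records where the clipped pixels land inside the square crop:

    import numpy as np
    boxes = np.array([[-4., 10., 16., 30., 0.9]])   # a 21x21 box, 5px off the left edge
    dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(boxes, 100, 100)
    print(x[0], dx[0], tmpw[0])   # 1 6 21  -> read from x=1, paste starting at column 6 of the 21px crop
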
# function [bboxA] = rerec(bboxA)
def rerec(bboxA):
    """Convert bboxA to square."""
    h = bboxA[:, 3] - bboxA[:, 1]
    w = bboxA[:, 2] - bboxA[:, 0]
    l = np.maximum(w, h)
    bboxA[:, 0] = bboxA[:, 0] + w * 0.5 - l * 0.5
    bboxA[:, 1] = bboxA[:, 1] + h * 0.5 - l * 0.5
    bboxA[:, 2:4] = bboxA[:, 0:2] + np.transpose(np.tile(l, (2, 1)))
    return bboxA

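For example (invented values), a 20x40 portrait box becomes a 40x40 square around the same center:

    import numpy as np
    box = np.array([[10., 10., 30., 50., 0.9]])
    print(rerec(box.copy()))   # [[ 0. 10. 40. 50.  0.9]]
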
def imresample(img, sz):
    im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_LINEAR)  #@UndefinedVariable
    return im_data

# This method is kept for debugging purpose
#     h=img.shape[0]
#     w=img.shape[1]
#     hs, ws = sz
#     dx = float(w) / ws
#     dy = float(h) / hs
#     im_data = np.zeros((hs,ws,3))
#     for a1 in range(0,hs):
#         for a2 in range(0,ws):
#             for a3 in range(0,3):
#                 im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3]
#     return im_data

Binary file not shown.

BIN facelib/mtcnn_pnet.h5 Normal file
Binary file not shown.

2 main.py
@@ -137,6 +137,8 @@ if __name__ == "__main__":
    if arguments.tf_suppress_std:
        os.environ['TF_SUPPRESS_STD'] = '1'

    #os.environ['force_plaidML'] = '1'

    arguments.func(arguments)

    print ("Done.")

@@ -149,7 +149,8 @@ class ConvertSubprocessor(SubprocessorBase):
        files_processed = 1
        faces_processed = 0

        output_filename_path = self.output_path / filename_path.name
        output_filename_path = self.output_path / (filename_path.stem + '.png')

        if self.converter.get_mode() == ConverterBase.MODE_FACE and filename_path.stem not in self.alignments.keys():
            if not self.debug:
                print ( 'no faces found for %s, copying without faces' % (filename_path.name) )

@@ -62,23 +62,35 @@ class ExtractSubprocessor(SubprocessorBase):

        cv2.setMouseCallback(self.wnd_name, onMouse, self.param)

    def get_devices_for_type (self, type, multi_gpu):
        if (type == 'rects' or type == 'landmarks'):
            if multi_gpu:
                devices = nnlib.device.getDevicesWithAtLeastTotalMemoryGB(2)
    def get_devices_for_type (self, type, multi_gpu, cpu_only):
        if not cpu_only and (type == 'rects' or type == 'landmarks'):
            if type == 'rects' and self.detector == 'mt' and nnlib.device.backend == "plaidML":
                cpu_only = True
            else:
                if multi_gpu:
                    devices = nnlib.device.getValidDevicesWithAtLeastTotalMemoryGB(2)
                if not multi_gpu or len(devices) == 0:
                    devices = [nnlib.device.getBestValidDeviceIdx()]
                if len(devices) == 0:
                    devices = [0]

            if not multi_gpu or len(devices) == 0:
                devices = [nnlib.device.getBestDeviceIdx()]
                for idx in devices:
                    dev_name = nnlib.device.getDeviceName(idx)
                    dev_vram = nnlib.device.getDeviceVRAMTotalGb(idx)

            if len(devices) == 0:
                devices = [0]
                    if not self.manual and self.type == 'rects' and self.detector == 'mt':
                        for i in range ( int (max (1, dev_vram / 2) ) ):
                            yield (idx, 'GPU', '%s #%d' % (dev_name,i) , dev_vram)
                    else:
                        yield (idx, 'GPU', dev_name, dev_vram)

            devices = [ (idx, nnlib.device.getDeviceName(idx), nnlib.device.getDeviceVRAMTotalGb(idx) ) for idx in devices]
        if cpu_only and (type == 'rects' or type == 'landmarks'):
            for i in range( min(8, multiprocessing.cpu_count() // 2) ):
                yield (i, 'CPU', 'CPU%d' % (i), 0 )

        elif type == 'final':
            devices = [ (i, 'CPU%d' % (i), 0 ) for i in range(0, multiprocessing.cpu_count()) ]

            return devices
        if type == 'final':
            for i in range( min(8, multiprocessing.cpu_count()) ):
                yield (i, 'CPU', 'CPU%d' % (i), 0 )

    #override
    def process_info_generator(self):

@@ -89,31 +101,13 @@ class ExtractSubprocessor(SubprocessorBase):
                     'output_dir': str(self.output_path),
                     'detector': self.detector}

        if not self.cpu_only:
            for (device_idx, device_name, device_total_vram_gb) in self.get_devices_for_type(self.type, self.multi_gpu):
                num_processes = 1
                if not self.manual and self.type == 'rects' and self.detector == 'mt':
                    num_processes = int ( max (1, device_total_vram_gb / 2) )
        for (device_idx, device_type, device_name, device_total_vram_gb) in self.get_devices_for_type(self.type, self.multi_gpu, self.cpu_only):
            client_dict = base_dict.copy()
            client_dict['device_idx'] = device_idx
            client_dict['device_name'] = device_name
            client_dict['device_type'] = device_type
            yield client_dict['device_name'], {}, client_dict

                for i in range(0, num_processes ):
                    client_dict = base_dict.copy()
                    client_dict['device_idx'] = device_idx
                    client_dict['device_name'] = device_name if num_processes == 1 else '%s #%d' % (device_name,i)
                    client_dict['device_type'] = 'GPU'

                    yield client_dict['device_name'], {}, client_dict
        else:
            num_processes = 1
            if not self.manual and self.type == 'rects' and self.detector == 'mt':
                num_processes = int ( max (1, multiprocessing.cpu_count() / 2 ) )

            for i in range(0, num_processes ):
                client_dict = base_dict.copy()
                client_dict['device_idx'] = 0
                client_dict['device_name'] = 'CPU' if num_processes == 1 else 'CPU #%d' % (i),
                client_dict['device_type'] = 'CPU'

                yield client_dict['device_name'], {}, client_dict

    #override
    def get_no_process_started_message(self):

@@ -265,13 +259,12 @@ class ExtractSubprocessor(SubprocessorBase):
        self.detector = client_dict['detector']

        self.e = None

        device_config = nnlib.DeviceConfig ( cpu_only=self.cpu_only, force_gpu_idx=self.device_idx, allow_growth=True)
        if self.type == 'rects':
            if self.detector is not None:
                if self.detector == 'mt':
                    nnlib.import_all (device_config)
                    self.e = facelib.MTCExtractor(nnlib.keras, nnlib.tf, nnlib.tf_sess)
                    self.e = facelib.MTCExtractor()
                elif self.detector == 'dlib':
                    nnlib.import_dlib (device_config)
                    self.e = facelib.DLIBExtractor(nnlib.dlib)

@@ -22,7 +22,7 @@ class ModelBase(object):
    def __init__(self, model_path, training_data_src_path=None, training_data_dst_path=None, debug = False, force_gpu_idx=-1, **in_options):

        if force_gpu_idx == -1:
            idxs_names_list = nnlib.device.getAllDevicesIdxsWithNamesList()
            idxs_names_list = nnlib.device.getValidDevicesIdxsWithNamesList()
            if len(idxs_names_list) > 1:
                print ("You have multi GPUs in a system: ")
                for idx, name in idxs_names_list:

@@ -16,14 +16,14 @@ class Model(ModelBase):
    def onInitializeOptions(self, is_first_run, ask_override):
        if is_first_run or ask_override:
            def_pixel_loss = self.options.get('pixel_loss', False)
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
        else:
            self.options['pixel_loss'] = self.options.get('pixel_loss', False)

    #override
    def onInitialize(self, **in_options):
        exec(nnlib.import_all(), locals(), globals())
        self.set_vram_batch_requirements( {4.5:4,5:6,6:8,7:16,8:24,9:24,10:32,11:32,12:32,13:48} )
        self.set_vram_batch_requirements( {4.5:4} )

        ae_input_layer = Input(shape=(128, 128, 3))
        mask_layer = Input(shape=(128, 128, 1)) #same as output

@@ -24,14 +24,14 @@ class Model(ModelBase):

        if is_first_run or ask_override:
            def_pixel_loss = self.options.get('pixel_loss', False)
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
        else:
            self.options['pixel_loss'] = self.options.get('pixel_loss', False)

    #override
    def onInitialize(self, **in_options):
        exec(nnlib.import_all(), locals(), globals())
        self.set_vram_batch_requirements( {2.5:2,3:2,4:2,4:4,5:8,6:12,7:16,8:16,9:24,10:24,11:32,12:32,13:48} )
        self.set_vram_batch_requirements( {2.5:4} )

        bgr_shape, mask_shape, self.encoder, self.decoder_src, self.decoder_dst = self.Build( self.options['lighter_ae'] )
        if not self.is_first_run():

@@ -24,14 +24,14 @@ class Model(ModelBase):

        if is_first_run or ask_override:
            def_pixel_loss = self.options.get('pixel_loss', False)
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
        else:
            self.options['pixel_loss'] = self.options.get('pixel_loss', False)

    #override
    def onInitialize(self, **in_options):
        exec(nnlib.import_all(), locals(), globals())
        self.set_vram_batch_requirements( {1.5:2,2:2,3:8,4:16,5:24,6:32,7:40,8:48} )
        self.set_vram_batch_requirements( {1.5:4} )

        bgr_shape, mask_shape, self.encoder, self.decoder_src, self.decoder_dst = self.Build(self.options['lighter_ae'])

@@ -17,14 +17,14 @@ class Model(ModelBase):
    def onInitializeOptions(self, is_first_run, ask_override):
        if is_first_run or ask_override:
            def_pixel_loss = self.options.get('pixel_loss', False)
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and remove face jitter.")
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 20k epochs to enhance fine details and decrease face jitter.")
        else:
            self.options['pixel_loss'] = self.options.get('pixel_loss', False)

    #override
    def onInitialize(self, **in_options):
        exec(nnlib.import_all(), locals(), globals())
        self.set_vram_batch_requirements( {4.5:4,5:4,6:8,7:12,8:16,9:20,10:24,11:24,12:32,13:48} )
        self.set_vram_batch_requirements( {4.5:4} )

        ae_input_layer = Input(shape=(128, 128, 3))
        mask_layer = Input(shape=(128, 128, 1)) #same as output

@@ -29,29 +29,13 @@ class SAEModel(ModelBase):
        if is_first_run:
            self.options['resolution'] = input_int("Resolution (64,128 ?:help skip:128) : ", default_resolution, [64,128], help_message="More resolution requires more VRAM.")
            self.options['face_type'] = input_str ("Half or Full face? (h/f, ?:help skip:f) : ", default_face_type, ['h','f'], help_message="Half face has better resolution, but covers less area of cheeks.").lower()
            self.options['learn_mask'] = input_bool ("Learn mask? (y/n, ?:help skip:y) : ", True, help_message="Learning mask can help model to recognize face directions. Learn without mask can reduce model size, in this case converter forced to use 'not predicted mask' that is not smooth as predicted. Model with style values can be learned without mask and produce same quality result.")
            self.options['archi'] = input_str ("AE architecture (df, liae, ?:help skip:%s) : " % (default_archi) , default_archi, ['df','liae'], help_message="DF keeps faces more natural, while LIAE can fix overly different face shapes.").lower()
            self.options['lighter_encoder'] = input_bool ("Use lightweight encoder? (y/n, ?:help skip:n) : ", False, help_message="Lightweight encoder is 35% faster, requires less VRAM, sacrificing overall quality.")
            self.options['learn_mask'] = input_bool ("Learn mask? (y/n, ?:help skip:y) : ", True, help_message="Choose NO to reduce model size. In this case converter forced to use 'not predicted mask' that is not smooth as predicted. Styled SAE can learn without mask and produce same quality fake.")
        else:
            self.options['resolution'] = self.options.get('resolution', default_resolution)
            self.options['face_type'] = self.options.get('face_type', default_face_type)
            self.options['archi'] = self.options.get('archi', default_archi)
            self.options['lighter_encoder'] = self.options.get('lighter_encoder', False)
            self.options['learn_mask'] = self.options.get('learn_mask', True)

        default_face_style_power = 10.0
        if is_first_run or ask_override:
            default_face_style_power = default_face_style_power if is_first_run else self.options.get('face_style_power', default_face_style_power)
            self.options['face_style_power'] = np.clip ( input_number("Face style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_face_style_power), default_face_style_power, help_message="How fast NN will learn dst face style during generalization of src and dst faces. If style is learned good enough, set this value to 0.01 to prevent artifacts appearing."), 0.0, 100.0 )
        else:
            self.options['face_style_power'] = self.options.get('face_style_power', default_face_style_power)

        default_bg_style_power = 10.0
        if is_first_run or ask_override:
            default_bg_style_power = default_bg_style_power if is_first_run else self.options.get('bg_style_power', default_bg_style_power)
            self.options['bg_style_power'] = np.clip ( input_number("Background style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_bg_style_power), default_bg_style_power, help_message="How fast NN will learn dst background style during generalization of src and dst faces. If style is learned good enough, set this value to 0.1-0.3 to prevent artifacts appearing."), 0.0, 100.0 )
        else:
            self.options['bg_style_power'] = self.options.get('bg_style_power', default_bg_style_power)
        self.options['archi'] = self.options.get('archi', default_archi)

        default_ae_dims = 256 if self.options['archi'] == 'liae' else 512
        default_ed_ch_dims = 42

@@ -62,13 +46,36 @@ class SAEModel(ModelBase):
        self.options['ae_dims'] = self.options.get('ae_dims', default_ae_dims)
        self.options['ed_ch_dims'] = self.options.get('ed_ch_dims', default_ed_ch_dims)

        if is_first_run:
            self.options['lighter_encoder'] = input_bool ("Use lightweight encoder? (y/n, ?:help skip:n) : ", False, help_message="Lightweight encoder is 35% faster, requires less VRAM, but sacrificing overall quality.")
            self.options['multiscale_decoder'] = input_bool ("Use multiscale decoder? (y/n, ?:help skip:y) : ", True, help_message="Multiscale decoder helps to get better details.")
        else:
            self.options['lighter_encoder'] = self.options.get('lighter_encoder', False)
            self.options['multiscale_decoder'] = self.options.get('multiscale_decoder', True)

        default_face_style_power = 0.0
        default_bg_style_power = 0.0
        if is_first_run or ask_override:
            def_pixel_loss = self.options.get('pixel_loss', False)
            self.options['pixel_loss'] = input_bool ("Use pixel loss? (y/n, ?:help skip: n/default ) : ", def_pixel_loss, help_message="Default DSSIM loss good for initial understanding structure of faces. Use pixel loss after 15-25k epochs to enhance fine details and decrease face jitter.")

            default_face_style_power = default_face_style_power if is_first_run else self.options.get('face_style_power', default_face_style_power)
            self.options['face_style_power'] = np.clip ( input_number("Face style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_face_style_power), default_face_style_power,
                                                                      help_message="Learn to transfer face style details such as light and color conditions. Warning: Enable it only after 10k epochs, when predicted face is clear enough to start learn style. Start from 0.1 value and check history changes."), 0.0, 100.0 )

            default_bg_style_power = default_bg_style_power if is_first_run else self.options.get('bg_style_power', default_bg_style_power)
            self.options['bg_style_power'] = np.clip ( input_number("Background style power ( 0.0 .. 100.0 ?:help skip:%.2f) : " % (default_bg_style_power), default_bg_style_power,
                                                                    help_message="Learn to transfer image around face. This can make face more like dst."), 0.0, 100.0 )
        else:
            self.options['pixel_loss'] = self.options.get('pixel_loss', False)
            self.options['face_style_power'] = self.options.get('face_style_power', default_face_style_power)
            self.options['bg_style_power'] = self.options.get('bg_style_power', default_bg_style_power)

    #override
    def onInitialize(self, **in_options):
        exec(nnlib.import_all(), locals(), globals())

        self.set_vram_batch_requirements({2:1,3:2,4:3,5:6,6:8,7:12,8:16})
        self.set_vram_batch_requirements({1.5:4})

        resolution = self.options['resolution']
        ae_dims = self.options['ae_dims']

@@ -77,7 +84,9 @@ class SAEModel(ModelBase):
        bgr_shape = (resolution, resolution, 3)
        mask_shape = (resolution, resolution, 1)

        dssim_pixel_alpha = Input( (1,) )
        self.ms_count = ms_count = 3 if self.options['multiscale_decoder'] else 1

        epoch_alpha = Input( (1,) )
        warped_src = Input(bgr_shape)
        target_src = Input(bgr_shape)
        target_srcm = Input(mask_shape)

@@ -86,6 +95,11 @@ class SAEModel(ModelBase):
        target_dst = Input(bgr_shape)
        target_dstm = Input(mask_shape)

        target_src_ar = [ Input ( ( bgr_shape[0] // (2**i) ,)*2 + (bgr_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
        target_srcm_ar = [ Input ( ( mask_shape[0] // (2**i) ,)*2 + (mask_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
        target_dst_ar = [ Input ( ( bgr_shape[0] // (2**i) ,)*2 + (bgr_shape[-1],) ) for i in range(ms_count-1, -1, -1)]
        target_dstm_ar = [ Input ( ( mask_shape[0] // (2**i) ,)*2 + (mask_shape[-1],) ) for i in range(ms_count-1, -1, -1)]

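For orientation, with resolution 128 and multiscale_decoder enabled (so ms_count = 3), the comprehensions above build one Input per scale, coarsest first; a quick check of the shape arithmetic:

    resolution, ms_count = 128, 3
    shapes = [ (resolution // (2**i),)*2 + (3,) for i in range(ms_count-1, -1, -1) ]
    print(shapes)   # [(32, 32, 3), (64, 64, 3), (128, 128, 3)]
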
        if self.options['archi'] == 'liae':
            self.encoder = modelify(SAEModel.LIAEEncFlow(resolution, adapt_k_size, self.options['lighter_encoder'], ed_ch_dims=ed_ch_dims) ) (Input(bgr_shape))

@@ -96,10 +110,10 @@ class SAEModel(ModelBase):

            inter_output_Inputs = [ Input( np.array(K.int_shape(x)[1:])*(1,1,2) ) for x in self.inter_B.outputs ]

            self.decoder = modelify(SAEModel.LIAEDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_decoder=True)) (inter_output_Inputs)
            self.decoder = modelify(SAEModel.LIAEDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_count=self.ms_count )) (inter_output_Inputs)

            if self.options['learn_mask']:
                self.decoderm = modelify(SAEModel.LIAEDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5), multiscale_decoder=False )) (inter_output_Inputs)
                self.decoderm = modelify(SAEModel.LIAEDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5) )) (inter_output_Inputs)

            if not self.is_first_run():
                self.encoder.load_weights (self.get_strpath_storage_for_file(self.encoderH5))

@@ -129,19 +143,17 @@ class SAEModel(ModelBase):
                pred_dst_dstm = self.decoderm(warped_dst_inter_code)
                pred_src_dstm = self.decoderm(warped_src_dst_inter_code)

        else:
            self.encoder = modelify(SAEModel.DFEncFlow(resolution, adapt_k_size, self.options['lighter_encoder'], ae_dims=ae_dims, ed_ch_dims=ed_ch_dims) ) (Input(bgr_shape))

            dec_Inputs = [ Input(K.int_shape(x)[1:]) for x in self.encoder.outputs ]

            self.decoder_src = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_decoder=True)) (dec_Inputs)
            self.decoder_dst = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_decoder=True)) (dec_Inputs)
            self.decoder_src = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_count=self.ms_count )) (dec_Inputs)
            self.decoder_dst = modelify(SAEModel.DFDecFlow (bgr_shape[2],ed_ch_dims=ed_ch_dims//2, multiscale_count=self.ms_count )) (dec_Inputs)

            if self.options['learn_mask']:
                self.decoder_srcm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5), multiscale_decoder=False)) (dec_Inputs)
                self.decoder_dstm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5), multiscale_decoder=False)) (dec_Inputs)
                self.decoder_srcm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5) )) (dec_Inputs)
                self.decoder_dstm = modelify(SAEModel.DFDecFlow (mask_shape[2],ed_ch_dims=int(ed_ch_dims/1.5) )) (dec_Inputs)

            if not self.is_first_run():
                self.encoder.load_weights (self.get_strpath_storage_for_file(self.encoderH5))

@@ -167,18 +179,11 @@ class SAEModel(ModelBase):
        if self.options['learn_mask']:
            pred_src_srcm, pred_dst_dstm, pred_src_dstm = [ [x] if type(x) != list else x for x in [pred_src_srcm, pred_dst_dstm, pred_src_dstm] ]

        ms_count = len(pred_src_src)

        target_src_ar = [ target_src if i == 0 else tf.image.resize_bicubic( target_src, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
        target_srcm_ar = [ target_srcm if i == 0 else tf.image.resize_bicubic( target_srcm, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
        target_dst_ar = [ target_dst if i == 0 else tf.image.resize_bicubic( target_dst, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]
        target_dstm_ar = [ target_dstm if i == 0 else tf.image.resize_bicubic( target_dstm, (resolution // (2**i) ,)*2 ) for i in range(ms_count-1, -1, -1)]

        target_srcm_blurred_ar = [ tf_gaussian_blur( max(1, x.get_shape().as_list()[1] // 32) )(x) for x in target_srcm_ar]
        target_srcm_blurred_ar = [ gaussian_blur( max(1, K.int_shape(x)[1] // 32) )(x) for x in target_srcm_ar]
        target_srcm_sigm_ar = [ x / 2.0 + 0.5 for x in target_srcm_blurred_ar]
        target_srcm_anti_sigm_ar = [ 1.0 - x for x in target_srcm_sigm_ar]

        target_dstm_blurred_ar = [ tf_gaussian_blur( max(1, x.get_shape().as_list()[1] // 32) )(x) for x in target_dstm_ar]
        target_dstm_blurred_ar = [ gaussian_blur( max(1, K.int_shape(x)[1] // 32) )(x) for x in target_dstm_ar]
        target_dstm_sigm_ar = [ x / 2.0 + 0.5 for x in target_dstm_blurred_ar]
        target_dstm_anti_sigm_ar = [ 1.0 - x for x in target_dstm_sigm_ar]

@@ -200,8 +205,6 @@ class SAEModel(ModelBase):
        def optimizer():
            return Adam(lr=5e-5, beta_1=0.5, beta_2=0.999)

        dssim_pixel_alpha_value = dssim_pixel_alpha[0][0]

        if self.options['archi'] == 'liae':
            src_dst_loss_train_weights = self.encoder.trainable_weights + self.inter_B.trainable_weights + self.inter_AB.trainable_weights + self.decoder.trainable_weights
            if self.options['learn_mask']:

@@ -211,34 +214,50 @@ class SAEModel(ModelBase):
        if self.options['learn_mask']:
            src_dst_mask_loss_train_weights = self.encoder.trainable_weights + self.decoder_srcm.trainable_weights + self.decoder_dstm.trainable_weights

        src_dssim_loss_batch = sum([ ( 100*K.square(tf_dssim(2.0)( target_src_masked_ar[i], pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ) )) for i in range(len(target_src_masked_ar)) ])
        src_pixel_loss_batch = sum([ tf_reduce_mean ( 100*K.square( target_src_masked_ar[i] - pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_src_masked_ar)) ])
        if not self.options['pixel_loss']:
            src_loss_batch = sum([ ( 100*K.square( dssim(max_value=2.0)( target_src_masked_ar[i], pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ) )) for i in range(len(target_src_masked_ar)) ])
        else:
            src_loss_batch = sum([ K.mean ( 100*K.square( target_src_masked_ar[i] - pred_src_src_sigm_ar[i] * target_srcm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_src_masked_ar)) ])

        src_loss_batch = src_dssim_loss_batch*(1.0-dssim_pixel_alpha_value) + src_pixel_loss_batch*dssim_pixel_alpha_value
        src_loss = K.mean(src_loss_batch)

        if self.options['face_style_power'] != 0:
            face_style_power = self.options['face_style_power'] / 100.0
            src_loss += tf_style_loss(gaussian_blur_radius=resolution // 8, loss_weight=0.2*face_style_power)( psd_target_dst_masked_ar[-1], target_dst_masked_ar[-1] )
        face_style_power = self.options['face_style_power'] / 100.0

        if self.options['bg_style_power'] != 0:
            bg_style_power = self.options['bg_style_power'] / 100.0
            bg_dssim_loss = K.mean( (100*bg_style_power)*K.square(tf_dssim(2.0)( psd_target_dst_anti_masked_ar[-1], target_dst_anti_masked_ar[-1] )))
            bg_pixel_loss = K.mean( (100*bg_style_power)*K.square( psd_target_dst_anti_masked_ar[-1] - target_dst_anti_masked_ar[-1] ))
            src_loss += bg_dssim_loss*(1.0-dssim_pixel_alpha_value) + bg_pixel_loss*dssim_pixel_alpha_value
        if face_style_power != 0:
            src_loss += style_loss(gaussian_blur_radius=resolution//16, loss_weight=face_style_power, wnd_size=0)( psd_target_dst_masked_ar[-1], target_dst_masked_ar[-1] )

        bg_style_power = self.options['bg_style_power'] / 100.0
        if bg_style_power != 0:
            if not self.options['pixel_loss']:
                bg_loss = K.mean( (100*bg_style_power)*K.square(dssim(max_value=2.0)( psd_target_dst_anti_masked_ar[-1], target_dst_anti_masked_ar[-1] )))
            else:
                bg_loss = K.mean( (100*bg_style_power)*K.square( psd_target_dst_anti_masked_ar[-1] - target_dst_anti_masked_ar[-1] ))
            src_loss += bg_loss

        if not self.options['pixel_loss']:
            dst_loss_batch = sum([ ( 100*K.square(dssim(max_value=2.0)( target_dst_masked_ar[i], pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ) )) for i in range(len(target_dst_masked_ar)) ])
        else:
            dst_loss_batch = sum([ K.mean ( 100*K.square( target_dst_masked_ar[i] - pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_dst_masked_ar)) ])

        dst_dssim_loss_batch = sum([ ( 100*K.square(tf_dssim(2.0)( target_dst_masked_ar[i], pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ) )) for i in range(len(target_dst_masked_ar)) ])
        dst_pixel_loss_batch = sum([ tf_reduce_mean ( 100*K.square( target_dst_masked_ar[i] - pred_dst_dst_sigm_ar[i] * target_dstm_sigm_ar[i] ), axis=[1,2,3]) for i in range(len(target_dst_masked_ar)) ])
        dst_loss_batch = dst_dssim_loss_batch*(1.0-dssim_pixel_alpha_value) + dst_pixel_loss_batch*dssim_pixel_alpha_value
        dst_loss = K.mean(dst_loss_batch)

        self.src_dst_train = K.function ([dssim_pixel_alpha, warped_src, target_src, target_srcm, warped_dst, target_dst, target_dstm ],[src_loss,dst_loss,src_loss_batch,dst_loss_batch], optimizer().get_updates(src_loss+dst_loss, src_dst_loss_train_weights) )
        feed = [warped_src, warped_dst]
        feed += target_src_ar[::-1]
        feed += target_srcm_ar[::-1]
        feed += target_dst_ar[::-1]
        feed += target_dstm_ar[::-1]

        self.src_dst_train = K.function (feed,[src_loss,dst_loss], optimizer().get_updates(src_loss+dst_loss, src_dst_loss_train_weights) )

        if self.options['learn_mask']:
            src_mask_loss = sum([ K.mean(K.square(target_srcm_ar[-1]-pred_src_srcm[-1])) for i in range(len(target_srcm_ar)) ])
            dst_mask_loss = sum([ K.mean(K.square(target_dstm_ar[-1]-pred_dst_dstm[-1])) for i in range(len(target_dstm_ar)) ])
            self.src_dst_mask_train = K.function ([warped_src, target_srcm, warped_dst, target_dstm],[src_mask_loss, dst_mask_loss], optimizer().get_updates(src_mask_loss+dst_mask_loss, src_dst_mask_loss_train_weights) )

            feed = [ warped_src, warped_dst]
            feed += target_srcm_ar[::-1]
            feed += target_dstm_ar[::-1]

            self.src_dst_mask_train = K.function (feed,[src_mask_loss, dst_mask_loss], optimizer().get_updates(src_mask_loss+dst_mask_loss, src_dst_mask_loss_train_weights) )

        if self.options['learn_mask']:
            self.AE_view = K.function ([warped_src, warped_dst], [pred_src_src[-1], pred_dst_dst[-1], pred_src_dst[-1], pred_src_dstm[-1]])

@@ -257,21 +276,20 @@ class SAEModel(ModelBase):

        f = SampleProcessor.TypeFlags
        face_type = f.FACE_ALIGN_FULL if self.options['face_type'] == 'f' else f.FACE_ALIGN_HALF

        output_sample_types=[ [f.WARPED_TRANSFORMED | face_type | f.MODE_BGR, resolution] ]
        output_sample_types += [ [f.TRANSFORMED | face_type | f.MODE_BGR, resolution // (2**i) ] for i in range(ms_count)]
        output_sample_types += [ [f.TRANSFORMED | face_type | f.MODE_M | f.FACE_MASK_FULL, resolution // (2**i) ] for i in range(ms_count)]

        self.set_training_data_generators ([
                SampleGeneratorFace(self.training_data_src_path, sort_by_yaw_target_samples_path=self.training_data_dst_path if self.sort_by_yaw else None,
                                    debug=self.is_debug(), batch_size=self.batch_size,
                                    sample_process_options=SampleProcessor.Options(random_flip=self.random_flip, normalize_tanh = True, scale_range=np.array([-0.05, 0.05])+self.src_scale_mod / 100.0 ),
                                    output_sample_types=[ [f.WARPED_TRANSFORMED | face_type | f.MODE_BGR, resolution],
                                                          [f.TRANSFORMED | face_type | f.MODE_BGR, resolution],
                                                          [f.TRANSFORMED | face_type | f.MODE_M | f.FACE_MASK_FULL, resolution]
                                                        ], add_sample_idx=True ),
                                    output_sample_types=output_sample_types ),

                SampleGeneratorFace(self.training_data_dst_path, debug=self.is_debug(), batch_size=self.batch_size,
                                    sample_process_options=SampleProcessor.Options(random_flip=self.random_flip, normalize_tanh = True),
                                    output_sample_types=[ [f.WARPED_TRANSFORMED | face_type | f.MODE_BGR, resolution],
                                                          [f.TRANSFORMED | face_type | f.MODE_BGR, resolution],
                                                          [f.TRANSFORMED | face_type | f.MODE_M | f.FACE_MASK_FULL, resolution]
                                                        ], add_sample_idx=True )
                                    output_sample_types=output_sample_types )
            ])
    #override
    def onSave(self):

@@ -297,17 +315,20 @@ class SAEModel(ModelBase):

    #override
    def onTrainOneEpoch(self, generators_samples, generators_list):
        warped_src, target_src, target_src_mask, src_sample_idxs = generators_samples[0]
        warped_dst, target_dst, target_dst_mask, dst_sample_idxs = generators_samples[1]
        src_samples = generators_samples[0]
        dst_samples = generators_samples[1]

        dssim_pixel_alpha = np.clip ( (self.epoch - 5000) / 15000.0, 0.0, 1.0 ) #smooth transition between DSSIM and MSE in 5-20k epochs
        dssim_pixel_alpha = np.repeat( dssim_pixel_alpha, (self.batch_size,) )
        dssim_pixel_alpha = np.expand_dims(dssim_pixel_alpha,-1)
        feed = [src_samples[0], dst_samples[0] ] + \
               src_samples[1:1+self.ms_count*2] + \
               dst_samples[1:1+self.ms_count*2]

        src_loss, dst_loss, src_sample_losses, dst_sample_losses = self.src_dst_train ([dssim_pixel_alpha, warped_src, target_src, target_src_mask, warped_dst, target_dst, target_dst_mask])
        src_loss, dst_loss, = self.src_dst_train (feed)

        if self.options['learn_mask']:
            src_mask_loss, dst_mask_loss, = self.src_dst_mask_train ([warped_src, target_src_mask, warped_dst, target_dst_mask])
            feed = [ src_samples[0], dst_samples[0] ] + \
                   src_samples[1+self.ms_count:1+self.ms_count*2] + \
                   dst_samples[1+self.ms_count:1+self.ms_count*2]
            src_mask_loss, dst_mask_loss, = self.src_dst_mask_train (feed)

        return ( ('src_loss', src_loss), ('dst_loss', dst_loss) )

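A note on the feed layout, worked through under the assumption ms_count = 3 (matching the output_sample_types built above): each generator yields the warped input at index 0, then the BGR targets per scale, then the full-face masks per scale, so the two slices pick out exactly the pieces each train function expects:

    ms_count = 3
    # per-generator sample list: [warped, bgr_s0, bgr_s1, bgr_s2, mask_s0, mask_s1, mask_s2]
    src_samples = list(range(7))                          # stand-in indices for the real arrays
    bgr_and_masks = src_samples[1:1+ms_count*2]           # -> [1, 2, 3, 4, 5, 6] for src_dst_train
    masks_only    = src_samples[1+ms_count:1+ms_count*2]  # -> [4, 5, 6] for src_dst_mask_train
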
@@ -430,7 +451,7 @@ class SAEModel(ModelBase):
        return func

    @staticmethod
    def LIAEDecFlow(output_nc,ed_ch_dims=21, multiscale_decoder=True):
    def LIAEDecFlow(output_nc,ed_ch_dims=21, multiscale_count=1):
        exec (nnlib.import_all(), locals(), globals())
        ed_dims = output_nc * ed_ch_dims

@@ -449,12 +470,12 @@ class SAEModel(ModelBase):
            outputs = []
            x1 = upscale(ed_dims*8)( x )

            if multiscale_decoder:
            if multiscale_count >= 3:
                outputs += [ to_bgr() ( x1 ) ]

            x2 = upscale(ed_dims*4)( x1 )

            if multiscale_decoder:
            if multiscale_count >= 2:
                outputs += [ to_bgr() ( x2 ) ]

            x3 = upscale(ed_dims*2)( x2 )

@@ -513,7 +534,7 @@ class SAEModel(ModelBase):
        return func

    @staticmethod
    def DFDecFlow(output_nc, ed_ch_dims=21, multiscale_decoder=True):
    def DFDecFlow(output_nc, ed_ch_dims=21, multiscale_count=1):
        exec (nnlib.import_all(), locals(), globals())
        ed_dims = output_nc * ed_ch_dims

@@ -535,12 +556,12 @@ class SAEModel(ModelBase):
            outputs = []
            x1 = upscale(ed_dims*8)( x )

            if multiscale_decoder:
            if multiscale_count >= 3:
                outputs += [ to_bgr() ( x1 ) ]

            x2 = upscale(ed_dims*4)( x1 )

            if multiscale_decoder:
            if multiscale_count >= 2:
                outputs += [ to_bgr() ( x2 ) ]

            x3 = upscale(ed_dims*2)( x2 )

333 nnlib/device.py Normal file

@@ -0,0 +1,333 @@
import os
import json
import numpy as np
from .pynvml import *

tf_min_req_cap = 37 #min req compute capability for tensorflow-gpu==1.11.0

class device:
    backend = None
    class Config():
        force_gpu_idx = -1
        multi_gpu = False
        force_gpu_idxs = None
        choose_worst_gpu = False
        gpu_idxs = []
        gpu_names = []
        gpu_compute_caps = []
        gpu_vram_gb = []
        allow_growth = True
        use_fp16 = False
        cpu_only = False
        backend = None
        def __init__ (self, force_gpu_idx = -1,
                            multi_gpu = False,
                            force_gpu_idxs = None,
                            choose_worst_gpu = False,
                            allow_growth = True,
                            use_fp16 = False,
                            cpu_only = False,
                            **in_options):

            self.backend = device.backend
            self.use_fp16 = use_fp16
            self.cpu_only = cpu_only

            if not self.cpu_only:
                self.cpu_only = (self.backend == "tensorflow-cpu")

            if not self.cpu_only:
                self.force_gpu_idx = force_gpu_idx
                self.multi_gpu = multi_gpu
                self.force_gpu_idxs = force_gpu_idxs
                self.choose_worst_gpu = choose_worst_gpu
                self.allow_growth = allow_growth

                self.gpu_idxs = []

                if force_gpu_idxs is not None:
                    for idx in force_gpu_idxs.split(','):
                        idx = int(idx)
                        if device.isValidDeviceIdx(idx):
                            self.gpu_idxs.append(idx)
                else:
                    gpu_idx = force_gpu_idx if (force_gpu_idx >= 0 and device.isValidDeviceIdx(force_gpu_idx)) else device.getBestValidDeviceIdx() if not choose_worst_gpu else device.getWorstValidDeviceIdx()
                    if gpu_idx != -1:
                        if self.multi_gpu:
                            self.gpu_idxs = device.getDeviceIdxsEqualModel( gpu_idx )
                            if len(self.gpu_idxs) <= 1:
                                self.multi_gpu = False
                        else:
                            self.gpu_idxs = [gpu_idx]

                self.cpu_only = (len(self.gpu_idxs) == 0)

                if not self.cpu_only:
                    self.gpu_names = []
                    self.gpu_compute_caps = []
                    self.gpu_vram_gb = []
                    for gpu_idx in self.gpu_idxs:
                        self.gpu_names += [device.getDeviceName(gpu_idx)]
                        self.gpu_compute_caps += [ device.getDeviceComputeCapability(gpu_idx) ]
                        self.gpu_vram_gb += [ device.getDeviceVRAMTotalGb(gpu_idx) ]
                    self.cpu_only = (len(self.gpu_idxs) == 0)

            if self.cpu_only:
                self.backend = "tensorflow-cpu"

    @staticmethod
    def getValidDeviceIdxsEnumerator():
        if device.backend == "plaidML":
            for i in range(plaidML_devices_count):
                yield i
        elif device.backend == "tensorflow":
            for gpu_idx in range(nvmlDeviceGetCount()):
                cap = device.getDeviceComputeCapability (gpu_idx)
                if cap >= tf_min_req_cap:
                    yield gpu_idx
        elif device.backend == "tensorflow-generic":
            yield 0

    @staticmethod
    def getValidDevicesWithAtLeastTotalMemoryGB(totalmemsize_gb):
        result = []
        if device.backend == "plaidML":
            for i in device.getValidDeviceIdxsEnumerator():
                if plaidML_devices[i]['globalMemSize'] >= totalmemsize_gb*1024*1024*1024:
                    result.append (i)
        elif device.backend == "tensorflow":
            for i in device.getValidDeviceIdxsEnumerator():
                handle = nvmlDeviceGetHandleByIndex(i)
                memInfo = nvmlDeviceGetMemoryInfo( handle )
                if (memInfo.total) >= totalmemsize_gb*1024*1024*1024:
                    result.append (i)
        elif device.backend == "tensorflow-generic":
            return [0]

        return result

    @staticmethod
    def getAllDevicesIdxsList():
        if device.backend == "plaidML":
            return [ *range(plaidML_devices_count) ]
        elif device.backend == "tensorflow":
            return [ *range(nvmlDeviceGetCount() ) ]
        elif device.backend == "tensorflow-generic":
            return [0]

    @staticmethod
    def getValidDevicesIdxsWithNamesList():
        if device.backend == "plaidML":
            return [ (i, plaidML_devices[i]['description'] ) for i in device.getValidDeviceIdxsEnumerator() ]
        elif device.backend == "tensorflow":
            return [ (i, nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() ) for i in device.getValidDeviceIdxsEnumerator() ]
        elif device.backend == "tensorflow-cpu":
            return [ (0, 'CPU') ]
        elif device.backend == "tensorflow-generic":
            return [ (0, device.getDeviceName(0) ) ]

    @staticmethod
    def getDeviceVRAMTotalGb (idx):
        if device.backend == "plaidML":
            if idx < plaidML_devices_count:
                return plaidML_devices[idx]['globalMemSize'] / (1024*1024*1024)
        elif device.backend == "tensorflow":
            if idx < nvmlDeviceGetCount():
                memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(idx) )
                return round ( memInfo.total / (1024*1024*1024) )

            return 0
        elif device.backend == "tensorflow-generic":
            return 2

    @staticmethod
    def getBestValidDeviceIdx():
        if device.backend == "plaidML":
            idx = -1
            idx_mem = 0
            for i in device.getValidDeviceIdxsEnumerator():
                total = plaidML_devices[i]['globalMemSize']
                if total > idx_mem:
                    idx = i
                    idx_mem = total

            return idx
        elif device.backend == "tensorflow":
            idx = -1
            idx_mem = 0
            for i in device.getValidDeviceIdxsEnumerator():
                memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
                if memInfo.total > idx_mem:
                    idx = i
                    idx_mem = memInfo.total

            return idx
        elif device.backend == "tensorflow-generic":
            return 0

    @staticmethod
    def getWorstValidDeviceIdx():
        if device.backend == "plaidML":
            idx = -1
            idx_mem = sys.maxsize
            for i in device.getValidDeviceIdxsEnumerator():
                total = plaidML_devices[i]['globalMemSize']
                if total < idx_mem:
                    idx = i
                    idx_mem = total

            return idx
        elif device.backend == "tensorflow":
            idx = -1
            idx_mem = sys.maxsize
            for i in device.getValidDeviceIdxsEnumerator():
                memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
                if memInfo.total < idx_mem:
                    idx = i
                    idx_mem = memInfo.total

            return idx
        elif device.backend == "tensorflow-generic":
            return 0

    @staticmethod
    def isValidDeviceIdx(idx):
        if device.backend == "plaidML":
            return idx in [*device.getValidDeviceIdxsEnumerator()]
        elif device.backend == "tensorflow":
            return idx in [*device.getValidDeviceIdxsEnumerator()]
        elif device.backend == "tensorflow-generic":
            return (idx == 0)

    @staticmethod
    def getDeviceIdxsEqualModel(idx):
        if device.backend == "plaidML":
            result = []
            idx_name = plaidML_devices[idx]['description']
            for i in device.getValidDeviceIdxsEnumerator():
                if plaidML_devices[i]['description'] == idx_name:
                    result.append (i)

            return result
        elif device.backend == "tensorflow":
            result = []
            idx_name = nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
            for i in device.getValidDeviceIdxsEnumerator():
                if nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() == idx_name:
                    result.append (i)

            return result
        elif device.backend == "tensorflow-generic":
            return [0] if idx == 0 else []

    @staticmethod
    def getDeviceName (idx):
        if device.backend == "plaidML":
            if idx < plaidML_devices_count:
                return plaidML_devices[idx]['description']
        elif device.backend == "tensorflow":
            if idx < nvmlDeviceGetCount():
                return nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
        elif device.backend == "tensorflow-generic":
            if idx == 0:
                return "Generic GeForce GPU"

        return None

    @staticmethod
    def getDeviceID (idx):
        if device.backend == "plaidML":
            if idx < plaidML_devices_count:
                return plaidML_devices[idx]['id'].decode()

        return None

    @staticmethod
    def getDeviceComputeCapability(idx):
        result = 0
        if device.backend == "plaidML":
            return 99
        elif device.backend == "tensorflow":
            if idx < nvmlDeviceGetCount():
                result = nvmlDeviceGetCudaComputeCapability(nvmlDeviceGetHandleByIndex(idx))
        elif device.backend == "tensorflow-generic":
            return 99 if idx == 0 else 0

        return result[0] * 10 + result[1]


force_plaidML = os.environ.get("force_plaidML", "0") == "1"
has_nvml = False
has_nvml_cap = False
has_nvidia_device = False
plaidML_devices = []

# Using plaidML OpenCL backend to determine system devices and has_nvidia_device
try:
    os.environ['PLAIDML_EXPERIMENTAL'] = 'false' # allows plaidML to work without running 'plaidml-setup' first
    import plaidml
    ctx = plaidml.Context()
    for d in plaidml.devices(ctx, return_all=True)[0]:
        details = json.loads(d.details)
        if 'nvidia' in details['vendor'].lower():
            has_nvidia_device = True
        plaidML_devices += [ {'id':d.id,
                              'globalMemSize' : int(details['globalMemSize']),
                              'description' : d.description.decode()
                              }]
    ctx.shutdown()
except:
    pass

plaidML_devices_count = len(plaidML_devices)

#choosing backend

if device.backend is None:
    # first try to load NVSMI and detect CUDA devices for the tensorflow backend,
    # even if force_plaidML is chosen, because if plaidML fails we can still choose tensorflow
    try:
        nvmlInit()
        has_nvml = True
        device.backend = "tensorflow" #set tensorflow backend in order to use device.*device() functions

        gpu_idxs = device.getAllDevicesIdxsList()
        gpu_caps = np.array ( [ device.getDeviceComputeCapability(gpu_idx) for gpu_idx in gpu_idxs ] )

        if len ( np.ndarray.flatten ( np.argwhere (gpu_caps >= tf_min_req_cap) ) ) == 0:
            if not force_plaidML:
                print ("No CUDA devices found with minimum required compute capability: %d.%d. Falling back to OpenCL mode." % (tf_min_req_cap // 10, tf_min_req_cap % 10) )
            device.backend = None
            nvmlShutdown()
        else:
            has_nvml_cap = True
    except:
        # if NVSMI is not installed, an exception occurs here
        device.backend = None
        has_nvml = False

if device.backend is None or force_plaidML:
    # the tensorflow backend failed or plaidML is forced, so try the plaidML backend
    if plaidML_devices_count == 0:
        print ("plaidML: No capable OpenCL devices found. Falling back to tensorflow backend.")
        device.backend = None
    else:
        device.backend = "plaidML"

if device.backend is None:
    if not has_nvml:
        if has_nvidia_device:
            # some notebook systems have an NVIDIA card without NVSMI in the official drivers;
            # in that case assume one capable GPU and let tensorflow choose the best GPU itself
            device.backend = "tensorflow-generic"
        else:
            # no NVSMI, no NVIDIA cards, and plaidML also failed, so CPU only
            device.backend = "tensorflow-cpu"
    else:
        if has_nvml_cap:
            # NVSMI and capable CUDA devices are present, but force_plaidML failed, so choose tensorflow
            device.backend = "tensorflow"
        else:
            # NVSMI present but no capable CUDA devices, and plaidML also failed, so CPU only
            device.backend = "tensorflow-cpu"

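Because this backend selection runs once at module import, the OpenCL engine can be forced from user code only by setting the environment variable first. A minimal sketch of the intended usage (assuming at least one OpenCL device is present; otherwise the code above falls back as described):

    import os
    os.environ['force_plaidML'] = '1'   # must be set before nnlib/device is imported

    from nnlib import nnlib             # importing triggers the backend choice above
    print(nnlib.device.backend)         # 'plaidML', or a tensorflow/CPU fallback
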
@ -1,186 +0,0 @@
|
|||
from .pynvml import *
|
||||
|
||||
try:
|
||||
nvmlInit()
|
||||
hasNVML = True
|
||||
except:
|
||||
hasNVML = False
|
||||
|
||||
class devicelib:
|
||||
class Config():
|
||||
force_gpu_idx = -1
|
||||
multi_gpu = False
|
||||
force_gpu_idxs = None
|
||||
choose_worst_gpu = False
|
||||
gpu_idxs = []
|
||||
gpu_names = []
|
||||
gpu_compute_caps = []
|
||||
gpu_vram_gb = []
|
||||
allow_growth = True
|
||||
use_fp16 = False
|
||||
cpu_only = False
|
||||
|
||||
def __init__ (self, force_gpu_idx = -1,
|
||||
multi_gpu = False,
|
||||
force_gpu_idxs = None,
|
||||
choose_worst_gpu = False,
|
||||
allow_growth = True,
|
||||
use_fp16 = False,
|
||||
cpu_only = False,
|
||||
**in_options):
|
||||
|
||||
self.use_fp16 = use_fp16
|
||||
if cpu_only:
|
||||
self.cpu_only = True
|
||||
else:
|
||||
self.force_gpu_idx = force_gpu_idx
|
||||
self.multi_gpu = multi_gpu
        self.force_gpu_idxs = force_gpu_idxs
        self.choose_worst_gpu = choose_worst_gpu
        self.allow_growth = allow_growth

        self.gpu_idxs = []

        if force_gpu_idxs is not None:
            for idx in force_gpu_idxs.split(','):
                idx = int(idx)
                if devicelib.isValidDeviceIdx(idx):
                    self.gpu_idxs.append(idx)
        else:
            gpu_idx = force_gpu_idx if (force_gpu_idx >= 0 and devicelib.isValidDeviceIdx(force_gpu_idx)) else devicelib.getBestDeviceIdx() if not choose_worst_gpu else devicelib.getWorstDeviceIdx()
            if gpu_idx != -1:
                if self.multi_gpu:
                    self.gpu_idxs = devicelib.getDeviceIdxsEqualModel( gpu_idx )
                    if len(self.gpu_idxs) <= 1:
                        self.multi_gpu = False
                else:
                    self.gpu_idxs = [gpu_idx]

        self.cpu_only = (len(self.gpu_idxs) == 0)

        if not self.cpu_only:
            self.gpu_names = []
            self.gpu_compute_caps = []
            self.gpu_vram_gb = [] # must be initialized here, the loop below only appends
            for gpu_idx in self.gpu_idxs:
                self.gpu_names += [devicelib.getDeviceName(gpu_idx)]
                self.gpu_compute_caps += [ devicelib.getDeviceComputeCapability ( gpu_idx ) ]
                self.gpu_vram_gb += [ devicelib.getDeviceVRAMTotalGb ( gpu_idx ) ]

    @staticmethod
    def getDevicesWithAtLeastTotalMemoryGB(totalmemsize_gb):
        if not hasNVML:
            return [0]

        result = []
        for i in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(i)
            memInfo = nvmlDeviceGetMemoryInfo( handle )
            if memInfo.total >= totalmemsize_gb*1024*1024*1024:
                result.append (i)
        return result

    @staticmethod
    def getAllDevicesIdxsList():
        if not hasNVML:
            return [0]

        return [ i for i in range(0, nvmlDeviceGetCount() ) ]

    @staticmethod
    def getAllDevicesIdxsWithNamesList():
        if not hasNVML:
            return [ (0, devicelib.getDeviceName(0) ) ]

        return [ (i, nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() ) for i in range(nvmlDeviceGetCount() ) ]

    @staticmethod
    def getDeviceVRAMFree (idx):
        if not hasNVML:
            return 2

        if idx < nvmlDeviceGetCount():
            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(idx) )
            return memInfo.total - memInfo.used

        return 0

    @staticmethod
    def getDeviceVRAMTotalGb (idx):
        if not hasNVML:
            return 2

        if idx < nvmlDeviceGetCount():
            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(idx) )
            return round ( memInfo.total / (1024*1024*1024) )

        return 0

    @staticmethod
    def getBestDeviceIdx():
        if not hasNVML:
            return 0

        idx = -1
        idx_mem = 0
        for i in range( nvmlDeviceGetCount() ):
            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
            if memInfo.total > idx_mem:
                idx = i
                idx_mem = memInfo.total

        return idx

    @staticmethod
    def getWorstDeviceIdx():
        if not hasNVML:
            return 0

        idx = -1
        idx_mem = sys.maxsize
        for i in range( nvmlDeviceGetCount() ):
            memInfo = nvmlDeviceGetMemoryInfo( nvmlDeviceGetHandleByIndex(i) )
            if memInfo.total < idx_mem:
                idx = i
                idx_mem = memInfo.total

        return idx

    @staticmethod
    def isValidDeviceIdx(idx):
        if not hasNVML:
            return (idx == 0)

        return (idx < nvmlDeviceGetCount())

    @staticmethod
    def getDeviceIdxsEqualModel(idx):
        if not hasNVML:
            return [0] if idx == 0 else []

        result = []
        idx_name = nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()
        for i in range( nvmlDeviceGetCount() ):
            if nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)).decode() == idx_name:
                result.append (i)

        return result

    @staticmethod
    def getDeviceName (idx):
        if not hasNVML:
            return 'Generic GeForce GPU'

        if idx < nvmlDeviceGetCount():
            return nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(idx)).decode()

        return None

    @staticmethod
    def getDeviceComputeCapability(idx):
        if not hasNVML:
            return 99 if idx == 0 else 0

        if idx < nvmlDeviceGetCount():
            result = nvmlDeviceGetCudaComputeCapability(nvmlDeviceGetHandleByIndex(idx))
            return result[0] * 10 + result[1]
        return 0
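The selection helpers above reduce to a single NVML pass: enumerate devices, read nvmlDeviceGetMemoryInfo, and keep the index with the largest (or smallest) total VRAM. A minimal standalone sketch of the same logic, assuming the pynvml package that provides these bindings is installed (names below are illustrative, not part of device.py):

import pynvml

pynvml.nvmlInit()
try:
    # mirrors getBestDeviceIdx(): prefer the GPU with the most total VRAM
    best_idx, best_total = -1, 0
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print("GPU %d: %s, %.1f GB" % (i, pynvml.nvmlDeviceGetName(handle).decode(), mem.total / 1024**3))
        if mem.total > best_total:
            best_idx, best_total = i, mem.total
    print("best device idx:", best_idx)
finally:
    pynvml.nvmlShutdown()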
554  nnlib/nnlib.py
@@ -4,66 +4,37 @@ import contextlib
import numpy as np

from utils import std_utils
from .devicelib import devicelib
from .device import device

class nnlib(object):
    device = devicelib #forwards nnlib.devicelib to device in order to use nnlib as standalone lib
    DeviceConfig = devicelib.Config
    device = device #forwards nnlib.devicelib to device in order to use nnlib as standalone lib
    DeviceConfig = device.Config
    active_DeviceConfig = DeviceConfig() #default is one best GPU

    dlib = None

    keras = None
    keras_contrib = None

    tf = None
    tf_sess = None

    code_import_tf = None
    PML = None
    PMLK = None
    PMLTile = None

    code_import_keras = None
    code_import_keras_contrib = None
    code_import_all = None

    code_import_dlib = None

    tf_dssim = None
    tf_ssim = None
    tf_resize_like = None
    tf_image_histogram = None
    tf_rgb_to_lab = None
    tf_lab_to_rgb = None
    tf_adain = None
    tf_gaussian_blur = None
    tf_style_loss = None

    modelify = None
    ReflectionPadding2D = None
    DSSIMLoss = None
    DSSIMMSEMaskLoss = None
    PixelShuffler = None
    SubpixelUpscaler = None
    AddUniformNoise = None

    ResNet = None
    UNet = None
    UNetTemporalPredictor = None
    NLayerDiscriminator = None

    code_import_tf_string = \
"""
tf = nnlib.tf
tf_sess = nnlib.tf_sess

tf_reduce_mean = tf.reduce_mean # todo tf 12+ = tf.math.reduce_mean
tf_total_variation = tf.image.total_variation
tf_dssim = nnlib.tf_dssim
tf_ssim = nnlib.tf_ssim
tf_resize_like = nnlib.tf_resize_like
tf_image_histogram = nnlib.tf_image_histogram
tf_rgb_to_lab = nnlib.tf_rgb_to_lab
tf_lab_to_rgb = nnlib.tf_lab_to_rgb
tf_adain = nnlib.tf_adain
tf_gaussian_blur = nnlib.tf_gaussian_blur
tf_style_loss = nnlib.tf_style_loss
"""
    code_import_keras_string = \
"""
keras = nnlib.keras

@@ -81,9 +52,11 @@ BatchNormalization = keras.layers.BatchNormalization
LeakyReLU = keras.layers.LeakyReLU
ReLU = keras.layers.ReLU
PReLU = keras.layers.PReLU
tanh = keras.layers.Activation('tanh')
sigmoid = keras.layers.Activation('sigmoid')
Dropout = keras.layers.Dropout
Softmax = keras.layers.Softmax

Lambda = keras.layers.Lambda
Add = keras.layers.Add

@@ -100,12 +73,14 @@ Model = keras.models.Model
Adam = keras.optimizers.Adam

modelify = nnlib.modelify
ReflectionPadding2D = nnlib.ReflectionPadding2D
DSSIMLoss = nnlib.DSSIMLoss
DSSIMMSEMaskLoss = nnlib.DSSIMMSEMaskLoss
gaussian_blur = nnlib.gaussian_blur
style_loss = nnlib.style_loss
dssim = nnlib.dssim

#ReflectionPadding2D = nnlib.ReflectionPadding2D
PixelShuffler = nnlib.PixelShuffler
SubpixelUpscaler = nnlib.SubpixelUpscaler
AddUniformNoise = nnlib.AddUniformNoise
#AddUniformNoise = nnlib.AddUniformNoise
"""
    code_import_keras_contrib_string = \
"""
@@ -113,7 +88,6 @@ keras_contrib = nnlib.keras_contrib
GroupNormalization = keras_contrib.layers.GroupNormalization
InstanceNormalization = keras_contrib.layers.InstanceNormalization
Padam = keras_contrib.optimizers.Padam
PELU = keras_contrib.layers.advanced_activations.PELU
"""
    code_import_dlib_string = \
"""
@@ -122,6 +96,7 @@ dlib = nnlib.dlib

    code_import_all_string = \
"""
DSSIMMSEMaskLoss = nnlib.DSSIMMSEMaskLoss
ResNet = nnlib.ResNet
UNet = nnlib.UNet
UNetTemporalPredictor = nnlib.UNetTemporalPredictor

@@ -130,7 +105,7 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator

    @staticmethod
    def import_tf(device_config = None):
    def _import_tf(device_config):
        if nnlib.tf is not None:
            return nnlib.code_import_tf

@@ -147,34 +122,18 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
        import tensorflow as tf
        nnlib.tf = tf

        if device_config is None:
            device_config = nnlib.active_DeviceConfig

        tf_ver = [int(x) for x in tf.VERSION.split('.')]
        req_cap = 35
        if tf_ver[0] > 1 or (tf_ver[0] == 1 and tf_ver[1] >= 11):
            req_cap = 37

        if not device_config.cpu_only and device_config.gpu_compute_caps[0] < req_cap:
            if suppressor is not None:
                suppressor.__exit__()

            print ("%s does not meet minimum required compute capability: %d.%d. Falling back to CPU mode." % ( device_config.gpu_names[0], req_cap // 10, req_cap % 10 ) )
            device_config = nnlib.DeviceConfig(cpu_only=True)

            if suppressor is not None:
                suppressor.__enter__()

        nnlib.active_DeviceConfig = device_config

        if device_config.cpu_only:
            config = tf.ConfigProto( device_count = {'GPU': 0} )
            config = tf.ConfigProto(device_count={'GPU': 0})
        else:
            config = tf.ConfigProto()
            visible_device_list = ''
            for idx in device_config.gpu_idxs:
                visible_device_list += str(idx) + ','
            config.gpu_options.visible_device_list=visible_device_list[:-1]

            if device_config.backend != "tensorflow-generic":
                #tensorflow-generic means a system with an NVIDIA card but without NVSMI,
                #so don't hide devices and let tensorflow choose the best card itself
                visible_device_list = ''
                for idx in device_config.gpu_idxs:
                    visible_device_list += str(idx) + ','
                config.gpu_options.visible_device_list=visible_device_list[:-1]

            config.gpu_options.force_gpu_compatible = True
            config.gpu_options.allow_growth = device_config.allow_growth

@@ -184,226 +143,42 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
        if suppressor is not None:
            suppressor.__exit__()

        nnlib.__initialize_tf_functions()
        nnlib.code_import_tf = compile (nnlib.code_import_tf_string,'','exec')
        return nnlib.code_import_tf
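# Note: compile()/exec() is the injection mechanism used throughout this class.
# A hedged consumer sketch; apart from nnlib, import_all and the injected aliases
# shown in the strings above, the names are illustrative:
#
#   from nnlib import nnlib
#   exec(nnlib.import_all(), locals(), globals())  # binds keras, Adam, Model, modelify, ... into this scope
#   opt = Adam(lr=5e-5)                            # Adam was injected by code_import_keras_string
#
# This is how the model files pick up backend-appropriate aliases without importing keras themselves.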
    @staticmethod
    def __initialize_tf_functions():
        tf = nnlib.tf

        def tf_dssim_(max_value=1.0):
            def func(t1,t2):
                return (1.0 - tf.image.ssim (t1, t2, max_value)) / 2.0
            return func
        nnlib.tf_dssim = tf_dssim_

        def tf_ssim_(max_value=1.0):
            def func(t1,t2):
                return tf.image.ssim (t1, t2, max_value)
            return func
        nnlib.tf_ssim = tf_ssim_

        def tf_resize_like_(ref_tensor):
            def func(input_tensor):
                H, W = ref_tensor.get_shape()[1], ref_tensor.get_shape()[2]
                return tf.image.resize_bilinear(input_tensor, [H.value, W.value])
            return func
        nnlib.tf_resize_like = tf_resize_like_

        def tf_rgb_to_lab():
            def func(rgb_input):
                with tf.name_scope("rgb_to_lab"):
                    srgb_pixels = tf.reshape(rgb_input, [-1, 3])

                    with tf.name_scope("srgb_to_xyz"):
                        linear_mask = tf.cast(srgb_pixels <= 0.04045, dtype=tf.float32)
                        exponential_mask = tf.cast(srgb_pixels > 0.04045, dtype=tf.float32)
                        rgb_pixels = (srgb_pixels / 12.92 * linear_mask) + (((srgb_pixels + 0.055) / 1.055) ** 2.4) * exponential_mask
                        rgb_to_xyz = tf.constant([
                            #    X         Y         Z
                            [0.412453, 0.212671, 0.019334], # R
                            [0.357580, 0.715160, 0.119193], # G
                            [0.180423, 0.072169, 0.950227], # B
                        ])
                        xyz_pixels = tf.matmul(rgb_pixels, rgb_to_xyz)

                    # https://en.wikipedia.org/wiki/Lab_color_space#CIELAB-CIEXYZ_conversions
                    with tf.name_scope("xyz_to_cielab"):
                        # convert to fx = f(X/Xn), fy = f(Y/Yn), fz = f(Z/Zn)

                        # normalize for D65 white point
                        xyz_normalized_pixels = tf.multiply(xyz_pixels, [1/0.950456, 1.0, 1/1.088754])

                        epsilon = 6/29
                        linear_mask = tf.cast(xyz_normalized_pixels <= (epsilon**3), dtype=tf.float32)
                        exponential_mask = tf.cast(xyz_normalized_pixels > (epsilon**3), dtype=tf.float32)
                        fxfyfz_pixels = (xyz_normalized_pixels / (3 * epsilon**2) + 4/29) * linear_mask + (xyz_normalized_pixels ** (1/3)) * exponential_mask

                        # convert to lab
                        fxfyfz_to_lab = tf.constant([
                            #   l       a       b
                            [  0.0,  500.0,    0.0], # fx
                            [116.0, -500.0,  200.0], # fy
                            [  0.0,    0.0, -200.0], # fz
                        ])
                        lab_pixels = tf.matmul(fxfyfz_pixels, fxfyfz_to_lab) + tf.constant([-16.0, 0.0, 0.0])
                    return tf.reshape(lab_pixels, tf.shape(rgb_input))
            return func
        nnlib.tf_rgb_to_lab = tf_rgb_to_lab

        def tf_lab_to_rgb():
            def func(lab):
                with tf.name_scope("lab_to_rgb"):
                    lab_pixels = tf.reshape(lab, [-1, 3])

                    # https://en.wikipedia.org/wiki/Lab_color_space#CIELAB-CIEXYZ_conversions
                    with tf.name_scope("cielab_to_xyz"):
                        # convert to fxfyfz
                        lab_to_fxfyfz = tf.constant([
                            #     fx       fy        fz
                            [1/116.0, 1/116.0,  1/116.0], # l
                            [1/500.0,     0.0,      0.0], # a
                            [    0.0,     0.0, -1/200.0], # b
                        ])
                        fxfyfz_pixels = tf.matmul(lab_pixels + tf.constant([16.0, 0.0, 0.0]), lab_to_fxfyfz)

                        # convert to xyz
                        epsilon = 6/29
                        linear_mask = tf.cast(fxfyfz_pixels <= epsilon, dtype=tf.float32)
                        exponential_mask = tf.cast(fxfyfz_pixels > epsilon, dtype=tf.float32)
                        xyz_pixels = (3 * epsilon**2 * (fxfyfz_pixels - 4/29)) * linear_mask + (fxfyfz_pixels ** 3) * exponential_mask

                        # denormalize for D65 white point
                        xyz_pixels = tf.multiply(xyz_pixels, [0.950456, 1.0, 1.088754])

                    with tf.name_scope("xyz_to_srgb"):
                        xyz_to_rgb = tf.constant([
                            #      r           g           b
                            [ 3.2404542, -0.9692660,  0.0556434], # x
                            [-1.5371385,  1.8760108, -0.2040259], # y
                            [-0.4985314,  0.0415560,  1.0572252], # z
                        ])
                        rgb_pixels = tf.matmul(xyz_pixels, xyz_to_rgb)
                        # avoid a slightly negative number messing up the conversion
                        rgb_pixels = tf.clip_by_value(rgb_pixels, 0.0, 1.0)
                        linear_mask = tf.cast(rgb_pixels <= 0.0031308, dtype=tf.float32)
                        exponential_mask = tf.cast(rgb_pixels > 0.0031308, dtype=tf.float32)
                        srgb_pixels = (rgb_pixels * 12.92 * linear_mask) + ((rgb_pixels ** (1/2.4) * 1.055) - 0.055) * exponential_mask

                    return tf.reshape(srgb_pixels, tf.shape(lab))
            return func
        nnlib.tf_lab_to_rgb = tf_lab_to_rgb
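# Sanity check for the two converters above (a sketch; assumes the TF1-style session
# this module creates elsewhere). Mid-gray sRGB should map to L ~= 53.4, a = b = 0,
# and the round trip should reproduce the input for in-gamut values:
#
#   rgb = tf.constant([[[0.5, 0.5, 0.5]]], dtype=tf.float32)
#   lab = tf_rgb_to_lab()(rgb)
#   rgb_back = tf_lab_to_rgb()(lab)
#   # tf_sess.run([lab, rgb_back]) -> approx [[[53.4, 0.0, 0.0]]] and the original input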
        def tf_image_histogram():
            def func(input):
                x = input
                x += 1 / 255.0

                output = []
                for i in range(256, 0, -1):
                    v = i / 255.0
                    y = (x - v) * 1000

                    y = tf.clip_by_value (y, -1.0, 0.0) + 1

                    output.append ( tf.reduce_sum (y) )
                    x -= y*v

                return tf.stack ( output[::-1] )
            return func
        nnlib.tf_image_histogram = tf_image_histogram
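# The loop above is a differentiable 256-bin histogram: for each threshold v the
# ramp (x - v) * 1000 is clipped into a soft step that is ~1 where x >= v, those
# pixels are counted with reduce_sum, and then subtracted out of x so they are not
# re-counted at lower thresholds. The same trick in NumPy (sketch):
#
#   import numpy as np
#   def np_image_histogram(x):
#       x = x + 1 / 255.0
#       output = []
#       for i in range(256, 0, -1):
#           v = i / 255.0
#           y = np.clip((x - v) * 1000, -1.0, 0.0) + 1   # ~1 where x >= v
#           output.append(y.sum())
#           x = x - y * v                                # remove counted pixels
#       return np.stack(output[::-1])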
        def tf_adain(epsilon=1e-5):
            def func(content, style):
                axes = [1,2]
                c_mean, c_var = tf.nn.moments(content, axes=axes, keep_dims=True)
                s_mean, s_var = tf.nn.moments(style, axes=axes, keep_dims=True)
                c_std, s_std = tf.sqrt(c_var + epsilon), tf.sqrt(s_var + epsilon)
                return s_std * (content - c_mean) / c_std + s_mean
            return func
        nnlib.tf_adain = tf_adain
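# tf_adain re-normalizes the per-channel spatial statistics of content to match
# style (the AdaIN operation of Huang & Belongie). A NumPy sketch of the invariant
# it should satisfy (illustrative only):
#
#   import numpy as np
#   content = np.random.rand(1, 8, 8, 3).astype(np.float32)
#   style   = np.random.rand(1, 8, 8, 3).astype(np.float32)
#   c_mean, c_std = content.mean((1, 2), keepdims=True), content.std((1, 2), keepdims=True)
#   s_mean, s_std = style.mean((1, 2), keepdims=True), style.std((1, 2), keepdims=True)
#   out = s_std * (content - c_mean) / c_std + s_mean
#   assert np.allclose(out.mean((1, 2)), np.squeeze(s_mean, (1, 2)), atol=1e-4)  # style stats transferred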
        def tf_gaussian_blur(radius=2.0):
            def gaussian_kernel(size,mean,std):
                d = tf.distributions.Normal( float(mean), float(std) )

                vals = d.prob(tf.range(start = -int(size), limit = int(size) + 1, dtype = tf.float32))

                gauss_kernel = tf.einsum('i,j->ij',
                                         vals,
                                         vals)

                return gauss_kernel / tf.reduce_sum(gauss_kernel)

            gauss_kernel = gaussian_kernel(radius, 1.0, radius )
            gauss_kernel = gauss_kernel[:, :, tf.newaxis, tf.newaxis]

            def func(input):
                input_nc = input.get_shape().as_list()[-1]
                inputs = tf.split(input, input_nc, -1)

                outputs = []
                for i in range(len(inputs)):
                    outputs += [ tf.nn.conv2d( inputs[i] , gauss_kernel, strides=[1, 1, 1, 1], padding="SAME") ]

                return tf.concat (outputs, axis=-1)
            return func
        nnlib.tf_gaussian_blur = tf_gaussian_blur
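# The kernel is a normalized outer product of 1-D Normal densities, so blurring
# preserves overall brightness. A NumPy check of that property (sketch, assuming
# SciPy is available; note the source passes mean=1.0, which appears to bias the
# kernel peak one pixel off-center):
#
#   import numpy as np
#   from scipy.stats import norm
#   def np_gauss_kernel(radius):
#       vals = norm.pdf(np.arange(-int(radius), int(radius) + 1), loc=1.0, scale=radius)
#       k = np.outer(vals, vals)
#       return k / k.sum()
#   assert abs(np_gauss_kernel(2.0).sum() - 1.0) < 1e-6   # blur preserves total energy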
        #any channel count style diff
        #outputs 0.0 .. 1.0 style difference*loss_weight , 0.0 - no diff
        def tf_style_loss(gaussian_blur_radius=0.0, loss_weight=1.0, batch_normalize=False, epsilon=1e-5):
            gblur = tf_gaussian_blur(gaussian_blur_radius)

            def sd(content, style):
                content_nc = content.get_shape().as_list()[-1]
                style_nc = style.get_shape().as_list()[-1]
                if content_nc != style_nc:
                    raise Exception("tf_style_loss() content_nc != style_nc")

                axes = [1,2]
                c_mean, c_var = tf.nn.moments(content, axes=axes, keep_dims=True)
                s_mean, s_var = tf.nn.moments(style, axes=axes, keep_dims=True)
                c_std, s_std = tf.sqrt(c_var + epsilon), tf.sqrt(s_var + epsilon)

                mean_loss = tf.reduce_sum(tf.squared_difference(c_mean, s_mean))
                std_loss = tf.reduce_sum(tf.squared_difference(c_std, s_std))

                if batch_normalize:
                    #normalize w.r.t batch size
                    n = tf.cast(tf.shape(content)[0], dtype=tf.float32)
                    mean_loss /= n
                    std_loss /= n

                return (mean_loss + std_loss) * loss_weight

            def func(target, style):
                if gaussian_blur_radius > 0.0:
                    return sd( gblur(target), gblur(style))
                else:
                    return sd( target, style )
            return func

        nnlib.tf_style_loss = tf_style_loss
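# The style loss is a statistic distance: squared differences of per-channel spatial
# means and standard deviations, summed and scaled by loss_weight. Equivalent NumPy
# (sketch, for channels-last arrays):
#
#   import numpy as np
#   def np_style_loss(content, style, loss_weight=1.0, eps=1e-5):
#       c_mean, c_var = content.mean((1, 2), keepdims=True), content.var((1, 2), keepdims=True)
#       s_mean, s_var = style.mean((1, 2), keepdims=True), style.var((1, 2), keepdims=True)
#       c_std, s_std = np.sqrt(c_var + eps), np.sqrt(s_var + eps)
#       return (np.sum((c_mean - s_mean) ** 2) + np.sum((c_std - s_std) ** 2)) * loss_weight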
    @staticmethod
    def import_keras(device_config = None):
        if nnlib.keras is not None:
            return nnlib.code_import_keras

        nnlib.import_tf(device_config)
        device_config = nnlib.active_DeviceConfig
        if device_config is None:
            device_config = nnlib.active_DeviceConfig

        nnlib.active_DeviceConfig = device_config

        if "tensorflow" in device_config.backend:
            nnlib._import_tf(device_config)
            device_config = nnlib.active_DeviceConfig
        elif device_config.backend == "plaidML":
            os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
            os.environ["PLAIDML_DEVICE_IDS"] = ",".join ( [ nnlib.device.getDeviceID(idx) for idx in device_config.gpu_idxs] )

        if 'TF_SUPPRESS_STD' in os.environ.keys() and os.environ['TF_SUPPRESS_STD'] == '1':
            suppressor = std_utils.suppress_stdout_stderr().__enter__()

        import keras as keras_
        nnlib.keras = keras_

        if device_config.backend == "plaidML":
            import plaidml
            import plaidml.tile
            nnlib.PML = plaidml
            nnlib.PMLK = plaidml.keras.backend
            nnlib.PMLTile = plaidml.tile

        if device_config.use_fp16:
            nnlib.keras.backend.set_floatx('float16')

        nnlib.keras.backend.set_session(nnlib.tf_sess)
        if "tensorflow" in device_config.backend:
            nnlib.keras.backend.set_session(nnlib.tf_sess)

        nnlib.keras.backend.set_image_data_format('channels_last')

        if 'TF_SUPPRESS_STD' in os.environ.keys() and os.environ['TF_SUPPRESS_STD'] == '1':

@@ -411,14 +186,12 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator

        nnlib.__initialize_keras_functions()
        nnlib.code_import_keras = compile (nnlib.code_import_keras_string,'','exec')

        return nnlib.code_import_keras

    @staticmethod
    def __initialize_keras_functions():
        tf = nnlib.tf
        keras = nnlib.keras
        K = keras.backend
        exec (nnlib.code_import_tf, locals(), globals())

        def modelify(model_functor):
            def func(tensor):

@@ -427,68 +200,172 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator

        nnlib.modelify = modelify

        class ReflectionPadding2D(keras.layers.Layer):
            def __init__(self, padding=(1, 1), **kwargs):
                self.padding = tuple(padding)
                self.input_spec = [keras.layers.InputSpec(ndim=4)]
                super(ReflectionPadding2D, self).__init__(**kwargs)

            def compute_output_shape(self, s):
                """ If you are using "channels_last" configuration"""
                return (s[0], s[1] + 2 * self.padding[0], s[2] + 2 * self.padding[1], s[3])

            def call(self, x, mask=None):
                w_pad,h_pad = self.padding
                return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0] ], 'REFLECT')
        nnlib.ReflectionPadding2D = ReflectionPadding2D

        class DSSIMLoss(object):
            def __init__(self, is_tanh=False):
                self.is_tanh = is_tanh

            def __call__(self,y_true, y_pred):
                if not self.is_tanh:
                    return (1.0 - tf.image.ssim (y_true, y_pred, 1.0)) / 2.0
                else:
                    return (1.0 - tf.image.ssim ((y_true/2+0.5), (y_pred/2+0.5), 1.0)) / 2.0
        nnlib.DSSIMLoss = DSSIMLoss

        class DSSIMMSEMaskLoss(object):
            def __init__(self, mask, is_mse=False):
                self.mask = mask
                self.is_mse = is_mse

            def __call__(self,y_true, y_pred):
                total_loss = None
                mask = self.mask
                if self.is_mse:
                    blur_mask = tf_gaussian_blur(max(1, mask.get_shape().as_list()[1] // 32))(mask)
                    return K.mean ( 100*K.square( y_true*blur_mask - y_pred*blur_mask ) )
                else:
                    return (1.0 - (tf.image.ssim (y_true*mask, y_pred*mask, 1.0))) / 2.0
        nnlib.DSSIMMSEMaskLoss = DSSIMMSEMaskLoss

        def gaussian_blur(radius=2.0):
            def gaussian(x, mu, sigma):
                return np.exp(-(float(x) - float(mu)) ** 2 / (2 * sigma ** 2))

            def make_kernel(sigma):
                kernel_size = max(3, int(2 * 2 * sigma + 1))
                mean = np.floor(0.5 * kernel_size)
                kernel_1d = np.array([gaussian(x, mean, sigma) for x in range(kernel_size)])
                np_kernel = np.outer(kernel_1d, kernel_1d).astype(dtype=K.floatx())
                kernel = np_kernel / np.sum(np_kernel)
                return kernel

            gauss_kernel = make_kernel(radius)
            gauss_kernel = gauss_kernel[:, :,np.newaxis, np.newaxis]

            def func(input):
                inputs = [ input[:,:,:,i:i+1] for i in range( K.int_shape( input )[-1] ) ]

                outputs = []
                for i in range(len(inputs)):
                    outputs += [ K.conv2d( inputs[i] , K.constant(gauss_kernel) , strides=(1,1), padding="same") ]

                return K.concatenate (outputs, axis=-1)
            return func
        nnlib.gaussian_blur = gaussian_blur

        def style_loss(gaussian_blur_radius=0.0, loss_weight=1.0, wnd_size=0, step_size=1):
            if gaussian_blur_radius > 0.0:
                gblur = gaussian_blur(gaussian_blur_radius)

            def sd(content, style, loss_weight):
                content_nc = K.int_shape(content)[-1]
                style_nc = K.int_shape(style)[-1]
                if content_nc != style_nc:
                    raise Exception("style_loss() content_nc != style_nc")

                axes = [1,2]
                c_mean, c_var = K.mean(content, axis=axes, keepdims=True), K.var(content, axis=axes, keepdims=True)
                s_mean, s_var = K.mean(style, axis=axes, keepdims=True), K.var(style, axis=axes, keepdims=True)
                c_std, s_std = K.sqrt(c_var + 1e-5), K.sqrt(s_var + 1e-5)

                mean_loss = K.sum(K.square(c_mean-s_mean))
                std_loss = K.sum(K.square(c_std-s_std))

                return (mean_loss + std_loss) * ( loss_weight / float(content_nc) )

            def func(target, style):
                if wnd_size == 0:
                    if gaussian_blur_radius > 0.0:
                        return sd( gblur(target), gblur(style), loss_weight=loss_weight)
                    else:
                        return sd( target, style, loss_weight=loss_weight )
                else:
                    #currently unused
                    if nnlib.tf is not None:
                        sh = K.int_shape(target)[1]
                        k = (sh-wnd_size) // step_size + 1
                        if gaussian_blur_radius > 0.0:
                            target, style = gblur(target), gblur(style)
                        target = nnlib.tf.image.extract_image_patches(target, [1,k,k,1], [1,1,1,1], [1,step_size,step_size,1], 'VALID')
                        style = nnlib.tf.image.extract_image_patches(style, [1,k,k,1], [1,1,1,1], [1,step_size,step_size,1], 'VALID')
                        return sd( target, style, loss_weight )
                    if nnlib.PML is not None:
                        print ("Sorry, plaidML backend does not support style_loss")
                        return 0
            return func
        nnlib.style_loss = style_loss

        def dssim(k1=0.01, k2=0.03, max_value=1.0):
            # port of tf.image.ssim to pure keras in order to work on plaidML backend.

            def func(y_true, y_pred):
                ch = K.int_shape(y_pred)[-1]

                def softmax(x, axis=-1): #from K numpy backend
                    y = np.exp(x - np.max(x, axis, keepdims=True))
                    return y / np.sum(y, axis, keepdims=True)

                def gauss_kernel(size, sigma):
                    coords = np.arange(0,size, dtype=K.floatx() )
                    coords -= (size - 1 ) / 2.0
                    g = coords**2
                    g *= ( -0.5 / (sigma**2) )
                    g = np.reshape (g, (1,-1)) + np.reshape(g, (-1,1) )
                    g = np.reshape (g, (1,-1))
                    g = softmax(g)
                    g = np.reshape (g, (size, size, 1, 1))
                    g = np.tile (g, (1,1,ch,1))
                    return K.constant(g, dtype=K.floatx() )

                kernel = gauss_kernel(11,1.5)

                def reducer(x):
                    shape = K.shape(x)
                    x = K.reshape(x, (-1, shape[-3] , shape[-2], shape[-1]) )
                    y = K.depthwise_conv2d(x, kernel, strides=(1, 1), padding='valid')
                    y_shape = K.shape(y)
                    return K.reshape(y, (shape[0], y_shape[1], y_shape[2], y_shape[3] ) )

                def _ssim_helper(x, y, reducer, compensation=1.0):
                    c1 = (k1 * max_value) ** 2
                    c2 = (k2 * max_value) ** 2

                    mean0 = reducer(x)
                    mean1 = reducer(y)
                    num0 = mean0 * mean1 * 2.0
                    den0 = K.square(mean0) + K.square(mean1)
                    luminance = (num0 + c1) / (den0 + c1)

                    num1 = reducer(x * y) * 2.0
                    den1 = reducer(K.square(x) + K.square(y))
                    c2 *= compensation
                    cs = (num1 - num0 + c2) / (den1 - den0 + c2)

                    return luminance, cs

                luminance, cs = _ssim_helper(y_true, y_pred, reducer)
                ssim_val = K.mean(luminance * cs, axis=(-3, -2) )
                return K.mean( (1.0 - ssim_val ) / 2.0 )

            return func
        nnlib.dssim = dssim
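# Like tf.image.ssim, the port uses an 11x11 Gaussian window (sigma 1.5) and the
# standard stabilizers c1 = (k1*max_value)^2 and c2 = (k2*max_value)^2; DSSIM is
# then (1 - SSIM) / 2, so 0 means identical images. A hedged usage sketch as a
# Keras loss (`model` is illustrative, not part of nnlib):
#
#   model.compile(optimizer=Adam(lr=5e-5), loss=nnlib.dssim())
#   # or blended with an L1 term:
#   # loss = lambda t, p: nnlib.dssim()(t, p) + 0.1 * K.mean(K.abs(t - p))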
        class PixelShuffler(keras.layers.Layer):
            def __init__(self, size=(2, 2), data_format=None, **kwargs):
                super(PixelShuffler, self).__init__(**kwargs)
                self.data_format = keras.backend.common.normalize_data_format(data_format)
                self.data_format = K.normalize_data_format(data_format)
                self.size = keras.utils.conv_utils.normalize_tuple(size, 2, 'size')

            def call(self, inputs):
                input_shape = keras.backend.int_shape(inputs)
                input_shape = K.int_shape(inputs)
                if len(input_shape) != 4:
                    raise ValueError('Inputs should have rank ' +
                                     str(4) +
                                     '; Received input shape:', str(input_shape))

                if self.data_format == 'channels_first':
                    return tf.depth_to_space(inputs, self.size[0], 'NCHW')
                    batch_size, c, h, w = input_shape
                    if batch_size is None:
                        batch_size = -1
                    rh, rw = self.size
                    oh, ow = h * rh, w * rw
                    oc = c // (rh * rw)

                    out = K.reshape(inputs, (batch_size, rh, rw, oc, h, w))
                    out = K.permute_dimensions(out, (0, 3, 4, 1, 5, 2))
                    out = K.reshape(out, (batch_size, oc, oh, ow))
                    return out

                elif self.data_format == 'channels_last':
                    return tf.depth_to_space(inputs, self.size[0], 'NHWC')
                    batch_size, h, w, c = input_shape
                    if batch_size is None:
                        batch_size = -1
                    rh, rw = self.size
                    oh, ow = h * rh, w * rw
                    oc = c // (rh * rw)

                    out = K.reshape(inputs, (batch_size, h, w, rh, rw, oc))
                    out = K.permute_dimensions(out, (0, 1, 3, 2, 4, 5))
                    out = K.reshape(out, (batch_size, oh, ow, oc))
                    return out

            def compute_output_shape(self, input_shape):

                if len(input_shape) != 4:
                    raise ValueError('Inputs should have rank ' +
                                     str(4) +

@@ -529,6 +406,23 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator

        nnlib.PixelShuffler = PixelShuffler
        nnlib.SubpixelUpscaler = PixelShuffler
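# PixelShuffler is a backend-agnostic depth_to_space (Shi et al.'s subpixel
# upscaling): channels are traded for resolution via reshape -> permute -> reshape.
# The same trick in NumPy for channels_last (sketch):
#
#   import numpy as np
#   def np_pixel_shuffle(x, r=2):                # x: (n, h, w, c), c divisible by r*r
#       n, h, w, c = x.shape
#       oc = c // (r * r)
#       out = x.reshape(n, h, w, r, r, oc)
#       out = out.transpose(0, 1, 3, 2, 4, 5)    # interleave the r factors with h and w
#       return out.reshape(n, h * r, w * r, oc)
#   assert np_pixel_shuffle(np.zeros((1, 4, 4, 12))).shape == (1, 8, 8, 3)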
        '''
        class ReflectionPadding2D(keras.layers.Layer):
            def __init__(self, padding=(1, 1), **kwargs):
                self.padding = tuple(padding)
                self.input_spec = [keras.layers.InputSpec(ndim=4)]
                super(ReflectionPadding2D, self).__init__(**kwargs)

            def compute_output_shape(self, s):
                """ If you are using "channels_last" configuration"""
                return (s[0], s[1] + 2 * self.padding[0], s[2] + 2 * self.padding[1], s[3])

            def call(self, x, mask=None):
                w_pad,h_pad = self.padding
                return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0] ], 'REFLECT')
        nnlib.ReflectionPadding2D = ReflectionPadding2D

        class AddUniformNoise(keras.layers.Layer):
            def __init__(self, power=1.0, minval=-1.0, maxval=1.0, **kwargs):

@@ -548,7 +442,7 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
                base_config = super(AddUniformNoise, self).get_config()
                return dict(list(base_config.items()) + list(config.items()))
        nnlib.AddUniformNoise = AddUniformNoise

        '''
    @staticmethod
    def import_keras_contrib(device_config = None):
        if nnlib.keras_contrib is not None:

@@ -570,20 +464,17 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator

        import dlib as dlib_
        nnlib.dlib = dlib_
        if not device_config.cpu_only and len(device_config.gpu_idxs) > 0:
        if not device_config.cpu_only and "tensorflow" in device_config.backend and len(device_config.gpu_idxs) > 0:
            nnlib.dlib.cuda.set_device(device_config.gpu_idxs[0])

        nnlib.code_import_dlib = compile (nnlib.code_import_dlib_string,'','exec')

    @staticmethod
    def import_all(device_config = None):
        if nnlib.code_import_all is None:
            nnlib.import_tf(device_config)
            nnlib.import_keras(device_config)
            nnlib.import_keras_contrib(device_config)
            nnlib.code_import_all = compile (nnlib.code_import_tf_string + '\n'
                                             + nnlib.code_import_keras_string + '\n'
            nnlib.code_import_all = compile (nnlib.code_import_keras_string + '\n'
                                             + nnlib.code_import_keras_contrib_string
                                             + nnlib.code_import_all_string,'','exec')
            nnlib.__initialize_all_functions()

@@ -592,6 +483,24 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator

    @staticmethod
    def __initialize_all_functions():
        exec (nnlib.import_keras(), locals(), globals())
        exec (nnlib.import_keras_contrib(), locals(), globals())

        class DSSIMMSEMaskLoss(object):
            def __init__(self, mask, is_mse=False):
                self.mask = mask
                self.is_mse = is_mse
            def __call__(self,y_true, y_pred):
                total_loss = None
                mask = self.mask
                if self.is_mse:
                    blur_mask = gaussian_blur(max(1, K.int_shape(mask)[1] // 64))(mask)
                    return K.mean ( 50*K.square( y_true*blur_mask - y_pred*blur_mask ) )
                else:
                    return 10*dssim() (y_true*mask, y_pred*mask)
        nnlib.DSSIMMSEMaskLoss = DSSIMMSEMaskLoss
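# The rewritten mask loss keeps the old semantics but is expressed through the
# pure-keras gaussian_blur and dssim above, so it also runs on plaidML. Hedged
# usage sketch (tensor names are illustrative):
#
#   loss = nnlib.DSSIMMSEMaskLoss(target_mask_tensor, is_mse=False)  # DSSIM over masked pixels
#   # model.compile(optimizer=..., loss=loss)  -- the blurred mask weights the compared region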
        '''
        def ResNet(output_nc, use_batch_norm, ngf=64, n_blocks=6, use_dropout=False):
            exec (nnlib.import_all(), locals(), globals())

@@ -775,7 +684,7 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
                return Conv2D( 1, 4, 1, 'valid')(x)
            return func
        nnlib.NLayerDiscriminator = NLayerDiscriminator

        '''
    @staticmethod
    def finalize_all():
        if nnlib.keras_contrib is not None:

@@ -786,7 +695,6 @@ NLayerDiscriminator = nnlib.NLayerDiscriminator
        nnlib.keras = None

        if nnlib.tf is not None:
            nnlib.tf_sess.close()
            nnlib.tf_sess = None
            nnlib.tf = None
@@ -5,6 +5,7 @@ h5py==2.7.1
Keras==2.2.4
opencv-python==4.0.0.21
tensorflow-gpu==1.11.0
plaidml-keras==0.5.0
scikit-image
dlib==19.10.0
tqdm
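With plaidml-keras pinned above, the OpenCL backend is chosen purely through environment variables, exactly as import_keras() does. A quick standalone check (sketch):

import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"   # must be set before keras is first imported

import keras
print(keras.backend.backend())   # expected: plaidml.keras.backend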
@@ -8,29 +8,28 @@ from facelib import FaceType

class SampleProcessor(object):
    class TypeFlags(IntEnum):
        SOURCE                = 0x00000001,
        WARPED                = 0x00000002,
        WARPED_TRANSFORMED    = 0x00000004,
        TRANSFORMED           = 0x00000008,
        LANDMARKS_ARRAY       = 0x00000010, #currently unused

        RANDOM_CLOSE          = 0x00000020,
        MORPH_TO_RANDOM_CLOSE = 0x00000040,

        FACE_ALIGN_HALF       = 0x00000100,
        FACE_ALIGN_FULL       = 0x00000200,
        FACE_ALIGN_HEAD       = 0x00000400,
        FACE_ALIGN_AVATAR     = 0x00000800,

        FACE_MASK_FULL        = 0x00001000,
        FACE_MASK_EYES        = 0x00002000,

        MODE_BGR              = 0x01000000, #BGR
        MODE_G                = 0x02000000, #Grayscale
        MODE_GGG              = 0x04000000, #3xGrayscale
        MODE_M                = 0x08000000, #mask only
        MODE_BGR_SHUFFLE      = 0x10000000, #BGR shuffle
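# Since the TypeFlags values are disjoint bit masks, a sample request combines one
# flag per group with bitwise OR; a hypothetical combination (illustrative only):
#
#   flags = (SampleProcessor.TypeFlags.WARPED_TRANSFORMED
#            | SampleProcessor.TypeFlags.FACE_ALIGN_FULL
#            | SampleProcessor.TypeFlags.MODE_BGR)
#   wants_bgr = (flags & SampleProcessor.TypeFlags.MODE_BGR) != 0   # membership test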
    class Options(object):
        def __init__(self, random_flip = True, normalize_tanh = False, rotation_range=[-10,10], scale_range=[-0.05, 0.05], tx_range=[-0.05, 0.05], ty_range=[-0.05, 0.05]):
@@ -5,7 +5,6 @@ import cv2
import localization
from scipy.spatial import Delaunay
from PIL import Image, ImageDraw, ImageFont
from nnlib import nnlib

def reinhard_color_transfer(target, source, clip=False, preserve_paper=False, source_mask=None, target_mask=None):
    """

@@ -424,23 +423,3 @@ def reduce_colors (img_bgr, n_colors):

    return img_bgr


class TFLabConverter():
    def __init__(self):
        exec (nnlib.import_tf(), locals(), globals())
        self.tf_sess = tf_sess

        self.bgr_input_tensor = tf.placeholder("float", [None, None, 3])
        self.lab_input_tensor = tf.placeholder("float", [None, None, 3])

        self.lab_output_tensor = tf_rgb_to_lab()(self.bgr_input_tensor)
        self.bgr_output_tensor = tf_lab_to_rgb()(self.lab_input_tensor)

    def bgr2lab(self, bgr):
        return self.tf_sess.run(self.lab_output_tensor, feed_dict={self.bgr_input_tensor: bgr})

    def lab2bgr(self, lab):
        return self.tf_sess.run(self.bgr_output_tensor, feed_dict={self.lab_input_tensor: lab})
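For reference, the removed helper wrapped the tf_rgb_to_lab / tf_lab_to_rgb graphs behind a shared session and was used roughly like this (a sketch of the deleted API, not current code):

converter = TFLabConverter()
lab = converter.bgr2lab(img_float)   # img_float: HxWx3 float image in [0, 1]
back = converter.lab2bgr(lab)        # round-trips up to gamut clipping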