diff --git a/core/leras/device.py b/core/leras/device.py
index 4d157f0..2f65382 100644
--- a/core/leras/device.py
+++ b/core/leras/device.py
@@ -1,12 +1,19 @@
 import sys
 import ctypes
 import os
+import multiprocessing
+import json
+import time
+from pathlib import Path
+from core.interact import interact as io
+
 
 class Device(object):
-    def __init__(self, index, name, total_mem, free_mem, cc=0):
+    def __init__(self, index, tf_dev_type, name, total_mem, free_mem):
         self.index = index
+        self.tf_dev_type = tf_dev_type
         self.name = name
-        self.cc = cc
+
         self.total_mem = total_mem
         self.total_mem_gb = total_mem / 1024**3
         self.free_mem = free_mem
@@ -82,12 +89,134 @@ class Devices(object):
             result.append (device)
         return Devices(result)
 
+    @staticmethod
+    def _get_tf_devices_proc(q : multiprocessing.Queue):
+
+        compute_cache_path = Path(os.environ['APPDATA']) / 'NVIDIA' / ('ComputeCache_ALL')
+        os.environ['CUDA_CACHE_PATH'] = str(compute_cache_path)
+        if not compute_cache_path.exists():
+            io.log_info("Caching GPU kernels...")
+            compute_cache_path.mkdir(parents=True, exist_ok=True)
+
+        import tensorflow
+
+        tf_version = tensorflow.version.VERSION
+        #if tf_version is None:
+        #    tf_version = tensorflow.version.GIT_VERSION
+        if tf_version[0] == 'v':
+            tf_version = tf_version[1:]
+        if tf_version[0] == '2':
+            tf = tensorflow.compat.v1
+        else:
+            tf = tensorflow
+
+        import logging
+        # Disable tensorflow warnings
+        tf_logger = logging.getLogger('tensorflow')
+        tf_logger.setLevel(logging.ERROR)
+
+        from tensorflow.python.client import device_lib
+
+        devices = []
+
+        physical_devices = device_lib.list_local_devices()
+        physical_devices_f = {}
+        for dev in physical_devices:
+            dev_type = dev.device_type
+            dev_tf_name = dev.name
+            dev_tf_name = dev_tf_name[ dev_tf_name.index(dev_type) : ]
+
+            dev_idx = int(dev_tf_name.split(':')[-1])
+
+            if dev_type in ['GPU','DML']:
+                dev_name = dev_tf_name
+
+                dev_desc = dev.physical_device_desc
+                if len(dev_desc) != 0:
+                    if dev_desc[0] == '{':
+                        dev_desc_json = json.loads(dev_desc)
+                        dev_desc_json_name = dev_desc_json.get('name',None)
+                        if dev_desc_json_name is not None:
+                            dev_name = dev_desc_json_name
+                    else:
+                        for param, value in ( v.split(':') for v in dev_desc.split(',') ):
+                            param = param.strip()
+                            value = value.strip()
+                            if param == 'name':
+                                dev_name = value
+                                break
+
+                physical_devices_f[dev_idx] = (dev_type, dev_name, dev.memory_limit)
+
+        q.put(physical_devices_f)
+        time.sleep(0.1)
+
+
     @staticmethod
     def initialize_main_env():
-        os.environ['NN_DEVICES_INITIALIZED'] = '1'
-        os.environ['NN_DEVICES_COUNT'] = '0'
+        if int(os.environ.get("NN_DEVICES_INITIALIZED", 0)) != 0:
+            return
+
+        if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
+            os.environ.pop('CUDA_VISIBLE_DEVICES')
 
         os.environ['CUDA_CACHE_MAXSIZE'] = '2147483647'
+        os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '2'
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # tf log errors only
+
+        q = multiprocessing.Queue()
+        p = multiprocessing.Process(target=Devices._get_tf_devices_proc, args=(q,), daemon=True)
+        p.start()
+        p.join()
+
+        visible_devices = q.get()
+
+        os.environ['NN_DEVICES_INITIALIZED'] = '1'
+        os.environ['NN_DEVICES_COUNT'] = str(len(visible_devices))
+
+        for i in visible_devices:
+            dev_type, name, total_mem = visible_devices[i]
+
+            os.environ[f'NN_DEVICE_{i}_TF_DEV_TYPE'] = dev_type
+            os.environ[f'NN_DEVICE_{i}_NAME'] = name
+            os.environ[f'NN_DEVICE_{i}_TOTAL_MEM'] = str(total_mem)
+            os.environ[f'NN_DEVICE_{i}_FREE_MEM'] = str(total_mem)
+
+
+
+    @staticmethod
+    def getDevices():
+        if Devices.all_devices is None:
+            if int(os.environ.get("NN_DEVICES_INITIALIZED", 0)) != 1:
+                raise Exception("nn devices are not initialized. Run initialize_main_env() in main process.")
+            devices = []
+            for i in range ( int(os.environ['NN_DEVICES_COUNT']) ):
+                devices.append ( Device(index=i,
+                                        tf_dev_type=os.environ[f'NN_DEVICE_{i}_TF_DEV_TYPE'],
+                                        name=os.environ[f'NN_DEVICE_{i}_NAME'],
+                                        total_mem=int(os.environ[f'NN_DEVICE_{i}_TOTAL_MEM']),
+                                        free_mem=int(os.environ[f'NN_DEVICE_{i}_FREE_MEM']), )
+                               )
+            Devices.all_devices = Devices(devices)
+
+        return Devices.all_devices
+
+"""
+
+
+        # {'name'      : name.split(b'\0', 1)[0].decode(),
+        #  'total_mem' : totalMem.value
+        # }
+
+
+
+
+
+        return
+
+
+
+
         min_cc = int(os.environ.get("TF_MIN_REQ_CAP", 35))
         libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
         for libname in libnames:
@@ -139,70 +268,4 @@ class Devices(object):
             os.environ[f'NN_DEVICE_{i}_TOTAL_MEM'] = str(device['total_mem'])
             os.environ[f'NN_DEVICE_{i}_FREE_MEM'] = str(device['free_mem'])
             os.environ[f'NN_DEVICE_{i}_CC'] = str(device['cc'])
-
-    @staticmethod
-    def getDevices():
-        if Devices.all_devices is None:
-            if int(os.environ.get("NN_DEVICES_INITIALIZED", 0)) != 1:
-                raise Exception("nn devices are not initialized. Run initialize_main_env() in main process.")
-            devices = []
-            for i in range ( int(os.environ['NN_DEVICES_COUNT']) ):
-                devices.append ( Device(index=i,
-                                        name=os.environ[f'NN_DEVICE_{i}_NAME'],
-                                        total_mem=int(os.environ[f'NN_DEVICE_{i}_TOTAL_MEM']),
-                                        free_mem=int(os.environ[f'NN_DEVICE_{i}_FREE_MEM']),
-                                        cc=int(os.environ[f'NN_DEVICE_{i}_CC']) ))
-            Devices.all_devices = Devices(devices)
-
-        return Devices.all_devices
-
-"""
-if Devices.all_devices is None:
-    min_cc = int(os.environ.get("TF_MIN_REQ_CAP", 35))
-
-    libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
-    for libname in libnames:
-        try:
-            cuda = ctypes.CDLL(libname)
-        except:
-            continue
-        else:
-            break
-    else:
-        return Devices([])
-
-    nGpus = ctypes.c_int()
-    name = b' ' * 200
-    cc_major = ctypes.c_int()
-    cc_minor = ctypes.c_int()
-    freeMem = ctypes.c_size_t()
-    totalMem = ctypes.c_size_t()
-
-    result = ctypes.c_int()
-    device = ctypes.c_int()
-    context = ctypes.c_void_p()
-    error_str = ctypes.c_char_p()
-
-    devices = []
-
-    if cuda.cuInit(0) == 0 and \
-       cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
-        for i in range(nGpus.value):
-            if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \
-               cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \
-               cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0:
-                continue
-
-            if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0:
-                if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
-                    cc = cc_major.value * 10 + cc_minor.value
-                    if cc >= min_cc:
-                        devices.append ( Device(index=i,
-                                                name=name.split(b'\0', 1)[0].decode(),
-                                                total_mem=totalMem.value,
-                                                free_mem=freeMem.value,
-                                                cc=cc) )
-                cuda.cuCtxDetach(context)
-    Devices.all_devices = Devices(devices)
-
-    return Devices.all_devices
 """
\ No newline at end of file
diff --git a/core/leras/nn.py b/core/leras/nn.py
index ef5c2c9..f392aaf 100644
--- a/core/leras/nn.py
+++ b/core/leras/nn.py
@@ -33,8 +33,8 @@ class nn():
     tf = None
     tf_sess = None
     tf_sess_config = None
-    tf_default_device = None
-
+    tf_default_device_name = None
+
     data_format = None
     conv2d_ch_axis = None
     conv2d_spatial_axes = None
@@ -50,9 +50,6 @@ class nn():
         nn.setCurrentDeviceConfig(device_config)
 
         # Manipulate environment variables before import tensorflow
-
-        if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
-            os.environ.pop('CUDA_VISIBLE_DEVICES')
 
         first_run = False
         if len(device_config.devices) != 0:
@@ -68,22 +65,19 @@ class nn():
                 compute_cache_path = Path(os.environ['APPDATA']) / 'NVIDIA' / ('ComputeCache' + devices_str)
                 if not compute_cache_path.exists():
                     first_run = True
+                    compute_cache_path.mkdir(parents=True, exist_ok=True)
                 os.environ['CUDA_CACHE_PATH'] = str(compute_cache_path)
-
-        os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '2'
-        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # tf log errors only
-
+
         if first_run:
             io.log_info("Caching GPU kernels...")
 
         import tensorflow
-
-        tf_version = getattr(tensorflow,'VERSION', None)
-        if tf_version is None:
-            tf_version = tensorflow.version.GIT_VERSION
-        if tf_version[0] == 'v':
-            tf_version = tf_version[1:]
-
+
+        tf_version = tensorflow.version.VERSION
+        #if tf_version is None:
+        #    tf_version = tensorflow.version.GIT_VERSION
+        if tf_version[0] == 'v':
+            tf_version = tf_version[1:]
         if tf_version[0] == '2':
             tf = tensorflow.compat.v1
         else:
@@ -108,13 +102,14 @@ class nn():
 
         # Configure tensorflow session-config
         if len(device_config.devices) == 0:
-            nn.tf_default_device = "/CPU:0"
             config = tf.ConfigProto(device_count={'GPU': 0})
+            nn.tf_default_device_name = '/CPU:0'
         else:
-            nn.tf_default_device = "/GPU:0"
+            nn.tf_default_device_name = f'/{device_config.devices[0].tf_dev_type}:0'
+
             config = tf.ConfigProto()
             config.gpu_options.visible_device_list = ','.join([str(device.index) for device in device_config.devices])
-
+
         config.gpu_options.force_gpu_compatible = True
         config.gpu_options.allow_growth = True
         nn.tf_sess_config = config
@@ -202,14 +197,6 @@ class nn():
             nn.tf_sess.close()
             nn.tf_sess = None
 
-    @staticmethod
-    def get_current_device():
-        # Undocumented access to last tf.device(...)
-        objs = nn.tf.get_default_graph()._device_function_stack.peek_objs()
-        if len(objs) != 0:
-            return objs[0].display_name
-        return nn.tf_default_device
-
     @staticmethod
     def ask_choose_device_idxs(choose_only_one=False, allow_cpu=True, suggest_best_multi_gpu=False, suggest_all_gpu=False):
         devices = Devices.getDevices()
diff --git a/facelib/FaceEnhancer.py b/facelib/FaceEnhancer.py
index 0b5ced3..1dc0dd9 100644
--- a/facelib/FaceEnhancer.py
+++ b/facelib/FaceEnhancer.py
@@ -161,11 +161,11 @@ class FaceEnhancer(object):
         if not model_path.exists():
             raise Exception("Unable to load FaceEnhancer.npy")
 
-        with tf.device ('/CPU:0' if place_model_on_cpu else '/GPU:0'):
+        with tf.device ('/CPU:0' if place_model_on_cpu else nn.tf_default_device_name):
             self.model = FaceEnhancer()
             self.model.load_weights (model_path)
 
-        with tf.device ('/CPU:0' if run_on_cpu else '/GPU:0'):
+        with tf.device ('/CPU:0' if run_on_cpu else nn.tf_default_device_name):
            self.model.build_for_run ([ (tf.float32, nn.get4Dshape (192,192,3) ),
                                        (tf.float32, (None,1,) ),
                                        (tf.float32, (None,1,) ),
diff --git a/facelib/XSegNet.py b/facelib/XSegNet.py
index 761ab94..5621a65 100644
--- a/facelib/XSegNet.py
+++ b/facelib/XSegNet.py
@@ -39,7 +39,7 @@ class XSegNet(object):
             self.target_t = tf.placeholder (nn.floatx, nn.get4Dshape(resolution,resolution,1) )
 
         # Initializing model classes
-        with tf.device ('/CPU:0' if place_model_on_cpu else '/GPU:0'):
+        with tf.device ('/CPU:0' if place_model_on_cpu else nn.tf_default_device_name):
             self.model = nn.XSeg(3, 32, 1, name=name)
             self.model_weights = self.model.get_weights()
             if training:
@@ -53,7 +53,7 @@ class XSegNet(object):
         self.model_filename_list += [ [self.model, f'{model_name}.npy'] ]
 
         if not training:
-            with tf.device ('/CPU:0' if run_on_cpu else '/GPU:0'):
+            with tf.device ('/CPU:0' if run_on_cpu else nn.tf_default_device_name):
                 _, pred = self.model(self.input_t)
 
             def net_run(input_np):
diff --git a/models/Model_Quick96/Model.py b/models/Model_Quick96/Model.py
index 3c39e46..fa139e5 100644
--- a/models/Model_Quick96/Model.py
+++ b/models/Model_Quick96/Model.py
@@ -31,7 +31,7 @@ class QModel(ModelBase):
         masked_training = True
 
         models_opt_on_gpu = len(devices) >= 1 and all([dev.total_mem_gb >= 4 for dev in devices])
-        models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
+        models_opt_device = nn.tf_default_device_name if models_opt_on_gpu and self.is_training else '/CPU:0'
         optimizer_vars_on_cpu = models_opt_device=='/CPU:0'
 
         input_ch = 3
@@ -96,7 +96,7 @@ class QModel(ModelBase):
            gpu_src_dst_loss_gvs = []
 
            for gpu_id in range(gpu_count):
-                with tf.device( f'/GPU:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
+                with tf.device( f'/{devices[gpu_id].tf_dev_type}:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
                    batch_slice = slice( gpu_id*bs_per_gpu, (gpu_id+1)*bs_per_gpu )
                    with tf.device(f'/CPU:0'):
                        # slice on CPU, otherwise all batch data will be transfered to GPU first
@@ -190,7 +190,7 @@ class QModel(ModelBase):
             self.AE_view = AE_view
         else:
             # Initializing merge function
-            with tf.device( f'/GPU:0' if len(devices) != 0 else f'/CPU:0'):
+            with tf.device( nn.tf_default_device_name if len(devices) != 0 else f'/CPU:0'):
                 gpu_dst_code = self.inter(self.encoder(self.warped_dst))
                 gpu_pred_src_dst, gpu_pred_src_dstm = self.decoder_src(gpu_dst_code)
                 _, gpu_pred_dst_dstm = self.decoder_dst(gpu_dst_code)
diff --git a/models/Model_SAEHD/Model.py b/models/Model_SAEHD/Model.py
index 79329ff..eb89172 100644
--- a/models/Model_SAEHD/Model.py
+++ b/models/Model_SAEHD/Model.py
@@ -235,9 +235,10 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
         ct_mode = self.options['ct_mode']
         if ct_mode == 'none':
             ct_mode = None
-
+
+
         models_opt_on_gpu = False if len(devices) == 0 else self.options['models_opt_on_gpu']
-        models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
+        models_opt_device = nn.tf_default_device_name if models_opt_on_gpu and self.is_training else '/CPU:0'
         optimizer_vars_on_cpu = models_opt_device=='/CPU:0'
 
         input_ch=3
@@ -336,7 +337,6 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
             bs_per_gpu = max(1, self.get_batch_size() // gpu_count)
             self.set_batch_size( gpu_count*bs_per_gpu)
 
-
             # Compute losses per GPU
             gpu_pred_src_src_list = []
             gpu_pred_dst_dst_list = []
@@ -350,9 +350,9 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
             gpu_G_loss_gvs = []
             gpu_D_code_loss_gvs = []
             gpu_D_src_dst_loss_gvs = []
+
             for gpu_id in range(gpu_count):
-                with tf.device( f'/GPU:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
-
+                with tf.device( f'/{devices[gpu_id].tf_dev_type}:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
                     with tf.device(f'/CPU:0'):
                         # slice on CPU, otherwise all batch data will be transfered to GPU first
                         batch_slice = slice( gpu_id*bs_per_gpu, (gpu_id+1)*bs_per_gpu )
@@ -360,10 +360,10 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
                         gpu_warped_dst      = self.warped_dst [batch_slice,:,:,:]
                         gpu_target_src      = self.target_src [batch_slice,:,:,:]
                         gpu_target_dst      = self.target_dst [batch_slice,:,:,:]
-                        gpu_target_srcm     = self.target_srcm[batch_slice,:,:,:]
-                        gpu_target_srcm_em  = self.target_srcm_em[batch_slice,:,:,:]
-                        gpu_target_dstm     = self.target_dstm[batch_slice,:,:,:]
-                        gpu_target_dstm_em  = self.target_dstm_em[batch_slice,:,:,:]
+                        gpu_target_srcm    = self.target_srcm[batch_slice,:,:,:]
+                        gpu_target_srcm_em = self.target_srcm_em[batch_slice,:,:,:]
+                        gpu_target_dstm    = self.target_dstm[batch_slice,:,:,:]
+                        gpu_target_dstm_em = self.target_dstm_em[batch_slice,:,:,:]
 
                         # process model tensors
                         if 'df' in archi_type:
@@ -571,7 +571,7 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
             self.AE_view = AE_view
         else:
             # Initializing merge function
-            with tf.device( f'/GPU:0' if len(devices) != 0 else f'/CPU:0'):
+            with tf.device( nn.tf_default_device_name if len(devices) != 0 else f'/CPU:0'):
                 if 'df' in archi_type:
                     gpu_dst_code = self.inter(self.encoder(self.warped_dst))
                     gpu_pred_src_dst, gpu_pred_src_dstm = self.decoder_src(gpu_dst_code)
diff --git a/models/Model_XSeg/Model.py b/models/Model_XSeg/Model.py
index cd6eea3..567a1f2 100644
--- a/models/Model_XSeg/Model.py
+++ b/models/Model_XSeg/Model.py
@@ -52,7 +52,7 @@ class XSegModel(ModelBase):
                                                'head' : FaceType.HEAD}[ self.options['face_type'] ]
 
         place_model_on_cpu = len(devices) == 0
-        models_opt_device = '/CPU:0' if place_model_on_cpu else '/GPU:0'
+        models_opt_device = '/CPU:0' if place_model_on_cpu else nn.tf_default_device_name
 
         bgr_shape = nn.get4Dshape(resolution,resolution,3)
         mask_shape = nn.get4Dshape(resolution,resolution,1)
@@ -83,7 +83,7 @@
 
             for gpu_id in range(gpu_count):
-                with tf.device( f'/GPU:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
+                with tf.device(f'/{devices[gpu_id].tf_dev_type}:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
                     with tf.device(f'/CPU:0'):
                         # slice on CPU, otherwise all batch data will be transfered to GPU first
                         batch_slice = slice( gpu_id*bs_per_gpu, (gpu_id+1)*bs_per_gpu )