diff --git a/core/leras/device.py b/core/leras/device.py
index 4d157f0..2f65382 100644
--- a/core/leras/device.py
+++ b/core/leras/device.py
@@ -1,12 +1,19 @@
 import sys
 import ctypes
 import os
+import multiprocessing
+import json
+import time
+from pathlib import Path
+from core.interact import interact as io
+
 
 class Device(object):
-    def __init__(self, index, name, total_mem, free_mem, cc=0):
+    def __init__(self, index, tf_dev_type, name, total_mem, free_mem):
         self.index = index
+        self.tf_dev_type = tf_dev_type
         self.name = name
-        self.cc = cc
+
         self.total_mem = total_mem
         self.total_mem_gb = total_mem / 1024**3
         self.free_mem = free_mem
@@ -82,12 +89,134 @@ class Devices(object):
             result.append (device)
         return Devices(result)
 
+    @staticmethod
+    def _get_tf_devices_proc(q : multiprocessing.Queue):
+
+        compute_cache_path = Path(os.environ['APPDATA']) / 'NVIDIA' / ('ComputeCache_ALL')
+        os.environ['CUDA_CACHE_PATH'] = str(compute_cache_path)
+        if not compute_cache_path.exists():
+            io.log_info("Caching GPU kernels...")
+            compute_cache_path.mkdir(parents=True, exist_ok=True)
+
+        import tensorflow
+
+        tf_version = tensorflow.version.VERSION
+        #if tf_version is None:
+        #    tf_version = tensorflow.version.GIT_VERSION
+        if tf_version[0] == 'v':
+            tf_version = tf_version[1:]
+        if tf_version[0] == '2':
+            tf = tensorflow.compat.v1
+        else:
+            tf = tensorflow
+
+        import logging
+        # Disable tensorflow warnings
+        tf_logger = logging.getLogger('tensorflow')
+        tf_logger.setLevel(logging.ERROR)
+
+        from tensorflow.python.client import device_lib
+
+        devices = []
+
+        physical_devices = device_lib.list_local_devices()
+        physical_devices_f = {}
+        for dev in physical_devices:
+            dev_type = dev.device_type
+            dev_tf_name = dev.name
+            dev_tf_name = dev_tf_name[ dev_tf_name.index(dev_type) : ]
+
+            dev_idx = int(dev_tf_name.split(':')[-1])
+
+            if dev_type in ['GPU','DML']:
+                dev_name = dev_tf_name
+
+                dev_desc = dev.physical_device_desc
+                if len(dev_desc) != 0:
+                    if dev_desc[0] == '{':
+                        dev_desc_json = json.loads(dev_desc)
+                        dev_desc_json_name = dev_desc_json.get('name',None)
+                        if dev_desc_json_name is not None:
+                            dev_name = dev_desc_json_name
+                    else:
+                        for param, value in ( v.split(':') for v in dev_desc.split(',') ):
+                            param = param.strip()
+                            value = value.strip()
+                            if param == 'name':
+                                dev_name = value
+                                break
+
+                physical_devices_f[dev_idx] = (dev_type, dev_name, dev.memory_limit)
+
+        q.put(physical_devices_f)
+        time.sleep(0.1)
+
+
     @staticmethod
     def initialize_main_env():
-        os.environ['NN_DEVICES_INITIALIZED'] = '1'
-        os.environ['NN_DEVICES_COUNT'] = '0'
+        if int(os.environ.get("NN_DEVICES_INITIALIZED", 0)) != 0:
+            return
+
+        if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
+            os.environ.pop('CUDA_VISIBLE_DEVICES')
 
         os.environ['CUDA_CACHE_MAXSIZE'] = '2147483647'
+        os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '2'
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # tf log errors only
+
+        q = multiprocessing.Queue()
+        p = multiprocessing.Process(target=Devices._get_tf_devices_proc, args=(q,), daemon=True)
+        p.start()
+        p.join()
+
+        visible_devices = q.get()
+
+        os.environ['NN_DEVICES_INITIALIZED'] = '1'
+        os.environ['NN_DEVICES_COUNT'] = str(len(visible_devices))
+
+        for i in visible_devices:
+            dev_type, name, total_mem = visible_devices[i]
+
+            os.environ[f'NN_DEVICE_{i}_TF_DEV_TYPE'] = dev_type
+            os.environ[f'NN_DEVICE_{i}_NAME'] = name
+            os.environ[f'NN_DEVICE_{i}_TOTAL_MEM'] = str(total_mem)
+            os.environ[f'NN_DEVICE_{i}_FREE_MEM'] = str(total_mem)
+
+
+
+    @staticmethod
+    def getDevices():
+        if Devices.all_devices is None:
+            if int(os.environ.get("NN_DEVICES_INITIALIZED", 0)) != 1:
+                raise Exception("nn devices are not initialized. Run initialize_main_env() in main process.")
+            devices = []
+            for i in range ( int(os.environ['NN_DEVICES_COUNT']) ):
+                devices.append ( Device(index=i,
+                                        tf_dev_type=os.environ[f'NN_DEVICE_{i}_TF_DEV_TYPE'],
+                                        name=os.environ[f'NN_DEVICE_{i}_NAME'],
+                                        total_mem=int(os.environ[f'NN_DEVICE_{i}_TOTAL_MEM']),
+                                        free_mem=int(os.environ[f'NN_DEVICE_{i}_FREE_MEM']), )
+                               )
+            Devices.all_devices = Devices(devices)
+
+        return Devices.all_devices
+
+"""
+
+
+        # {'name'      : name.split(b'\0', 1)[0].decode(),
+        #  'total_mem' : totalMem.value
+        # }
+
+
+
+
+
+        return
+
+
+
+
         min_cc = int(os.environ.get("TF_MIN_REQ_CAP", 35))
         libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
         for libname in libnames:
@@ -139,70 +268,4 @@ class Devices(object):
             os.environ[f'NN_DEVICE_{i}_TOTAL_MEM'] = str(device['total_mem'])
             os.environ[f'NN_DEVICE_{i}_FREE_MEM'] = str(device['free_mem'])
             os.environ[f'NN_DEVICE_{i}_CC'] = str(device['cc'])
-
-    @staticmethod
-    def getDevices():
-        if Devices.all_devices is None:
-            if int(os.environ.get("NN_DEVICES_INITIALIZED", 0)) != 1:
-                raise Exception("nn devices are not initialized. Run initialize_main_env() in main process.")
-            devices = []
-            for i in range ( int(os.environ['NN_DEVICES_COUNT']) ):
-                devices.append ( Device(index=i,
-                                        name=os.environ[f'NN_DEVICE_{i}_NAME'],
-                                        total_mem=int(os.environ[f'NN_DEVICE_{i}_TOTAL_MEM']),
-                                        free_mem=int(os.environ[f'NN_DEVICE_{i}_FREE_MEM']),
-                                        cc=int(os.environ[f'NN_DEVICE_{i}_CC']) ))
-            Devices.all_devices = Devices(devices)
-
-        return Devices.all_devices
-
-"""
-if Devices.all_devices is None:
-    min_cc = int(os.environ.get("TF_MIN_REQ_CAP", 35))
-
-    libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
-    for libname in libnames:
-        try:
-            cuda = ctypes.CDLL(libname)
-        except:
-            continue
-        else:
-            break
-    else:
-        return Devices([])
-
-    nGpus = ctypes.c_int()
-    name = b' ' * 200
-    cc_major = ctypes.c_int()
-    cc_minor = ctypes.c_int()
-    freeMem = ctypes.c_size_t()
-    totalMem = ctypes.c_size_t()
-
-    result = ctypes.c_int()
-    device = ctypes.c_int()
-    context = ctypes.c_void_p()
-    error_str = ctypes.c_char_p()
-
-    devices = []
-
-    if cuda.cuInit(0) == 0 and \
-       cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
-        for i in range(nGpus.value):
-            if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \
-               cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \
-               cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0:
-                continue
-
-            if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0:
-                if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
-                    cc = cc_major.value * 10 + cc_minor.value
-                    if cc >= min_cc:
-                        devices.append ( Device(index=i,
-                                                name=name.split(b'\0', 1)[0].decode(),
-                                                total_mem=totalMem.value,
-                                                free_mem=freeMem.value,
-                                                cc=cc) )
-                cuda.cuCtxDetach(context)
-    Devices.all_devices = Devices(devices)
-
-    return Devices.all_devices
 """
\ No newline at end of file
diff --git a/core/leras/nn.py b/core/leras/nn.py
index ef5c2c9..f392aaf 100644
--- a/core/leras/nn.py
+++ b/core/leras/nn.py
@@ -33,8 +33,8 @@ class nn():
     tf = None
     tf_sess = None
     tf_sess_config = None
-    tf_default_device = None
-
+    tf_default_device_name = None
+
     data_format = None
     conv2d_ch_axis = None
     conv2d_spatial_axes = None
@@ -50,9 +50,6 @@ class nn():
         nn.setCurrentDeviceConfig(device_config)
 
         # Manipulate environment variables before import tensorflow
-
-        if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
-            os.environ.pop('CUDA_VISIBLE_DEVICES')
 
         first_run = False
         if len(device_config.devices) != 0:
@@ -68,22 +65,19 @@ class nn():
                 compute_cache_path = Path(os.environ['APPDATA']) / 'NVIDIA' / ('ComputeCache' + devices_str)
                 if not compute_cache_path.exists():
                     first_run = True
+                    compute_cache_path.mkdir(parents=True, exist_ok=True)
                 os.environ['CUDA_CACHE_PATH'] = str(compute_cache_path)
-
-        os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '2'
-        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # tf log errors only
-
+
         if first_run:
             io.log_info("Caching GPU kernels...")
 
         import tensorflow
-
-        tf_version = getattr(tensorflow,'VERSION', None)
-        if tf_version is None:
-            tf_version = tensorflow.version.GIT_VERSION
-        if tf_version[0] == 'v':
-            tf_version = tf_version[1:]
-
+
+        tf_version = tensorflow.version.VERSION
+        #if tf_version is None:
+        #    tf_version = tensorflow.version.GIT_VERSION
+        if tf_version[0] == 'v':
+            tf_version = tf_version[1:]
         if tf_version[0] == '2':
             tf = tensorflow.compat.v1
         else:
@@ -108,13 +102,14 @@ class nn():
 
         # Configure tensorflow session-config
         if len(device_config.devices) == 0:
-            nn.tf_default_device = "/CPU:0"
             config = tf.ConfigProto(device_count={'GPU': 0})
+            nn.tf_default_device_name = '/CPU:0'
         else:
-            nn.tf_default_device = "/GPU:0"
+            nn.tf_default_device_name = f'/{device_config.devices[0].tf_dev_type}:0'
+
             config = tf.ConfigProto()
             config.gpu_options.visible_device_list = ','.join([str(device.index) for device in device_config.devices])
-
+
         config.gpu_options.force_gpu_compatible = True
         config.gpu_options.allow_growth = True
         nn.tf_sess_config = config
@@ -202,14 +197,6 @@ class nn():
             nn.tf_sess.close()
             nn.tf_sess = None
 
-    @staticmethod
-    def get_current_device():
-        # Undocumented access to last tf.device(...)
-        objs = nn.tf.get_default_graph()._device_function_stack.peek_objs()
-        if len(objs) != 0:
-            return objs[0].display_name
-        return nn.tf_default_device
-
     @staticmethod
     def ask_choose_device_idxs(choose_only_one=False, allow_cpu=True, suggest_best_multi_gpu=False, suggest_all_gpu=False):
         devices = Devices.getDevices()
diff --git a/facelib/FaceEnhancer.py b/facelib/FaceEnhancer.py
index 0b5ced3..1dc0dd9 100644
--- a/facelib/FaceEnhancer.py
+++ b/facelib/FaceEnhancer.py
@@ -161,11 +161,11 @@ class FaceEnhancer(object):
         if not model_path.exists():
             raise Exception("Unable to load FaceEnhancer.npy")
 
-        with tf.device ('/CPU:0' if place_model_on_cpu else '/GPU:0'):
+        with tf.device ('/CPU:0' if place_model_on_cpu else nn.tf_default_device_name):
             self.model = FaceEnhancer()
             self.model.load_weights (model_path)
 
-        with tf.device ('/CPU:0' if run_on_cpu else '/GPU:0'):
+        with tf.device ('/CPU:0' if run_on_cpu else nn.tf_default_device_name):
            self.model.build_for_run ([ (tf.float32, nn.get4Dshape (192,192,3) ),
                                        (tf.float32, (None,1,) ),
                                        (tf.float32, (None,1,) ),
diff --git a/facelib/XSegNet.py b/facelib/XSegNet.py
index 761ab94..5621a65 100644
--- a/facelib/XSegNet.py
+++ b/facelib/XSegNet.py
@@ -39,7 +39,7 @@ class XSegNet(object):
             self.target_t = tf.placeholder (nn.floatx, nn.get4Dshape(resolution,resolution,1) )
 
         # Initializing model classes
-        with tf.device ('/CPU:0' if place_model_on_cpu else '/GPU:0'):
+        with tf.device ('/CPU:0' if place_model_on_cpu else nn.tf_default_device_name):
             self.model = nn.XSeg(3, 32, 1, name=name)
             self.model_weights = self.model.get_weights()
             if training:
@@ -53,7 +53,7 @@ class XSegNet(object):
         self.model_filename_list += [ [self.model, f'{model_name}.npy'] ]
 
         if not training:
-            with tf.device ('/CPU:0' if run_on_cpu else '/GPU:0'):
+            with tf.device ('/CPU:0' if run_on_cpu else nn.tf_default_device_name):
                 _, pred = self.model(self.input_t)
 
             def net_run(input_np):
diff --git a/models/Model_Quick96/Model.py b/models/Model_Quick96/Model.py
index 3c39e46..fa139e5 100644
--- a/models/Model_Quick96/Model.py
+++ b/models/Model_Quick96/Model.py
@@ -31,7 +31,7 @@ class QModel(ModelBase):
         masked_training = True
 
         models_opt_on_gpu = len(devices) >= 1 and all([dev.total_mem_gb >= 4 for dev in devices])
-        models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
+        models_opt_device = nn.tf_default_device_name if models_opt_on_gpu and self.is_training else '/CPU:0'
         optimizer_vars_on_cpu = models_opt_device=='/CPU:0'
 
         input_ch = 3
@@ -96,7 +96,7 @@ class QModel(ModelBase):
            gpu_src_dst_loss_gvs = []
 
            for gpu_id in range(gpu_count):
-                with tf.device( f'/GPU:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
+                with tf.device( f'/{devices[gpu_id].tf_dev_type}:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
                    batch_slice = slice( gpu_id*bs_per_gpu, (gpu_id+1)*bs_per_gpu )
                    with tf.device(f'/CPU:0'):
                        # slice on CPU, otherwise all batch data will be transfered to GPU first
@@ -190,7 +190,7 @@ class QModel(ModelBase):
             self.AE_view = AE_view
         else:
             # Initializing merge function
-            with tf.device( f'/GPU:0' if len(devices) != 0 else f'/CPU:0'):
+            with tf.device( nn.tf_default_device_name if len(devices) != 0 else f'/CPU:0'):
                 gpu_dst_code = self.inter(self.encoder(self.warped_dst))
                 gpu_pred_src_dst, gpu_pred_src_dstm = self.decoder_src(gpu_dst_code)
                 _, gpu_pred_dst_dstm = self.decoder_dst(gpu_dst_code)
diff --git a/models/Model_SAEHD/Model.py b/models/Model_SAEHD/Model.py
index 79329ff..eb89172 100644
--- a/models/Model_SAEHD/Model.py
+++ b/models/Model_SAEHD/Model.py
@@ -235,9 +235,10 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
         ct_mode = self.options['ct_mode']
         if ct_mode == 'none':
             ct_mode = None
-
+
+
         models_opt_on_gpu = False if len(devices) == 0 else self.options['models_opt_on_gpu']
-        models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
+        models_opt_device = nn.tf_default_device_name if models_opt_on_gpu and self.is_training else '/CPU:0'
         optimizer_vars_on_cpu = models_opt_device=='/CPU:0'
 
         input_ch=3
@@ -336,7 +337,6 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
             bs_per_gpu = max(1, self.get_batch_size() // gpu_count)
             self.set_batch_size( gpu_count*bs_per_gpu)
 
-
             # Compute losses per GPU
             gpu_pred_src_src_list = []
             gpu_pred_dst_dst_list = []
@@ -350,9 +350,9 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
             gpu_G_loss_gvs = []
             gpu_D_code_loss_gvs = []
             gpu_D_src_dst_loss_gvs = []
+
             for gpu_id in range(gpu_count):
-                with tf.device( f'/GPU:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
-
+                with tf.device( f'/{devices[gpu_id].tf_dev_type}:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
                     with tf.device(f'/CPU:0'):
                         # slice on CPU, otherwise all batch data will be transfered to GPU first
                         batch_slice = slice( gpu_id*bs_per_gpu, (gpu_id+1)*bs_per_gpu )
@@ -360,10 +360,10 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
                         gpu_warped_dst      = self.warped_dst [batch_slice,:,:,:]
                         gpu_target_src      = self.target_src [batch_slice,:,:,:]
                         gpu_target_dst      = self.target_dst [batch_slice,:,:,:]
-                        gpu_target_srcm     = self.target_srcm[batch_slice,:,:,:]
-                        gpu_target_srcm_em  = self.target_srcm_em[batch_slice,:,:,:]
-                        gpu_target_dstm     = self.target_dstm[batch_slice,:,:,:]
-                        gpu_target_dstm_em  = self.target_dstm_em[batch_slice,:,:,:]
+                        gpu_target_srcm    = self.target_srcm[batch_slice,:,:,:]
+                        gpu_target_srcm_em = self.target_srcm_em[batch_slice,:,:,:]
+                        gpu_target_dstm    = self.target_dstm[batch_slice,:,:,:]
+                        gpu_target_dstm_em = self.target_dstm_em[batch_slice,:,:,:]
 
                         # process model tensors
                         if 'df' in archi_type:
@@ -571,7 +571,7 @@ Examples: df, liae, df-d, df-ud, liae-ud, ...
             self.AE_view = AE_view
         else:
             # Initializing merge function
-            with tf.device( f'/GPU:0' if len(devices) != 0 else f'/CPU:0'):
+            with tf.device( nn.tf_default_device_name if len(devices) != 0 else f'/CPU:0'):
                 if 'df' in archi_type:
                     gpu_dst_code = self.inter(self.encoder(self.warped_dst))
                     gpu_pred_src_dst, gpu_pred_src_dstm = self.decoder_src(gpu_dst_code)
diff --git a/models/Model_XSeg/Model.py b/models/Model_XSeg/Model.py
index cd6eea3..567a1f2 100644
--- a/models/Model_XSeg/Model.py
+++ b/models/Model_XSeg/Model.py
@@ -52,7 +52,7 @@ class XSegModel(ModelBase):
                                                'head' : FaceType.HEAD}[ self.options['face_type'] ]
 
         place_model_on_cpu = len(devices) == 0
-        models_opt_device = '/CPU:0' if place_model_on_cpu else '/GPU:0'
+        models_opt_device = '/CPU:0' if place_model_on_cpu else nn.tf_default_device_name
 
         bgr_shape = nn.get4Dshape(resolution,resolution,3)
         mask_shape = nn.get4Dshape(resolution,resolution,1)
@@ -83,7 +83,7 @@
 
             for gpu_id in range(gpu_count):
-                with tf.device( f'/GPU:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
+                with tf.device(f'/{devices[gpu_id].tf_dev_type}:{gpu_id}' if len(devices) != 0 else f'/CPU:0' ):
                     with tf.device(f'/CPU:0'):
                         # slice on CPU, otherwise all batch data will be transfered to GPU first
                         batch_slice = slice( gpu_id*bs_per_gpu, (gpu_id+1)*bs_per_gpu )