DeepFaceLab/nnlib/device.py
Colombo d781af3d1f Fixed GPU detection and indexing: dropped nvml and now query the CUDA driver library directly, so the reported GPU info matches TensorFlow's device indexes.
Removed the TrueFace model.

Added the SAEv2 model. Differences from SAE:
+ default e_ch_dims is now 21
+ the new encoder produces a more stable face with less scale jitter
  before: https://i.imgur.com/4jUcol8.gifv
  after:  https://i.imgur.com/lyiax49.gifv - the scale of the face varies less within the frame
+ the decoder now has only 1 residual block instead of 2; quality is the same with a smaller decoder (see the sketch after this message)
+ added mid-full face, which covers 30% more area than half face
+ added the option "Enable 'true face' training"
  Enable it only after 50k iterations, when the face is sharp enough.
  The resulting face will look more like src.
  'True face' training gives the most src-like face achievable with the DF architecture.
2019-10-05 16:26:23 +04:00
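
For reference, a minimal sketch of a single decoder residual block as mentioned above, written against tf.keras; the layer order, activation, and channel count are illustrative assumptions, not the actual SAEv2 code:

from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, LeakyReLU, Add

def residual_block(x, ch):
    # conv -> lrelu -> conv, with a skip connection around the pair
    y = Conv2D(ch, kernel_size=3, padding='same')(x)
    y = LeakyReLU(0.2)(y)
    y = Conv2D(ch, kernel_size=3, padding='same')(y)
    y = Add()([x, y])  # skip connection; x must already have 'ch' channels
    return LeakyReLU(0.2)(y)

inp = Input((64, 64, 128))        # spatial size and channels are made up for the example
out = residual_block(inp, 128)    # SAEv2's decoder applies one such block
model = Model(inp, out)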


import sys
import ctypes
import os
import json
import numpy as np

# You can set DFL_TF_MIN_REQ_CAP manually for your build.
# We cannot check tensorflow.version instead, because that would require importing tensorflow here.
tf_min_req_cap = int(os.environ.get("DFL_TF_MIN_REQ_CAP", 35))
class device:
    backend = None

    class Config():
        force_gpu_idx = -1
        multi_gpu = False
        force_gpu_idxs = None
        choose_worst_gpu = False
        gpu_idxs = []
        gpu_names = []
        gpu_compute_caps = []
        gpu_vram_gb = []
        allow_growth = True
        use_fp16 = False
        cpu_only = False
        backend = None

        def __init__ (self, force_gpu_idx = -1,
                            multi_gpu = False,
                            force_gpu_idxs = None,
                            choose_worst_gpu = False,
                            allow_growth = True,
                            use_fp16 = False,
                            cpu_only = False,
                            **in_options):
            self.backend = device.backend
            self.use_fp16 = use_fp16
            self.cpu_only = cpu_only

            if not self.cpu_only:
                self.cpu_only = (self.backend == "tensorflow-cpu")

            if not self.cpu_only:
                self.force_gpu_idx = force_gpu_idx
                self.multi_gpu = multi_gpu
                self.force_gpu_idxs = force_gpu_idxs
                self.choose_worst_gpu = choose_worst_gpu
                self.allow_growth = allow_growth
                self.gpu_idxs = []

                if force_gpu_idxs is not None:
                    for idx in force_gpu_idxs.split(','):
                        idx = int(idx)
                        if device.isValidDeviceIdx(idx):
                            self.gpu_idxs.append(idx)
                else:
                    gpu_idx = force_gpu_idx if (force_gpu_idx >= 0 and device.isValidDeviceIdx(force_gpu_idx)) else device.getBestValidDeviceIdx() if not choose_worst_gpu else device.getWorstValidDeviceIdx()
                    if gpu_idx != -1:
                        if self.multi_gpu:
                            self.gpu_idxs = device.getDeviceIdxsEqualModel( gpu_idx )
                            if len(self.gpu_idxs) <= 1:
                                self.multi_gpu = False
                        else:
                            self.gpu_idxs = [gpu_idx]

                self.cpu_only = (len(self.gpu_idxs) == 0)

            if not self.cpu_only:
                self.gpu_names = []
                self.gpu_compute_caps = []
                self.gpu_vram_gb = []
                for gpu_idx in self.gpu_idxs:
                    self.gpu_names += [device.getDeviceName(gpu_idx)]
                    self.gpu_compute_caps += [ device.getDeviceComputeCapability(gpu_idx) ]
                    self.gpu_vram_gb += [ device.getDeviceVRAMTotalGb(gpu_idx) ]
                self.cpu_only = (len(self.gpu_idxs) == 0)
            else:
                self.gpu_names = ['CPU']
                self.gpu_compute_caps = [99]
                self.gpu_vram_gb = [0]

            if self.cpu_only:
                self.backend = "tensorflow-cpu"
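
    # Config resolves the final GPU index list in this priority order:
    #   1) force_gpu_idxs  - explicit comma-separated list, e.g. "0,2"
    #   2) force_gpu_idx   - a single index, if valid
    #   3) otherwise the best (or, with choose_worst_gpu, the worst) device ranked
    #      by total VRAM, expanded to all same-model GPUs when multi_gpu is set.
    # If no valid GPU remains, it falls back to the "tensorflow-cpu" backend.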
    @staticmethod
    def getValidDeviceIdxsEnumerator():
        if device.backend == "plaidML":
            for i in range(plaidML_devices_count):
                yield i
        elif device.backend == "tensorflow":
            for dev in cuda_devices:
                yield dev['index']

    @staticmethod
    def getValidDevicesWithAtLeastTotalMemoryGB(totalmemsize_gb):
        result = []
        if device.backend == "plaidML":
            for i in device.getValidDeviceIdxsEnumerator():
                if plaidML_devices[i]['globalMemSize'] >= totalmemsize_gb*1024*1024*1024:
                    result.append (i)
        elif device.backend == "tensorflow":
            for dev in cuda_devices:
                if dev['total_mem'] >= totalmemsize_gb*1024*1024*1024:
                    result.append (dev['index']) # this branch loops over dev, not i
        return result

    @staticmethod
    def getValidDevicesIdxsWithNamesList():
        if device.backend == "plaidML":
            return [ (i, plaidML_devices[i]['description'] ) for i in device.getValidDeviceIdxsEnumerator() ]
        elif device.backend == "tensorflow":
            return [ ( dev['index'], dev['name'] ) for dev in cuda_devices ]
        elif device.backend == "tensorflow-cpu":
            return [ (0, 'CPU') ]

    @staticmethod
    def getDeviceVRAMTotalGb (idx):
        if device.backend == "plaidML":
            if idx < plaidML_devices_count:
                return plaidML_devices[idx]['globalMemSize'] / (1024*1024*1024)
        elif device.backend == "tensorflow":
            for dev in cuda_devices:
                if idx == dev['index']:
                    return round ( dev['total_mem'] / (1024*1024*1024) )
        return 0

    @staticmethod
    def getBestValidDeviceIdx():
        if device.backend == "plaidML":
            idx = -1
            idx_mem = 0
            for i in device.getValidDeviceIdxsEnumerator():
                total = plaidML_devices[i]['globalMemSize']
                if total > idx_mem:
                    idx = i
                    idx_mem = total
            return idx
        elif device.backend == "tensorflow":
            idx = -1
            idx_mem = 0
            for dev in cuda_devices:
                if dev['total_mem'] > idx_mem:
                    idx = dev['index']
                    idx_mem = dev['total_mem']
            return idx

    @staticmethod
    def getWorstValidDeviceIdx():
        if device.backend == "plaidML":
            idx = -1
            idx_mem = sys.maxsize
            for i in device.getValidDeviceIdxsEnumerator():
                total = plaidML_devices[i]['globalMemSize']
                if total < idx_mem:
                    idx = i
                    idx_mem = total
            return idx
        elif device.backend == "tensorflow":
            idx = -1
            idx_mem = sys.maxsize
            for dev in cuda_devices:
                if dev['total_mem'] < idx_mem:
                    idx = dev['index']
                    idx_mem = dev['total_mem']
            return idx

    @staticmethod
    def isValidDeviceIdx(idx):
        if device.backend == "plaidML":
            return idx in [*device.getValidDeviceIdxsEnumerator()]
        elif device.backend == "tensorflow":
            for dev in cuda_devices:
                if idx == dev['index']:
                    return True
        return False

    @staticmethod
    def getDeviceIdxsEqualModel(idx):
        if device.backend == "plaidML":
            result = []
            idx_name = plaidML_devices[idx]['description']
            for i in device.getValidDeviceIdxsEnumerator():
                if plaidML_devices[i]['description'] == idx_name:
                    result.append (i)
            return result
        elif device.backend == "tensorflow":
            result = []
            idx_name = device.getDeviceName(idx)
            for dev in cuda_devices:
                if dev['name'] == idx_name:
                    result.append ( dev['index'] )
            return result

    @staticmethod
    def getDeviceName (idx):
        if device.backend == "plaidML":
            if idx < plaidML_devices_count:
                return plaidML_devices[idx]['description']
        elif device.backend == "tensorflow":
            for dev in cuda_devices:
                if dev['index'] == idx:
                    return dev['name']
        return None

    @staticmethod
    def getDeviceID (idx):
        if device.backend == "plaidML":
            if idx < plaidML_devices_count:
                return plaidML_devices[idx]['id'].decode()
        return None

    @staticmethod
    def getDeviceComputeCapability(idx):
        if device.backend == "plaidML":
            return 99
        elif device.backend == "tensorflow":
            for dev in cuda_devices:
                if dev['index'] == idx:
                    return dev['cc']
        return 0

plaidML_build = os.environ.get("DFL_PLAIDML_BUILD", "0") == "1"
plaidML_devices = None
cuda_devices = None

if plaidML_build:
    if plaidML_devices is None:
        plaidML_devices = []
        # Use the plaidML OpenCL backend to enumerate system devices.
        try:
            os.environ['PLAIDML_EXPERIMENTAL'] = 'false' # lets plaidML work without running 'plaidml-setup' first
            import plaidml
            ctx = plaidml.Context()
            for d in plaidml.devices(ctx, return_all=True)[0]:
                details = json.loads(d.details)
                if details['type'] == 'CPU': # skip OpenCL CPU devices
                    continue
                plaidML_devices += [ {'id':d.id,
                                      'globalMemSize' : int(details['globalMemSize']),
                                      'description' : d.description.decode()
                                     }]
            ctx.shutdown()
        except:
            pass

    plaidML_devices_count = len(plaidML_devices) # used by the plaidML branches in the device class
    if len(plaidML_devices) != 0:
        device.backend = "plaidML"
else:
    if cuda_devices is None:
        cuda_devices = []
        libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
        cuda = None
        for libname in libnames:
            try:
                cuda = ctypes.CDLL(libname)
            except:
                continue
            else:
                break
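
        # Device enumeration below goes through the CUDA driver API directly
        # (cuInit -> cuDeviceGetCount -> cuDeviceGet / cuDeviceGetName /
        # cuDeviceComputeCapability, then cuCtxCreate_v2 + cuMemGetInfo_v2 for
        # VRAM, and cuCtxDetach to release the context). Each call returns a
        # CUresult status code where 0 means CUDA_SUCCESS, so the comparisons
        # against 0 are success checks. The indexes produced here match the
        # order in which TensorFlow enumerates GPUs.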
        if cuda is not None:
            nGpus = ctypes.c_int()
            name = b' ' * 200
            cc_major = ctypes.c_int()
            cc_minor = ctypes.c_int()
            freeMem = ctypes.c_size_t()
            totalMem = ctypes.c_size_t()
            result = ctypes.c_int()
            device_t = ctypes.c_int()
            context = ctypes.c_void_p()
            error_str = ctypes.c_char_p()

            if cuda.cuInit(0) == 0 and \
               cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
                for i in range(nGpus.value):
                    if cuda.cuDeviceGet(ctypes.byref(device_t), i) != 0 or \
                       cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device_t) != 0 or \
                       cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device_t) != 0:
                        continue

                    if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device_t) == 0:
                        if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
                            cc = cc_major.value * 10 + cc_minor.value
                            if cc >= tf_min_req_cap:
                                cuda_devices.append ( {'index':i,
                                                       'name':name.split(b'\0', 1)[0].decode(),
                                                       'total_mem':totalMem.value,
                                                       'free_mem':freeMem.value,
                                                       'cc':cc
                                                      } )
                        cuda.cuCtxDetach(context)
    if len(cuda_devices) != 0:
        device.backend = "tensorflow"

if device.backend is None:
    device.backend = "tensorflow-cpu"
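
For orientation, a minimal usage sketch of the Config class defined above. The import path assumes DeepFaceLab's repository layout, and the printed attributes are the ones Config populates; this is an illustrative snippet, not code from the repository:

from nnlib.device import device

# Let Config pick the best available GPU; it falls back to the
# "tensorflow-cpu" backend when no valid CUDA device is found.
cfg = device.Config(allow_growth=True)
print("backend:", cfg.backend)
print("devices:", list(zip(cfg.gpu_names, cfg.gpu_compute_caps, cfg.gpu_vram_gb)))

# Or pin work to an explicit set of GPUs, given as a comma-separated string:
cfg = device.Config(force_gpu_idxs="0,1", multi_gpu=True)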