diff --git a/build/windows/WindowsBuilder.py b/build/windows/WindowsBuilder.py index 52ac684..f34c456 100644 --- a/build/windows/WindowsBuilder.py +++ b/build/windows/WindowsBuilder.py @@ -12,7 +12,7 @@ from typing import List class WindowsFolderBuilder: """ - Builds standalone python folder for Windows with the project from scratch. + Builds stand-alone portable all-in-one python folder for Windows with the project from scratch. """ # Constants @@ -462,36 +462,51 @@ pause """) -def build_deepfacelive_windows(release_dir, cache_dir, python_ver='3.7.9'): +def build_deepfacelive_windows(release_dir, cache_dir, python_ver='3.7.9', backend='cuda'): + builder = WindowsFolderBuilder(release_path=Path(release_dir), cache_path=Path(cache_dir), python_ver=python_ver, clear_release_path=True) - builder.install_pip_package('numpy==1.21.1') + + # PIP INSTALLATIONS + + builder.install_pip_package('numpy==1.21.2') builder.install_pip_package('scipy==1.5.4') builder.install_pip_package('numexpr') builder.install_pip_package('opencv-python==4.5.3.56') builder.install_pip_package('opencv-contrib-python==4.5.3.56') builder.install_pip_package('pyqt6==6.1.1') - builder.install_pip_package('torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html') - builder.install_pip_package('onnxruntime-gpu==1.8.1') - builder.install_pip_package('cupy-cuda111===9.0.0') + builder.install_pip_package('onnx==1.10.1') + + if backend == 'cuda': + builder.install_pip_package('torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html') + builder.install_pip_package('onnxruntime-gpu==1.8.1') + builder.install_pip_package('cupy-cuda111===9.0.0') + elif backend == 'directml': + if python_ver[:3] == '3.7': + builder.install_pip_package('https://github.com/iperov/DeepFaceLive/releases/download/ort-dml/onnxruntime_directml-1.8.2-cp37-cp37m-win_amd64.whl') + else: + raise Exception(f'no onnxruntime_directml wheel for python {python_ver}') builder.install_ffmpeg_binaries() - print('Moving CUDA dlls from Torch to shared directory') - cuda_bin_path = builder.cuda_bin_path - torch_lib_path = builder.python_site_packages_path / 'torch' / 'lib' + # - for cu_file in torch_lib_path.glob("**/cu*64*.dll"): - target = cuda_bin_path / cu_file.name - print (f'Moving {target}') - shutil.move (str(cu_file), str(target) ) + if backend == 'cuda': + print('Moving CUDA dlls from Torch to shared directory') + cuda_bin_path = builder.cuda_bin_path + torch_lib_path = builder.python_site_packages_path / 'torch' / 'lib' - for file in torch_lib_path.glob("**/nvrtc*.dll"): - target = cuda_bin_path / file.name - print (f'Moving {target}') - shutil.move (str(file), str(target) ) + for cu_file in torch_lib_path.glob("**/cu*64*.dll"): + target = cuda_bin_path / cu_file.name + print (f'Moving {target}') + shutil.move (str(cu_file), str(target) ) + + for file in torch_lib_path.glob("**/nvrtc*.dll"): + target = cuda_bin_path / file.name + print (f'Moving {target}') + shutil.move (str(file), str(target) ) deepfacelive_path = builder.get_internal_path() / 'DeepFaceLive' @@ -511,8 +526,13 @@ def build_deepfacelive_windows(release_dir, cache_dir, python_ver='3.7.9'): print('Copying samples.') shutil.copytree( str(Path(__file__).parent.parent / 'samples'), str(userdata_path / 'samples') ) - builder.create_run_python_script('DeepFaceLive.bat', 'DeepFaceLive\\main.py', 'run DeepFaceLive --userdata-dir=%~dp0userdata') - builder.create_internal_run_python_script('build DeepFaceLive.bat','DeepFaceLive\\build\\windows\\WindowsBuilder.py', '--build-type dfl-windows --release-dir Builds\DeepFaceLive --cache-dir _cache' ) + if backend == 'cuda': + builder.create_run_python_script('DeepFaceLive.bat', 'DeepFaceLive\\main.py', 'run DeepFaceLive --userdata-dir=%~dp0userdata') + elif backend == 'directml': + builder.create_run_python_script('DeepFaceLive.bat', 'DeepFaceLive\\main.py', 'run DeepFaceLive --userdata-dir=%~dp0userdata --no-cuda') + + builder.create_internal_run_python_script('build DeepFaceLive CUDA.bat', 'DeepFaceLive\\build\\windows\\WindowsBuilder.py', '--build-type dfl-windows --release-dir Builds\DeepFaceLive --cache-dir _cache --backend cuda') + builder.create_internal_run_python_script('build DeepFaceLive DirectML.bat', 'DeepFaceLive\\build\\windows\\WindowsBuilder.py', '--build-type dfl-windows --release-dir Builds\DeepFaceLive --cache-dir _cache --backend directml') builder.run_python('main.py dev merge_large_files --delete-parts', cwd=deepfacelive_path) @@ -531,12 +551,15 @@ if __name__ == '__main__': p.add_argument('--release-dir', action=fixPathAction, default=None) p.add_argument('--cache-dir', action=fixPathAction, default=None) p.add_argument('--python-ver', default="3.7.9") + p.add_argument('--backend', choices=['cuda', 'directml'], default='cuda') + args = p.parse_args() if args.build_type == 'dfl-windows': build_deepfacelive_windows(release_dir=args.release_dir, cache_dir=args.cache_dir, - python_ver=args.python_ver) + python_ver=args.python_ver, + backend=args.backend) diff --git a/main.py b/main.py index fd206b2..9c4f407 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,9 @@ +import argparse import os import platform +from pathlib import Path + +from xlib import appargs as lib_appargs # onnxruntime==1.8.0 requires CUDA_PATH_V11_2, but 1.8.1 don't # keep the code if they return that behaviour @@ -10,9 +14,6 @@ import platform # # set environ for onnxruntime # # os.environ['CUDA_PATH_V11_2'] = os.environ['CUDA_PATH'] -import argparse -from pathlib import Path - def main(): parser = argparse.ArgumentParser() subparsers = parser.add_subparsers() @@ -22,29 +23,31 @@ def main(): def run_DeepFaceLive(args): userdata_path = Path(args.userdata_dir) + lib_appargs.set_arg_bool('NO_CUDA', args.no_cuda) + print('Running DeepFaceLive.') from apps.DeepFaceLive.DeepFaceLiveApp import DeepFaceLiveApp DeepFaceLiveApp(userdata_path=userdata_path).run() p = run_subparsers.add_parser('DeepFaceLive') p.add_argument('--userdata-dir', default=None, action=fixPathAction, help="Workspace directory.") + p.add_argument('--no-cuda', action="store_true", default=False, help="Disable CUDA.") p.set_defaults(func=run_DeepFaceLive) - dev_parser = subparsers.add_parser("dev") dev_subparsers = dev_parser.add_subparsers() - + def run_split_large_files(args): from scripts import dev dev.split_large_files() - + p = dev_subparsers.add_parser('split_large_files') p.set_defaults(func=run_split_large_files) - + def run_merge_large_files(args): from scripts import dev dev.merge_large_files(delete_parts=args.delete_parts) - + p = dev_subparsers.add_parser('merge_large_files') p.add_argument('--delete-parts', action="store_true", default=False) p.set_defaults(func=run_merge_large_files) @@ -63,6 +66,6 @@ class fixPathAction(argparse.Action): if __name__ == '__main__': main() - + # import code # code.interact(local=dict(globals(), **locals())) diff --git a/xlib/cupy/device.py b/xlib/cupy/device.py index dccc9ca..b765c2e 100644 --- a/xlib/cupy/device.py +++ b/xlib/cupy/device.py @@ -1,6 +1,6 @@ from typing import List - +from .. import appargs as lib_appargs class CuPyDeviceInfo: @@ -53,7 +53,11 @@ def get_available_devices() -> List[CuPyDeviceInfo]: """ returns a list of available CuPyDeviceInfo """ + if lib_appargs.get_arg_bool('NO_CUDA'): + return [] + global _cupy_devices + if _cupy_devices is None: import cupy as cp # BUG eats 1.8Gb paging file per process, so import on demand devices = [] diff --git a/xlib/onnxruntime/InferenceSession.py b/xlib/onnxruntime/InferenceSession.py index e3eed9b..6df1da6 100644 --- a/xlib/onnxruntime/InferenceSession.py +++ b/xlib/onnxruntime/InferenceSession.py @@ -12,27 +12,24 @@ def InferenceSession_with_device(onnx_model_or_path, device_info : ORTDeviceInfo can raise Exception """ - + if isinstance(onnx_model_or_path, onnx.ModelProto): b = BytesIO() onnx.save(onnx_model_or_path, b) onnx_model_or_path = b.getvalue() - prs = rt.get_available_providers() + device_ep = device_info.get_execution_provider() + if device_ep not in rt.get_available_providers(): + raise Exception(f'{device_ep} is not avaiable in onnxruntime') - if device_info.is_cpu(): - if 'CPUExecutionProvider' not in prs: - raise Exception('CPUExecutionProvider is not avaiable in onnxruntime') - providers = ['CPUExecutionProvider'] - else: - if 'CUDAExecutionProvider' not in prs: - raise Exception('CUDAExecutionProvider is not avaiable in onnxruntime') - providers = [ ('CUDAExecutionProvider', {'device_id': device_info.get_index() }) ] - #providers = [ ('DmlExecutionProvider', {'device_id': 1 }) ] + ep_flags = {} + if device_ep in ['CUDAExecutionProvider','DmlExecutionProvider']: + ep_flags['device_id'] = device_info.get_index() sess_options = rt.SessionOptions() - #sess_options.enable_mem_pattern = False #for DmlExecutionProvider sess_options.log_severity_level = 4 sess_options.log_verbosity_level = -1 - sess = rt.InferenceSession(onnx_model_or_path, providers=providers, sess_options=sess_options) + if device_ep == 'DmlExecutionProvider': + sess_options.enable_mem_pattern = False + sess = rt.InferenceSession(onnx_model_or_path, providers=[ (device_ep, ep_flags) ], sess_options=sess_options) return sess diff --git a/xlib/onnxruntime/device.py b/xlib/onnxruntime/device.py index ea9d0cb..ad8ddc4 100644 --- a/xlib/onnxruntime/device.py +++ b/xlib/onnxruntime/device.py @@ -1,19 +1,24 @@ import ctypes +import itertools import os from typing import List +import onnxruntime as rt + +from .. import appargs as lib_appargs + class ORTDeviceInfo: """ Represents picklable ONNXRuntime device info """ - def __init__(self, index=None, name=None, total_memory=None, free_memory=None, compute_capability=None): + def __init__(self, index=None, execution_provider=None, name=None, total_memory=None, free_memory=None): self._index : int = index + self._execution_provider : str = execution_provider self._name : str = name self._total_memory : int = total_memory self._free_memory : int = free_memory - self._compute_capability : int = compute_capability def __getstate__(self): return self.__dict__.copy() @@ -27,8 +32,8 @@ class ORTDeviceInfo: def get_index(self) -> int: return self._index - def get_compute_capability(self) -> int: - return self._compute_capability + def get_execution_provider(self) -> str: + return self._execution_provider def get_name(self) -> str: return self._name @@ -51,90 +56,19 @@ class ORTDeviceInfo: if self.is_cpu(): return f"CPU" else: - return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]" + ep = self.get_execution_provider() + if ep == 'CUDAExecutionProvider': + return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb] [CUDA]" + elif ep == 'DmlExecutionProvider': + return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb] [DirectX12]" def __repr__(self): return f'{self.__class__.__name__} object: ' + self.__str__() - -# class ORTDevicesInfo: -# """ -# a list of ORTDeviceInfo -# """ - -# def __init__(self, devices : List[ORTDeviceInfo] = None): -# if devices is None: -# devices = [] -# self._devices = devices - -# def __getstate__(self): -# return self.__dict__.copy() - -# def __setstate__(self, d): -# self.__init__() -# self.__dict__.update(d) - -# def add(self, device_or_devices : ORTDeviceInfo): -# if isinstance(device_or_devices, ORTDeviceInfo): -# if device_or_devices not in self._devices: -# self._devices.append(device_or_devices) -# elif isinstance(device_or_devices, ORTDevicesInfo): -# for device in device_or_devices: -# self.add(device) - -# def copy(self): -# return copy.deepcopy(self) - -# def get_count(self): return len(self._devices) - -# def get_highest_total_memory_device(self) -> ORTDeviceInfo: -# """ -# returns ORTDeviceInfo with highest available memory, if devices support total_memory parameter -# """ -# result = None -# idx_mem = 0 -# for device in self._devices: -# mem = device.get_total_memory() -# if result is None or (mem is not None and mem > idx_mem): -# result = device -# idx_mem = mem -# return result - -# def get_lowest_total_memory_device(self) -> ORTDeviceInfo: -# """ -# returns ORTDeviceInfo with lowest available memory, if devices support total_memory parameter -# """ -# result = None -# idx_mem = sys.maxsize -# for device in self._devices: -# mem = device.get_total_memory() -# if result is None or (mem is not None and mem < idx_mem): -# result = device -# idx_mem = mem -# return result - -# def __len__(self): -# return len(self._devices) - -# def __getitem__(self, key): -# result = self._devices[key] -# if isinstance(key, slice): -# return self.__class__(result) -# return result - -# def __iter__(self): -# for device in self._devices: -# yield device - -# def __str__(self): return f'{self.__class__.__name__}:[' + ', '.join([ device.__str__() for device in self._devices ]) + ']' -# def __repr__(self): return f'{self.__class__.__name__}:[' + ', '.join([ device.__repr__() for device in self._devices ]) + ']' - - - _ort_devices_info = None def get_cpu_device() -> ORTDeviceInfo: - return ORTDeviceInfo(index=-1, name='CPU', total_memory=0, free_memory=0, compute_capability=0) + return ORTDeviceInfo(index=-1, execution_provider='CPUExecutionProvider', name='CPU', total_memory=0, free_memory=0) def get_available_devices_info(include_cpu=True, cpu_only=False) -> List[ORTDeviceInfo]: """ @@ -145,12 +79,13 @@ def get_available_devices_info(include_cpu=True, cpu_only=False) -> List[ORTDevi _initialize_ort_devices() devices = [] if not cpu_only: - for i in range ( int(os.environ['ORT_DEVICES_COUNT']) ): - devices.append ( ORTDeviceInfo(index=i, - name=os.environ[f'ORT_DEVICE_{i}_NAME'], - total_memory=int(os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM']), - free_memory=int(os.environ[f'ORT_DEVICE_{i}_FREE_MEM']), - compute_capability=int(os.environ[f'ORT_DEVICE_{i}_CC']) )) + for i in range ( int(os.environ.get('ORT_DEVICES_COUNT',0)) ): + devices.append ( ORTDeviceInfo(index=int(os.environ[f'ORT_DEVICE_{i}_INDEX']), + execution_provider=os.environ[f'ORT_DEVICE_{i}_EP'], + name=os.environ[f'ORT_DEVICE_{i}_NAME'], + total_memory=int(os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM']), + free_memory=int(os.environ[f'ORT_DEVICE_{i}_FREE_MEM']), + ) ) if include_cpu or cpu_only: devices.append(get_cpu_device()) _ort_devices_info = devices @@ -168,55 +103,85 @@ def _initialize_ort_devices(): if int(os.environ.get('ORT_DEVICES_INITIALIZED', 0)) == 0: os.environ['ORT_DEVICES_INITIALIZED'] = '1' os.environ['ORT_DEVICES_COUNT'] = '0' - os.environ['CUDA_​CACHE_​MAXSIZE'] = '2147483647' - try: - libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll') - for libname in libnames: - try: - cuda = ctypes.CDLL(libname) - except: - continue - else: - break - else: - return - nGpus = ctypes.c_int() - name = b' ' * 200 - cc_major = ctypes.c_int() - cc_minor = ctypes.c_int() - freeMem = ctypes.c_size_t() - totalMem = ctypes.c_size_t() - device = ctypes.c_int() - context = ctypes.c_void_p() - devices = [] - - if cuda.cuInit(0) == 0 and \ - cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0: - for i in range(nGpus.value): - if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \ - cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \ - cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0: + devices = [] + prs = rt.get_available_providers() + if not lib_appargs.get_arg_bool('NO_CUDA') and 'CUDAExecutionProvider' in prs: + os.environ['CUDA_​CACHE_​MAXSIZE'] = '2147483647' + try: + libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll') + for libname in libnames: + try: + cuda = ctypes.CDLL(libname) + except: continue + else: + break + else: + return - if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0: - if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0: - cc = cc_major.value * 10 + cc_minor.value - devices.append ({'name' : name.split(b'\0', 1)[0].decode(), - 'total_mem' : totalMem.value, - 'free_mem' : freeMem.value, - 'cc' : cc + nGpus = ctypes.c_int() + name = b' ' * 200 + cc_major = ctypes.c_int() + cc_minor = ctypes.c_int() + freeMem = ctypes.c_size_t() + totalMem = ctypes.c_size_t() + device = ctypes.c_int() + context = ctypes.c_void_p() + + + if cuda.cuInit(0) == 0 and \ + cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0: + for i in range(nGpus.value): + if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \ + cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \ + cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0: + continue + + if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0: + if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0: + cc = cc_major.value * 10 + cc_minor.value + devices.append ({'index' : i, + 'execution_provider' : 'CUDAExecutionProvider', + 'name' : name.split(b'\0', 1)[0].decode(), + 'total_mem' : totalMem.value, + 'free_mem' : freeMem.value, + }) + cuda.cuCtxDetach(context) + except Exception as e: + print(f'CUDA devices initialization error: {e}') + + if 'DmlExecutionProvider' in prs: + # onnxruntime-directml has no device enumeration API for users. Thus the code must follow the same logic + # as here https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/dml/dml_provider_factory.cc + + from xlib.api.win32 import dxgi as lib_dxgi + + dxgi_factory = lib_dxgi.create_DXGIFactory4() + if dxgi_factory is not None: + for i in itertools.count(): + adapter = dxgi_factory.enum_adapters1(i) + if adapter is not None: + desc = adapter.get_desc1() + if desc.Flags != lib_dxgi.DXGI_ADAPTER_FLAG.DXGI_ADAPTER_FLAG_SOFTWARE and \ + not (desc.VendorId == 0x1414 and desc.DeviceId == 0x8c): + devices.append ({'index' : i, + 'execution_provider' : 'DmlExecutionProvider', + 'name' : desc.Description, + 'total_mem' : desc.DedicatedVideoMemory, + 'free_mem' : desc.DedicatedVideoMemory, }) - cuda.cuCtxDetach(context) - except Exception as e: - print(f'CUDA devices initialization error: {e}') - devices = [] + adapter.Release() + else: + break + dxgi_factory.Release() os.environ['ORT_DEVICES_COUNT'] = str(len(devices)) for i, device in enumerate(devices): + os.environ[f'ORT_DEVICE_{i}_INDEX'] = str(device['index']) + os.environ[f'ORT_DEVICE_{i}_EP'] = device['execution_provider'] os.environ[f'ORT_DEVICE_{i}_NAME'] = device['name'] os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM'] = str(device['total_mem']) os.environ[f'ORT_DEVICE_{i}_FREE_MEM'] = str(device['free_mem']) - os.environ[f'ORT_DEVICE_{i}_CC'] = str(device['cc']) - + _initialize_ort_devices()