added DirectX12-compatible cards support through onnxruntime-directml

2025-07-07 05:22:08 -07:00 · 2021-09-09 17:15:30 +04:00 · 2021-09-09 17:15:30 +04:00 · 6d504d5969
commit 6d504d5969
parent 071bf80681
5 changed files with 163 additions and 171 deletions
--- a/build/windows/WindowsBuilder.py
+++ b/build/windows/WindowsBuilder.py
@ -12,7 +12,7 @@ from typing import List
 class WindowsFolderBuilder:
    """
-    Builds standalone python folder for Windows with the project from scratch.
+    Builds stand-alone portable all-in-one python folder for Windows with the project from scratch.
    """
    # Constants
@ -462,36 +462,51 @@ pause
 """)
-def build_deepfacelive_windows(release_dir, cache_dir, python_ver='3.7.9'):
+def build_deepfacelive_windows(release_dir, cache_dir, python_ver='3.7.9', backend='cuda'):
    builder = WindowsFolderBuilder(release_path=Path(release_dir),
                                   cache_path=Path(cache_dir),
                                   python_ver=python_ver,
                                   clear_release_path=True)
-    builder.install_pip_package('numpy==1.21.1')
+
    # PIP INSTALLATIONS
    builder.install_pip_package('numpy==1.21.2')
    builder.install_pip_package('scipy==1.5.4')
    builder.install_pip_package('numexpr')
    builder.install_pip_package('opencv-python==4.5.3.56')
    builder.install_pip_package('opencv-contrib-python==4.5.3.56')
    builder.install_pip_package('pyqt6==6.1.1')
-    builder.install_pip_package('torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html')
+    builder.install_pip_package('onnx==1.10.1')
-    builder.install_pip_package('onnxruntime-gpu==1.8.1')
+
-    builder.install_pip_package('cupy-cuda111===9.0.0')
+    if backend == 'cuda':
        builder.install_pip_package('torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html')
        builder.install_pip_package('onnxruntime-gpu==1.8.1')
        builder.install_pip_package('cupy-cuda111===9.0.0')
    elif backend == 'directml':
        if python_ver[:3] == '3.7':
            builder.install_pip_package('https://github.com/iperov/DeepFaceLive/releases/download/ort-dml/onnxruntime_directml-1.8.2-cp37-cp37m-win_amd64.whl')
        else:
            raise Exception(f'no onnxruntime_directml wheel for python {python_ver}')
    builder.install_ffmpeg_binaries()
-    print('Moving CUDA dlls from Torch to shared directory')
+    #
    cuda_bin_path = builder.cuda_bin_path
    torch_lib_path = builder.python_site_packages_path / 'torch' / 'lib'
-    for cu_file in torch_lib_path.glob("**/cu*64*.dll"):
+    if backend == 'cuda':
-        target = cuda_bin_path / cu_file.name
+        print('Moving CUDA dlls from Torch to shared directory')
-        print (f'Moving {target}')
+        cuda_bin_path = builder.cuda_bin_path
-        shutil.move (str(cu_file), str(target) )
+        torch_lib_path = builder.python_site_packages_path / 'torch' / 'lib'
-    for file in torch_lib_path.glob("**/nvrtc*.dll"):
+        for cu_file in torch_lib_path.glob("**/cu*64*.dll"):
-        target = cuda_bin_path / file.name
+            target = cuda_bin_path / cu_file.name
-        print (f'Moving {target}')
+            print (f'Moving {target}')
-        shutil.move (str(file), str(target) )
+            shutil.move (str(cu_file), str(target) )
        for file in torch_lib_path.glob("**/nvrtc*.dll"):
            target = cuda_bin_path / file.name
            print (f'Moving {target}')
            shutil.move (str(file), str(target) )
    deepfacelive_path = builder.get_internal_path() / 'DeepFaceLive'
@ -511,8 +526,13 @@ def build_deepfacelive_windows(release_dir, cache_dir, python_ver='3.7.9'):
    print('Copying samples.')
    shutil.copytree( str(Path(__file__).parent.parent / 'samples'), str(userdata_path / 'samples') )
-    builder.create_run_python_script('DeepFaceLive.bat', 'DeepFaceLive\\main.py', 'run DeepFaceLive --userdata-dir=%~dp0userdata')
+    if backend == 'cuda':
-    builder.create_internal_run_python_script('build DeepFaceLive.bat','DeepFaceLive\\build\\windows\\WindowsBuilder.py', '--build-type dfl-windows --release-dir Builds\DeepFaceLive --cache-dir _cache' )
+        builder.create_run_python_script('DeepFaceLive.bat', 'DeepFaceLive\\main.py', 'run DeepFaceLive --userdata-dir=%~dp0userdata')
    elif backend == 'directml':
        builder.create_run_python_script('DeepFaceLive.bat', 'DeepFaceLive\\main.py', 'run DeepFaceLive --userdata-dir=%~dp0userdata --no-cuda')
    builder.create_internal_run_python_script('build DeepFaceLive CUDA.bat',     'DeepFaceLive\\build\\windows\\WindowsBuilder.py', '--build-type dfl-windows --release-dir Builds\DeepFaceLive --cache-dir _cache --backend cuda')
    builder.create_internal_run_python_script('build DeepFaceLive DirectML.bat', 'DeepFaceLive\\build\\windows\\WindowsBuilder.py', '--build-type dfl-windows --release-dir Builds\DeepFaceLive --cache-dir _cache --backend directml')
    builder.run_python('main.py dev merge_large_files --delete-parts', cwd=deepfacelive_path)
@ -531,12 +551,15 @@ if __name__ == '__main__':
    p.add_argument('--release-dir', action=fixPathAction, default=None)
    p.add_argument('--cache-dir', action=fixPathAction, default=None)
    p.add_argument('--python-ver', default="3.7.9")
    p.add_argument('--backend', choices=['cuda', 'directml'], default='cuda')
    args = p.parse_args()
    if args.build_type == 'dfl-windows':
        build_deepfacelive_windows(release_dir=args.release_dir,
                                   cache_dir=args.cache_dir,
-                                   python_ver=args.python_ver)
+                                   python_ver=args.python_ver,
                                   backend=args.backend)
--- a/main.py
+++ b/main.py
@ -1,5 +1,9 @@
 import argparse
 import os
 import platform
 from pathlib import Path
 from xlib import appargs as lib_appargs
 # onnxruntime==1.8.0 requires CUDA_PATH_V11_2, but 1.8.1 doesn't.
 # Keep this code in case that behaviour returns.
@ -10,9 +14,6 @@ import platform
 #         # set environ for onnxruntime
 #         # os.environ['CUDA_PATH_V11_2'] = os.environ['CUDA_PATH']
 import argparse
 from pathlib import Path
 def main():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
@ -22,15 +23,17 @@ def main():
    def run_DeepFaceLive(args):
        userdata_path = Path(args.userdata_dir)
        lib_appargs.set_arg_bool('NO_CUDA', args.no_cuda)
        print('Running DeepFaceLive.')
        from apps.DeepFaceLive.DeepFaceLiveApp import DeepFaceLiveApp
        DeepFaceLiveApp(userdata_path=userdata_path).run()
    p = run_subparsers.add_parser('DeepFaceLive')
    p.add_argument('--userdata-dir', default=None, action=fixPathAction, help="Workspace directory.")
    p.add_argument('--no-cuda', action="store_true", default=False, help="Disable CUDA.")
    p.set_defaults(func=run_DeepFaceLive)
    dev_parser = subparsers.add_parser("dev")
    dev_subparsers = dev_parser.add_subparsers()
--- a/xlib/cupy/device.py
+++ b/xlib/cupy/device.py
@ -1,6 +1,6 @@
 from typing import List
-
+from .. import appargs as lib_appargs
 class CuPyDeviceInfo:
@ -53,7 +53,11 @@ def get_available_devices() -> List[CuPyDeviceInfo]:
    """
    returns a list of available CuPyDeviceInfo
    """
    if lib_appargs.get_arg_bool('NO_CUDA'):
        return []
    global _cupy_devices
    if _cupy_devices is None:
        import cupy as cp # BUG eats 1.8Gb paging file per process, so import on demand
        devices = []
--- a/xlib/onnxruntime/InferenceSession.py
+++ b/xlib/onnxruntime/InferenceSession.py
@ -18,21 +18,18 @@ def InferenceSession_with_device(onnx_model_or_path, device_info : ORTDeviceInfo
        onnx.save(onnx_model_or_path, b)
        onnx_model_or_path = b.getvalue()
-    prs = rt.get_available_providers()
+    device_ep = device_info.get_execution_provider()
    if device_ep not in rt.get_available_providers():
        raise Exception(f'{device_ep} is not avaiable in onnxruntime')
-    if device_info.is_cpu():
+    ep_flags = {}
-        if 'CPUExecutionProvider' not in prs:
+    if device_ep in ['CUDAExecutionProvider','DmlExecutionProvider']:
-            raise Exception('CPUExecutionProvider is not avaiable in onnxruntime')
+        ep_flags['device_id'] = device_info.get_index()
        providers = ['CPUExecutionProvider']
    else:
        if 'CUDAExecutionProvider' not in prs:
            raise Exception('CUDAExecutionProvider is not avaiable in onnxruntime')
        providers = [ ('CUDAExecutionProvider', {'device_id': device_info.get_index() }) ]
        #providers = [ ('DmlExecutionProvider', {'device_id': 1 }) ]
    sess_options = rt.SessionOptions()
    #sess_options.enable_mem_pattern = False #for DmlExecutionProvider
    sess_options.log_severity_level = 4
    sess_options.log_verbosity_level = -1
-    sess = rt.InferenceSession(onnx_model_or_path, providers=providers, sess_options=sess_options)
+    if device_ep == 'DmlExecutionProvider':
        sess_options.enable_mem_pattern = False
    sess = rt.InferenceSession(onnx_model_or_path, providers=[ (device_ep, ep_flags) ], sess_options=sess_options)
    return sess
--- a/xlib/onnxruntime/device.py
+++ b/xlib/onnxruntime/device.py
@ -1,19 +1,24 @@
 import ctypes
 import itertools
 import os
 from typing import List
 import onnxruntime as rt
 from .. import appargs as lib_appargs
 class ORTDeviceInfo:
    """
    Represents picklable ONNXRuntime device info
    """
-    def __init__(self, index=None, name=None, total_memory=None, free_memory=None, compute_capability=None):
+    def __init__(self, index=None, execution_provider=None, name=None, total_memory=None, free_memory=None):
        self._index : int = index
        self._execution_provider : str = execution_provider
        self._name : str = name
        self._total_memory : int = total_memory
        self._free_memory : int = free_memory
        self._compute_capability : int = compute_capability
    def __getstate__(self):
        return self.__dict__.copy()
@ -27,8 +32,8 @@ class ORTDeviceInfo:
    def get_index(self) -> int:
        return self._index
-    def get_compute_capability(self) -> int:
+    def get_execution_provider(self) -> str:
-        return self._compute_capability
+        return self._execution_provider
    def get_name(self) -> str:
        return self._name
@ -51,90 +56,19 @@ class ORTDeviceInfo:
        if self.is_cpu():
            return f"CPU"
        else:
-            return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]"
+            ep = self.get_execution_provider()
            if ep == 'CUDAExecutionProvider':
                return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb] [CUDA]"
            elif ep == 'DmlExecutionProvider':
                return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb] [DirectX12]"
    def __repr__(self):
        return f'{self.__class__.__name__} object: ' + self.__str__()
 # class ORTDevicesInfo:
 #     """
 #     a list of ORTDeviceInfo
 #     """
 #     def __init__(self, devices : List[ORTDeviceInfo] = None):
 #         if devices is None:
 #             devices = []
 #         self._devices = devices
 #     def __getstate__(self):
 #         return self.__dict__.copy()
 #     def __setstate__(self, d):
 #         self.__init__()
 #         self.__dict__.update(d)
 #     def add(self, device_or_devices : ORTDeviceInfo):
 #         if isinstance(device_or_devices, ORTDeviceInfo):
 #             if device_or_devices not in self._devices:
 #                 self._devices.append(device_or_devices)
 #         elif isinstance(device_or_devices, ORTDevicesInfo):
 #             for device in device_or_devices:
 #                 self.add(device)
 #     def copy(self):
 #         return copy.deepcopy(self)
 #     def get_count(self): return len(self._devices)
 #     def get_highest_total_memory_device(self) -> ORTDeviceInfo:
 #         """
 #         returns ORTDeviceInfo with highest available memory, if devices support total_memory parameter
 #         """
 #         result = None
 #         idx_mem = 0
 #         for device in self._devices:
 #             mem = device.get_total_memory()
 #             if result is None or (mem is not None and mem > idx_mem):
 #                 result = device
 #                 idx_mem = mem
 #         return result
 #     def get_lowest_total_memory_device(self) -> ORTDeviceInfo:
 #         """
 #         returns ORTDeviceInfo with lowest available memory, if devices support total_memory parameter
 #         """
 #         result = None
 #         idx_mem = sys.maxsize
 #         for device in self._devices:
 #             mem = device.get_total_memory()
 #             if result is None or (mem is not None and mem < idx_mem):
 #                 result = device
 #                 idx_mem = mem
 #         return result
 #     def __len__(self):
 #         return len(self._devices)
 #     def __getitem__(self, key):
 #         result = self._devices[key]
 #         if isinstance(key, slice):
 #             return self.__class__(result)
 #         return result
 #     def __iter__(self):
 #         for device in self._devices:
 #             yield device
 #     def __str__(self):  return f'{self.__class__.__name__}:[' + ', '.join([ device.__str__() for device in self._devices ]) + ']'
 #     def __repr__(self): return f'{self.__class__.__name__}:[' + ', '.join([ device.__repr__() for device in self._devices ]) + ']'
 _ort_devices_info = None
 def get_cpu_device() -> ORTDeviceInfo:
-    return ORTDeviceInfo(index=-1, name='CPU', total_memory=0, free_memory=0, compute_capability=0)
+    return ORTDeviceInfo(index=-1, execution_provider='CPUExecutionProvider', name='CPU', total_memory=0, free_memory=0)
 def get_available_devices_info(include_cpu=True, cpu_only=False) -> List[ORTDeviceInfo]:
    """
@ -145,12 +79,13 @@ def get_available_devices_info(include_cpu=True, cpu_only=False) -> List[ORTDevi
        _initialize_ort_devices()
        devices = []
        if not cpu_only:
-            for i in range ( int(os.environ['ORT_DEVICES_COUNT']) ):
+            for i in range ( int(os.environ.get('ORT_DEVICES_COUNT',0)) ):
-                devices.append ( ORTDeviceInfo(index=i,
+                devices.append ( ORTDeviceInfo(index=int(os.environ[f'ORT_DEVICE_{i}_INDEX']),
-                                                name=os.environ[f'ORT_DEVICE_{i}_NAME'],
+                                               execution_provider=os.environ[f'ORT_DEVICE_{i}_EP'],
-                                                total_memory=int(os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM']),
+                                               name=os.environ[f'ORT_DEVICE_{i}_NAME'],
-                                                free_memory=int(os.environ[f'ORT_DEVICE_{i}_FREE_MEM']),
+                                               total_memory=int(os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM']),
-                                                compute_capability=int(os.environ[f'ORT_DEVICE_{i}_CC']) ))
+                                               free_memory=int(os.environ[f'ORT_DEVICE_{i}_FREE_MEM']),
                                              ) )
        if include_cpu or cpu_only:
            devices.append(get_cpu_device())
        _ort_devices_info = devices
@ -168,55 +103,85 @@ def _initialize_ort_devices():
    if int(os.environ.get('ORT_DEVICES_INITIALIZED', 0)) == 0:
        os.environ['ORT_DEVICES_INITIALIZED'] = '1'
        os.environ['ORT_DEVICES_COUNT'] = '0'
        os.environ['CUDA_CACHE_MAXSIZE'] = '2147483647'
        try:
            libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
            for libname in libnames:
                try:
                    cuda = ctypes.CDLL(libname)
                except:
                    continue
                else:
                    break
            else:
                return
-            nGpus = ctypes.c_int()
+        devices = []
-            name = b' ' * 200
+        prs = rt.get_available_providers()
-            cc_major = ctypes.c_int()
+        if not lib_appargs.get_arg_bool('NO_CUDA') and 'CUDAExecutionProvider' in prs:
-            cc_minor = ctypes.c_int()
+            os.environ['CUDA_CACHE_MAXSIZE'] = '2147483647'
-            freeMem = ctypes.c_size_t()
+            try:
-            totalMem = ctypes.c_size_t()
+                libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
-            device = ctypes.c_int()
+                for libname in libnames:
-            context = ctypes.c_void_p()
+                    try:
-            devices = []
+                        cuda = ctypes.CDLL(libname)
-
+                    except:
            if cuda.cuInit(0) == 0 and \
                cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
                for i in range(nGpus.value):
                    if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \
                        cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \
                        cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0:
                        continue
                    else:
                        break
                else:
                    return
-                    if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0:
+                nGpus = ctypes.c_int()
-                        if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
+                name = b' ' * 200
-                            cc = cc_major.value * 10 + cc_minor.value
+                cc_major = ctypes.c_int()
-                            devices.append ({'name'     : name.split(b'\0', 1)[0].decode(),
+                cc_minor = ctypes.c_int()
-                                            'total_mem' : totalMem.value,
+                freeMem = ctypes.c_size_t()
-                                            'free_mem'  : freeMem.value,
+                totalMem = ctypes.c_size_t()
-                                            'cc'        : cc
+                device = ctypes.c_int()
                context = ctypes.c_void_p()
                if cuda.cuInit(0) == 0 and \
                    cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
                    for i in range(nGpus.value):
                        if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \
                            cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \
                            cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0:
                            continue
                        if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0:
                            if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
                                cc = cc_major.value * 10 + cc_minor.value
                                devices.append ({'index'     : i,
                                                 'execution_provider' : 'CUDAExecutionProvider',
                                                 'name'      : name.split(b'\0', 1)[0].decode(),
                                                 'total_mem' : totalMem.value,
                                                 'free_mem'  : freeMem.value,
                                                })
                            cuda.cuCtxDetach(context)
            except Exception as e:
                print(f'CUDA devices initialization error: {e}')
        if 'DmlExecutionProvider' in prs:
            # onnxruntime-directml has no device enumeration API for users. Thus the code must follow the same logic
            # as here https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/dml/dml_provider_factory.cc
            from xlib.api.win32 import dxgi as lib_dxgi
            dxgi_factory = lib_dxgi.create_DXGIFactory4()
            if dxgi_factory is not None:
                for i in itertools.count():
                    adapter = dxgi_factory.enum_adapters1(i)
                    if adapter is not None:
                        desc = adapter.get_desc1()
                        if desc.Flags != lib_dxgi.DXGI_ADAPTER_FLAG.DXGI_ADAPTER_FLAG_SOFTWARE and \
                           not (desc.VendorId == 0x1414 and desc.DeviceId == 0x8c):
                            devices.append ({'index'     : i,
                                             'execution_provider' : 'DmlExecutionProvider',
                                             'name'      : desc.Description,
                                             'total_mem' : desc.DedicatedVideoMemory,
                                             'free_mem'  : desc.DedicatedVideoMemory,
                                            })
-                        cuda.cuCtxDetach(context)
+                        adapter.Release()        
-        except Exception as e:
+                    else:
-            print(f'CUDA devices initialization error: {e}')
+                        break
-            devices = []
+                dxgi_factory.Release()
        os.environ['ORT_DEVICES_COUNT'] = str(len(devices))
        for i, device in enumerate(devices):
            os.environ[f'ORT_DEVICE_{i}_INDEX'] = str(device['index'])
            os.environ[f'ORT_DEVICE_{i}_EP'] = device['execution_provider']
            os.environ[f'ORT_DEVICE_{i}_NAME'] = device['name']
            os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM'] = str(device['total_mem'])
            os.environ[f'ORT_DEVICE_{i}_FREE_MEM'] = str(device['free_mem'])
            os.environ[f'ORT_DEVICE_{i}_CC'] = str(device['cc'])
 _initialize_ort_devices()