Added support for DirectX12-compatible cards through onnxruntime-directml

This commit is contained in:
iperov 2021-09-09 17:15:30 +04:00
parent 071bf80681
commit 6d504d5969
5 changed files with 163 additions and 171 deletions

View file

@ -1,19 +1,24 @@
import ctypes
import itertools
import os
from typing import List
import onnxruntime as rt
from .. import appargs as lib_appargs
class ORTDeviceInfo:
"""
Represents picklable ONNXRuntime device info
"""
def __init__(self, index=None, name=None, total_memory=None, free_memory=None, compute_capability=None):
def __init__(self, index=None, execution_provider=None, name=None, total_memory=None, free_memory=None):
self._index : int = index
self._execution_provider : str = execution_provider
self._name : str = name
self._total_memory : int = total_memory
self._free_memory : int = free_memory
self._compute_capability : int = compute_capability
def __getstate__(self):
return self.__dict__.copy()
@ -27,8 +32,8 @@ class ORTDeviceInfo:
def get_index(self) -> int:
"""Return the device index given at construction."""
return self._index
def get_compute_capability(self) -> int:
"""Return the compute capability value stored on this device info."""
return self._compute_capability
def get_execution_provider(self) -> str:
"""Return the ONNXRuntime execution provider name of this device, e.g. 'CUDAExecutionProvider' or 'DmlExecutionProvider'."""
return self._execution_provider
def get_name(self) -> str:
"""Return the device name given at construction."""
return self._name
@ -51,90 +56,19 @@ class ORTDeviceInfo:
if self.is_cpu():
return f"CPU"
else:
return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb]"
ep = self.get_execution_provider()
if ep == 'CUDAExecutionProvider':
return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb] [CUDA]"
elif ep == 'DmlExecutionProvider':
return f"[{self._index}] {self._name} [{(self._total_memory / 1024**3) :.3}Gb] [DirectX12]"
def __repr__(self):
return f'{self.__class__.__name__} object: ' + self.__str__()
# class ORTDevicesInfo:
# """
# a list of ORTDeviceInfo
# """
# def __init__(self, devices : List[ORTDeviceInfo] = None):
# if devices is None:
# devices = []
# self._devices = devices
# def __getstate__(self):
# return self.__dict__.copy()
# def __setstate__(self, d):
# self.__init__()
# self.__dict__.update(d)
# def add(self, device_or_devices : ORTDeviceInfo):
# if isinstance(device_or_devices, ORTDeviceInfo):
# if device_or_devices not in self._devices:
# self._devices.append(device_or_devices)
# elif isinstance(device_or_devices, ORTDevicesInfo):
# for device in device_or_devices:
# self.add(device)
# def copy(self):
# return copy.deepcopy(self)
# def get_count(self): return len(self._devices)
# def get_highest_total_memory_device(self) -> ORTDeviceInfo:
# """
# returns ORTDeviceInfo with highest available memory, if devices support total_memory parameter
# """
# result = None
# idx_mem = 0
# for device in self._devices:
# mem = device.get_total_memory()
# if result is None or (mem is not None and mem > idx_mem):
# result = device
# idx_mem = mem
# return result
# def get_lowest_total_memory_device(self) -> ORTDeviceInfo:
# """
# returns ORTDeviceInfo with lowest available memory, if devices support total_memory parameter
# """
# result = None
# idx_mem = sys.maxsize
# for device in self._devices:
# mem = device.get_total_memory()
# if result is None or (mem is not None and mem < idx_mem):
# result = device
# idx_mem = mem
# return result
# def __len__(self):
# return len(self._devices)
# def __getitem__(self, key):
# result = self._devices[key]
# if isinstance(key, slice):
# return self.__class__(result)
# return result
# def __iter__(self):
# for device in self._devices:
# yield device
# def __str__(self): return f'{self.__class__.__name__}:[' + ', '.join([ device.__str__() for device in self._devices ]) + ']'
# def __repr__(self): return f'{self.__class__.__name__}:[' + ', '.join([ device.__repr__() for device in self._devices ]) + ']'
_ort_devices_info = None
def get_cpu_device() -> ORTDeviceInfo:
    """
    Return the ORTDeviceInfo that describes the CPU.

    The CPU is represented with index -1, the 'CPUExecutionProvider'
    execution provider, the name 'CPU' and zero total/free memory.
    """
    return ORTDeviceInfo(index=-1,
                         execution_provider='CPUExecutionProvider',
                         name='CPU',
                         total_memory=0,
                         free_memory=0)
def get_available_devices_info(include_cpu=True, cpu_only=False) -> List[ORTDeviceInfo]:
"""
@ -145,12 +79,13 @@ def get_available_devices_info(include_cpu=True, cpu_only=False) -> List[ORTDevi
_initialize_ort_devices()
devices = []
if not cpu_only:
for i in range ( int(os.environ['ORT_DEVICES_COUNT']) ):
devices.append ( ORTDeviceInfo(index=i,
name=os.environ[f'ORT_DEVICE_{i}_NAME'],
total_memory=int(os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM']),
free_memory=int(os.environ[f'ORT_DEVICE_{i}_FREE_MEM']),
compute_capability=int(os.environ[f'ORT_DEVICE_{i}_CC']) ))
for i in range ( int(os.environ.get('ORT_DEVICES_COUNT',0)) ):
devices.append ( ORTDeviceInfo(index=int(os.environ[f'ORT_DEVICE_{i}_INDEX']),
execution_provider=os.environ[f'ORT_DEVICE_{i}_EP'],
name=os.environ[f'ORT_DEVICE_{i}_NAME'],
total_memory=int(os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM']),
free_memory=int(os.environ[f'ORT_DEVICE_{i}_FREE_MEM']),
) )
if include_cpu or cpu_only:
devices.append(get_cpu_device())
_ort_devices_info = devices
@ -168,55 +103,85 @@ def _initialize_ort_devices():
if int(os.environ.get('ORT_DEVICES_INITIALIZED', 0)) == 0:
os.environ['ORT_DEVICES_INITIALIZED'] = '1'
os.environ['ORT_DEVICES_COUNT'] = '0'
os.environ['CUDA_CACHE_MAXSIZE'] = '2147483647'
try:
libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except:
continue
else:
break
else:
return
nGpus = ctypes.c_int()
name = b' ' * 200
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
freeMem = ctypes.c_size_t()
totalMem = ctypes.c_size_t()
device = ctypes.c_int()
context = ctypes.c_void_p()
devices = []
if cuda.cuInit(0) == 0 and \
cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
for i in range(nGpus.value):
if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \
cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \
cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0:
devices = []
prs = rt.get_available_providers()
if not lib_appargs.get_arg_bool('NO_CUDA') and 'CUDAExecutionProvider' in prs:
os.environ['CUDA_CACHE_MAXSIZE'] = '2147483647'
try:
libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll')
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except:
continue
else:
break
else:
return
if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0:
if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
cc = cc_major.value * 10 + cc_minor.value
devices.append ({'name' : name.split(b'\0', 1)[0].decode(),
'total_mem' : totalMem.value,
'free_mem' : freeMem.value,
'cc' : cc
nGpus = ctypes.c_int()
name = b' ' * 200
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
freeMem = ctypes.c_size_t()
totalMem = ctypes.c_size_t()
device = ctypes.c_int()
context = ctypes.c_void_p()
if cuda.cuInit(0) == 0 and \
cuda.cuDeviceGetCount(ctypes.byref(nGpus)) == 0:
for i in range(nGpus.value):
if cuda.cuDeviceGet(ctypes.byref(device), i) != 0 or \
cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) != 0 or \
cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) != 0:
continue
if cuda.cuCtxCreate_v2(ctypes.byref(context), 0, device) == 0:
if cuda.cuMemGetInfo_v2(ctypes.byref(freeMem), ctypes.byref(totalMem)) == 0:
cc = cc_major.value * 10 + cc_minor.value
devices.append ({'index' : i,
'execution_provider' : 'CUDAExecutionProvider',
'name' : name.split(b'\0', 1)[0].decode(),
'total_mem' : totalMem.value,
'free_mem' : freeMem.value,
})
cuda.cuCtxDetach(context)
except Exception as e:
print(f'CUDA devices initialization error: {e}')
if 'DmlExecutionProvider' in prs:
# onnxruntime-directml has no device enumeration API for users. Thus the code must follow the same logic
# as here https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/providers/dml/dml_provider_factory.cc
from xlib.api.win32 import dxgi as lib_dxgi
dxgi_factory = lib_dxgi.create_DXGIFactory4()
if dxgi_factory is not None:
for i in itertools.count():
adapter = dxgi_factory.enum_adapters1(i)
if adapter is not None:
desc = adapter.get_desc1()
if desc.Flags != lib_dxgi.DXGI_ADAPTER_FLAG.DXGI_ADAPTER_FLAG_SOFTWARE and \
not (desc.VendorId == 0x1414 and desc.DeviceId == 0x8c):
devices.append ({'index' : i,
'execution_provider' : 'DmlExecutionProvider',
'name' : desc.Description,
'total_mem' : desc.DedicatedVideoMemory,
'free_mem' : desc.DedicatedVideoMemory,
})
cuda.cuCtxDetach(context)
except Exception as e:
print(f'CUDA devices initialization error: {e}')
devices = []
adapter.Release()
else:
break
dxgi_factory.Release()
os.environ['ORT_DEVICES_COUNT'] = str(len(devices))
for i, device in enumerate(devices):
os.environ[f'ORT_DEVICE_{i}_INDEX'] = str(device['index'])
os.environ[f'ORT_DEVICE_{i}_EP'] = device['execution_provider']
os.environ[f'ORT_DEVICE_{i}_NAME'] = device['name']
os.environ[f'ORT_DEVICE_{i}_TOTAL_MEM'] = str(device['total_mem'])
os.environ[f'ORT_DEVICE_{i}_FREE_MEM'] = str(device['free_mem'])
os.environ[f'ORT_DEVICE_{i}_CC'] = str(device['cc'])
_initialize_ort_devices()