MultiGPU training:

Speed is significantly increased. Fixed CUDNN_STREAM errors. Trainer: added key 'b', which creates a backup even if autobackup is disabled.
2025-07-06 04:52:13 -07:00 · 2020-01-29 10:55:51 +04:00 · 2020-01-29 10:55:51 +04:00 · 0251eb3490
commit 0251eb3490
parent a6d72b620d
4 changed files with 50 additions and 33 deletions
--- a/models/Model_Quick96/Model.py
+++ b/models/Model_Quick96/Model.py
@@ -163,7 +163,7 @@ class QModel(ModelBase):

        masked_training = True

-        models_opt_on_gpu = len(devices) == 1 and devices[0].total_mem_gb >= 4
+        models_opt_on_gpu = len(devices) >= 1 and all([dev.total_mem_gb >= 2 for dev in devices])
        models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
        optimizer_vars_on_cpu = models_opt_device=='/CPU:0'