MultiGPU training:

Speed is significantly increased.
Fixed CUDNN_STREAM errors.

Trainer: added key 'b', which creates a backup even if autobackup is disabled.
This commit is contained in:
Colombo 2020-01-29 10:55:51 +04:00
parent a6d72b620d
commit 0251eb3490
4 changed files with 50 additions and 33 deletions

View file

@ -68,6 +68,10 @@ def trainerThread (s2c, c2s, e,
model.save()
shared_state['after_save'] = True
def model_backup():
if not debug and not is_reached_goal:
model.create_backup()
def send_preview():
if not debug:
previews = model.get_previews()
@ -172,6 +176,8 @@ def trainerThread (s2c, c2s, e,
op = input['op']
if op == 'save':
model_save()
elif op == 'backup':
model_backup()
elif op == 'preview':
if is_reached_goal:
model.pass_one_iter()
@ -277,7 +283,7 @@ def main(**kwargs):
# HEAD
head_lines = [
'[s]:save [enter]:exit',
'[s]:save [b]:backup [enter]:exit',
'[p]:update [space]:next preview [l]:change history range',
'Preview: "%s" [%d/%d]' % (selected_preview_name,selected_preview+1, len(previews) )
]
@ -314,6 +320,8 @@ def main(**kwargs):
s2c.put ( {'op': 'close'} )
elif key == ord('s'):
s2c.put ( {'op': 'save'} )
elif key == ord('b'):
s2c.put ( {'op': 'backup'} )
elif key == ord('p'):
if not is_waiting_preview:
is_waiting_preview = True

View file

@ -345,8 +345,7 @@ class ModelBase(object):
return self.onGetPreview (self.sample_for_preview)[0][1] #first preview, and bgr
def save(self):
summary_path = self.get_strpath_storage_for_file('summary.txt')
Path( summary_path ).write_text( self.get_summary_text() )
Path( self.get_summary_path() ).write_text( self.get_summary_text() )
self.onSave()
@ -360,42 +359,49 @@ class ModelBase(object):
pathex.write_bytes_safe (self.model_data_path, pickle.dumps(model_data) )
if self.autobackup:
bckp_filename_list = [ self.get_strpath_storage_for_file(filename) for _, filename in self.get_model_filename_list() ]
bckp_filename_list += [ str(summary_path), str(self.model_data_path) ]
current_hour = time.localtime().tm_hour
if self.autobackup_current_hour != current_hour:
self.autobackup_current_hour = current_hour
self.create_backup()
for i in range(15,0,-1):
idx_str = '%.2d' % i
next_idx_str = '%.2d' % (i+1)
def create_backup(self):
io.log_info ("Creating backup...", end='\r')
idx_backup_path = self.autobackups_path / idx_str
next_idx_packup_path = self.autobackups_path / next_idx_str
if not self.autobackups_path.exists():
self.autobackups_path.mkdir(exist_ok=True)
if idx_backup_path.exists():
if i == 15:
pathex.delete_all_files(idx_backup_path)
else:
next_idx_packup_path.mkdir(exist_ok=True)
pathex.move_all_files (idx_backup_path, next_idx_packup_path)
bckp_filename_list = [ self.get_strpath_storage_for_file(filename) for _, filename in self.get_model_filename_list() ]
bckp_filename_list += [ str(self.get_summary_path()), str(self.model_data_path) ]
if i == 1:
idx_backup_path.mkdir(exist_ok=True)
for filename in bckp_filename_list:
shutil.copy ( str(filename), str(idx_backup_path / Path(filename).name) )
for i in range(15,0,-1):
idx_str = '%.2d' % i
next_idx_str = '%.2d' % (i+1)
previews = self.get_previews()
plist = []
for i in range(len(previews)):
name, bgr = previews[i]
plist += [ (bgr, idx_backup_path / ( ('preview_%s.jpg') % (name)) ) ]
idx_backup_path = self.autobackups_path / idx_str
next_idx_packup_path = self.autobackups_path / next_idx_str
for preview, filepath in plist:
preview_lh = ModelBase.get_loss_history_preview(self.loss_history, self.iter, preview.shape[1], preview.shape[2])
img = (np.concatenate ( [preview_lh, preview], axis=0 ) * 255).astype(np.uint8)
cv2_imwrite (filepath, img )
if idx_backup_path.exists():
if i == 15:
pathex.delete_all_files(idx_backup_path)
else:
next_idx_packup_path.mkdir(exist_ok=True)
pathex.move_all_files (idx_backup_path, next_idx_packup_path)
if i == 1:
idx_backup_path.mkdir(exist_ok=True)
for filename in bckp_filename_list:
shutil.copy ( str(filename), str(idx_backup_path / Path(filename).name) )
previews = self.get_previews()
plist = []
for i in range(len(previews)):
name, bgr = previews[i]
plist += [ (bgr, idx_backup_path / ( ('preview_%s.jpg') % (name)) ) ]
for preview, filepath in plist:
preview_lh = ModelBase.get_loss_history_preview(self.loss_history, self.iter, preview.shape[1], preview.shape[2])
img = (np.concatenate ( [preview_lh, preview], axis=0 ) * 255).astype(np.uint8)
cv2_imwrite (filepath, img )
def debug_one_iter(self):
images = []
@ -479,6 +485,9 @@ class ModelBase(object):
def get_strpath_storage_for_file(self, filename):
return str( self.saved_models_path / ( self.get_model_name() + '_' + filename) )
def get_summary_path(self):
return self.get_strpath_storage_for_file('summary.txt')
def get_summary_text(self):
###Generate text summary of model hyperparameters
#Find the longest key name and value string. Used as column widths.

View file

@ -163,7 +163,7 @@ class QModel(ModelBase):
masked_training = True
models_opt_on_gpu = len(devices) == 1 and devices[0].total_mem_gb >= 4
models_opt_on_gpu = len(devices) >= 1 and all([dev.total_mem_gb >= 2 for dev in devices])
models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
optimizer_vars_on_cpu = models_opt_device=='/CPU:0'

View file

@ -349,7 +349,7 @@ class SAEHDModel(ModelBase):
masked_training = True
models_opt_on_gpu = False if len(devices) != 1 else self.options['models_opt_on_gpu']
models_opt_on_gpu = False if len(devices) == 0 else self.options['models_opt_on_gpu']
models_opt_device = '/GPU:0' if models_opt_on_gpu and self.is_training else '/CPU:0'
optimizer_vars_on_cpu = models_opt_device=='/CPU:0'