diff --git a/core/leras/layers/Saveable.py b/core/leras/layers/Saveable.py
index 13eff3b..0cc0e94 100644
--- a/core/leras/layers/Saveable.py
+++ b/core/leras/layers/Saveable.py
@@ -76,25 +76,28 @@ class Saveable():
         if self.name is None:
             raise Exception("name must be defined.")
 
-        tuples = []
-        for w in weights:
-            w_name_split = w.name.split('/')
-            if self.name != w_name_split[0]:
-                raise Exception("weight first name != Saveable.name")
+        try:
+            tuples = []
+            for w in weights:
+                w_name_split = w.name.split('/')
+                if self.name != w_name_split[0]:
+                    raise Exception("weight first name != Saveable.name")
 
-            sub_w_name = "/".join(w_name_split[1:])
+                sub_w_name = "/".join(w_name_split[1:])
 
-            w_val = d.get(sub_w_name, None)
+                w_val = d.get(sub_w_name, None)
 
-            if w_val is None:
-                #io.log_err(f"Weight {w.name} was not loaded from file {filename}")
-                tuples.append ( (w, w.initializer) )
-            else:
-                w_val = np.reshape( w_val, w.shape.as_list() )
-                tuples.append ( (w, w_val) )
-
-        nn.batch_set_value(tuples)
+                if w_val is None:
+                    #io.log_err(f"Weight {w.name} was not loaded from file {filename}")
+                    tuples.append ( (w, w.initializer) )
+                else:
+                    w_val = np.reshape( w_val, w.shape.as_list() )
+                    tuples.append ( (w, w_val) )
+            nn.batch_set_value(tuples)
+        except:
+            return False
 
+        return True
 
     def init_weights(self):
diff --git a/core/leras/optimizers/AdaBelief.py b/core/leras/optimizers/AdaBelief.py
new file mode 100644
index 0000000..aed4308
--- /dev/null
+++ b/core/leras/optimizers/AdaBelief.py
@@ -0,0 +1,80 @@
+from tensorflow.python.ops import control_flow_ops, state_ops
+from core.leras import nn
+tf = nn.tf
+
+class AdaBelief(nn.OptimizerBase):
+    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, lr_dropout=1.0, lr_cos=0, epsilon=1e-7, clipnorm=0.0, name=None, **kwargs):
+        super().__init__(name=name)
+
+        if name is None:
+            raise ValueError('name must be defined.')
+
+        self.lr = lr
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.lr_dropout = lr_dropout
+        self.lr_cos = lr_cos
+        self.clipnorm = clipnorm
+        self.epsilon = epsilon
+
+        with tf.device('/CPU:0') :
+            with tf.variable_scope(self.name):
+                self.iterations = tf.Variable(0, dtype=tf.int64, name='iters')
+
+        self.ms_dict = {}
+        self.vs_dict = {}
+        self.lr_rnds_dict = {}
+
+    def get_weights(self):
+        return [self.iterations] + list(self.ms_dict.values()) + list(self.vs_dict.values())
+
+    def initialize_variables(self, trainable_weights, vars_on_cpu=True, lr_dropout_on_cpu=False):
+        # Initialize here all trainable variables used in training
+        e = tf.device('/CPU:0') if vars_on_cpu else None
+        if e: e.__enter__()
+        with tf.variable_scope(self.name):
+            ms = { v.name : tf.get_variable ( f'ms_{v.name}'.replace(':','_'), v.shape, dtype=v.dtype, initializer=tf.initializers.constant(0.0), trainable=False) for v in trainable_weights }
+            vs = { v.name : tf.get_variable ( f'vs_{v.name}'.replace(':','_'), v.shape, dtype=v.dtype, initializer=tf.initializers.constant(0.0), trainable=False) for v in trainable_weights }
+            self.ms_dict.update (ms)
+            self.vs_dict.update (vs)
+
+            if self.lr_dropout != 1.0:
+                e = tf.device('/CPU:0') if lr_dropout_on_cpu else None
+                if e: e.__enter__()
+                lr_rnds = [ nn.random_binomial( v.shape, p=self.lr_dropout, dtype=v.dtype) for v in trainable_weights ]
+                if e: e.__exit__(None, None, None)
+                self.lr_rnds_dict.update ( { v.name : rnd for v,rnd in zip(trainable_weights,lr_rnds) } )
+        if e: e.__exit__(None, None, None)
+
+    def get_update_op(self, grads_vars):
+        updates = []
+
+        if self.clipnorm > 0.0:
+            norm = tf.sqrt( sum([tf.reduce_sum(tf.square(g)) for g,v in grads_vars]))
+        updates += [ state_ops.assign_add( self.iterations, 1) ]
+        for i, (g,v) in enumerate(grads_vars):
+            if self.clipnorm > 0.0:
+                g = self.tf_clip_norm(g, self.clipnorm, norm)
+
+            ms = self.ms_dict[ v.name ]
+            vs = self.vs_dict[ v.name ]
+
+            m_t = self.beta_1*ms + (1.0-self.beta_1) * g
+            v_t = self.beta_2*vs + (1.0-self.beta_2) * tf.square(g-m_t)
+
+            lr = tf.constant(self.lr, g.dtype)
+            if self.lr_cos != 0:
+                lr *= (tf.cos( tf.cast(self.iterations, g.dtype) * (2*3.1415926535/ float(self.lr_cos) ) ) + 1.0) / 2.0
+
+            v_diff = - lr * m_t / (tf.sqrt(v_t) + self.epsilon)
+            if self.lr_dropout != 1.0:
+                lr_rnd = self.lr_rnds_dict[v.name]
+                v_diff *= lr_rnd
+            new_v = v + v_diff
+
+            updates.append (state_ops.assign(ms, m_t))
+            updates.append (state_ops.assign(vs, v_t))
+            updates.append (state_ops.assign(v, new_v))
+
+        return control_flow_ops.group ( *updates, name=self.name+'_updates')
+nn.AdaBelief = AdaBelief
\ No newline at end of file
diff --git a/core/leras/optimizers/RMSprop.py b/core/leras/optimizers/RMSprop.py
index 668fea3..345b2a7 100644
--- a/core/leras/optimizers/RMSprop.py
+++ b/core/leras/optimizers/RMSprop.py
@@ -3,7 +3,7 @@ from core.leras import nn
 tf = nn.tf
 
 class RMSprop(nn.OptimizerBase):
-    def __init__(self, lr=0.001, rho=0.9, lr_dropout=1.0, epsilon=1e-7, clipnorm=0.0, name=None):
+    def __init__(self, lr=0.001, rho=0.9, lr_dropout=1.0, epsilon=1e-7, clipnorm=0.0, name=None, **kwargs):
         super().__init__(name=name)
 
         if name is None:
diff --git a/core/leras/optimizers/__init__.py b/core/leras/optimizers/__init__.py
index aec36af..4f8a7e4 100644
--- a/core/leras/optimizers/__init__.py
+++ b/core/leras/optimizers/__init__.py
@@ -1,2 +1,3 @@
 from .OptimizerBase import *
-from .RMSprop import *
\ No newline at end of file
+from .RMSprop import *
+from .AdaBelief import *
\ No newline at end of file
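
For reference, the per-variable step built by AdaBelief.get_update_op in the new file boils down to the arithmetic below. This is a minimal NumPy sketch, not part of the patch: the variable and gradient values are made up, and the clipnorm, lr_cos and lr_dropout branches are omitted. The only change relative to a plain Adam step is that the second-moment EMA tracks the squared deviation (g - m_t)**2 rather than the squared gradient g**2.

import numpy as np

lr, beta_1, beta_2, epsilon = 0.001, 0.9, 0.999, 1e-7   # defaults from AdaBelief.__init__

v  = np.array([0.5, -0.3])   # a trainable variable (made-up values)
ms = np.zeros_like(v)        # EMA of the gradient, as in self.ms_dict
vs = np.zeros_like(v)        # EMA of (g - m_t)^2, as in self.vs_dict
g  = np.array([0.2, -0.1])   # gradient for this step (made-up values)

m_t = beta_1 * ms + (1.0 - beta_1) * g
v_t = beta_2 * vs + (1.0 - beta_2) * np.square(g - m_t)  # AdaBelief: deviation, not g**2

v_diff = -lr * m_t / (np.sqrt(v_t) + epsilon)
v = v + v_diff               # mirrors state_ops.assign(v, new_v)
ms, vs = m_t, v_t            # mirrors the assign(ms, m_t) / assign(vs, v_t) updates

When lr_cos is non-zero, the learning rate is additionally scaled by a cosine of the iteration counter, and when lr_dropout < 1.0 the per-weight binomial masks created in initialize_variables are multiplied into v_diff before the assignment, as shown in the branches of get_update_op above.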