Speeding up model with feedback to the encoder

I have the following model, an autoencoder with feedback to the input layers of both the encoder and the decoder, which is very slow at the moment due to the for loop. But it seems too slow, even for that case. Is it possible to speed up the inference/training?
The model is:

class FRAE(tf.keras.Model):
    """Feedback Recurrent AutoEncoder.

    An autoencoder whose encoder and decoder inputs are each concatenated
    with a buffer of the last `ht` decoder outputs (feedback). The feedback
    makes per-timestep processing inherently sequential.

    Args:
        latent_dim: Width of the latent bottleneck layer.
        shape: Per-timestep feature shape; `shape[0]` is the feature width
            used to size the feedback buffer, `shape[-1]` the output width.
        ht: Number of past decoder outputs kept in the feedback buffer.
        n1, n2, n3, n4: Hidden-layer widths.
        bypass: If True, `call` returns its input unchanged.
        trainable: Passed through to the Keras `trainable` flag.
    """

    def __init__(self, latent_dim, shape, ht, n1, n2, n3, n4,
                 bypass=False, trainable=True, **kwargs):
        super(FRAE, self).__init__(**kwargs)
        self.latent_dim = latent_dim
        self.shape = shape
        self.ht = ht
        # Feedback buffer: last `ht` outputs flattened to (1, shape[0]*ht).
        self.buffer = tf.Variable(
            initial_value=tf.zeros(shape=(1, shape[0] * self.ht), dtype=tf.float32))
        self.bypass = bypass
        self.quantizer = None
        self.trainable = trainable

        # Encoder layers.
        # NOTE(review): n2 is accepted but never used — both hidden encoder
        # layers use n1. Kept for interface compatibility; confirm intent.
        self.l1 = tf.keras.layers.Dense(n1, activation='swish', input_shape=shape)
        self.l2 = tf.keras.layers.Dense(n1, activation='swish')
        self.ls = tf.keras.layers.Dense(latent_dim, activation='swish')

        # Decoder layers.
        self.l3 = tf.keras.layers.Dense(n3, activation='swish')
        self.l4 = tf.keras.layers.Dense(n4, activation='swish')
        self.l5 = tf.keras.layers.Dense(shape[-1], activation='linear')

    def get_config(self):
        """Return a serializable configuration dict.

        Fix: the original referenced `self.encoder` and `self.decoder`,
        attributes that are never defined on this class, so calling
        get_config raised AttributeError. It also serialized `self.buffer`,
        which is runtime state (a tf.Variable), not configuration; both
        are dropped here.
        """
        config = super(FRAE, self).get_config().copy()
        config.update({
            'latent_dim': self.latent_dim,
            'bypass': self.bypass,
            'quantizer': self.quantizer,
            'ht': self.ht,
            'shape': self.shape,
            'name': self.name,
        })
        return config

    def update_buffer(self, new_element):
        """Shift `new_element` (width shape[0]) into the front of the buffer,
        dropping the oldest entry from the back."""
        n = self.shape[0]
        new_element_expanded = tf.expand_dims(new_element, axis=0)
        self.buffer.assign(
            tf.concat([new_element_expanded, self.buffer[:, :-n]], axis=1))

    def resetBuffer(self):
        """Zero the feedback buffer (e.g. between independent sequences)."""
        self.buffer[:, :].assign(
            tf.zeros(shape=(1, self.shape[0] * self.ht), dtype=tf.float32))

    @tf.function
    def call(self, x):
        """Run the autoencoder over a sequence x of shape (1, T, features).

        Performance fix: the feedback buffer is carried as a plain
        loop-local tensor and written back to the variable ONCE after the
        loop, instead of `self.buffer.assign(...)` on every step. Per-step
        variable assignments are stateful graph ops that serialize execution
        and dominate the overhead on top of the (unavoidable) sequential
        feedback dependency. The dead `i += 1` inside the `tf.range` loop
        (a no-op — AutoGraph reassigns `i` each iteration) is removed.
        """
        if self.bypass is True:
            print("Bypassing FRAE", flush=True)
            return x

        x = tf.squeeze(x, axis=0)
        n = self.shape[0]
        steps = tf.shape(x)[0]

        decoded = tf.TensorArray(tf.float32, size=steps)
        buf = tf.convert_to_tensor(self.buffer)  # loop-carried feedback state
        for i in tf.range(steps):
            xexpand = tf.expand_dims(x[i], axis=0)
            xin = tf.concat((xexpand, buf), axis=1)
            encoded = self.ls(self.l2(self.l1(xin)))
            decin = tf.concat([encoded, buf], axis=1)
            y = self.l5(self.l4(self.l3(decin)))
            decoded = decoded.write(i, y)
            # Same shift as update_buffer, but on the local tensor:
            # y (1, shape[-1]) is prepended, oldest n columns dropped.
            buf = tf.concat([y, buf[:, :-n]], axis=1)

        self.buffer.assign(buf)  # persist final feedback state once
        return tf.transpose(decoded.stack(), [1, 0, 2])

Hi @Cola_Lightyear.
Did you notice an improvement after adding @tf.function to the call method of your subclassed model?
Would you like to decorate your training function, and maybe also your loss function, that way instead?

I did not see a great improvement using @tf.function — maybe a slight one. I need a speedup of at least two orders of magnitude. Do you have an idea? Why would @tf.function on my loss and training functions help? By the way, do you mean like this:

@tf.function def MyTrainingFunction(): model =        setupmodel() model.fit(data,epochs=whatever) 

?

I updated my current code in the OP btw

I really do not understand why this is so slow. It is similar to RNN code I have seen; here it is just an autoencoder, and the feedback occurs at two layers — that is all.