My TensorFlow model's loss is not updating, can someone help me?

I wrote a YOLO implementation in TensorFlow, but my loss is not updating. Can someone check it for me?

Link to my full code on GitHub:

My optimizer update code:
    if ni <= nw:
        xi = [0, nw]
        # ? Strange, why do we need this?
        # This roughly interpolates nbs / total_batch_size linearly at step ni;
        # it must be at least 1, and is then rounded.
        # Accumulate? Accumulate what?
        accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
        # What is this doing?
        # for j, x in enumerate(opt):
        # adjust lr and momentum via _set_hyper
        # optimizer._set_hyper("learning_rate", np.interp(ni, xi, [0.0, hyp['lr0'] * lf(epoch)]))
        optimizer._set_hyper("momentum", np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]))

    # multi-scale training?
    if opt.multi_scale:
        sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs
        sf = sz / max(imgs.shape[2:])  # scale factor
        if sf != 1:
            ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
            imgs = tf.image.resize(imgs, ns, tf.image.ResizeMethod.BILINEAR, False)

    # Forward
    with tf.GradientTape() as gt:
        pred = model(imgs)
        loss, loss_items = compute_loss(pred, targets, model)
        print(loss_items, end='')
        print(loss)
        if rank != -1:
            # gradient averaged between devices in DDP mode
            loss *= opt.world_size

    grads = gt.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for (grad, var) in zip(grads, model.trainable_variables) if grad is not None)  # the optimizer applies the gradients
    # optimizer.apply_gradients(zip(grads, model.trainable_variables))  # don't use trainable_variables
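For comparison, here is a minimal, self-contained training step in the same GradientTape/apply_gradients style that does drive its loss down. Everything in it (the toy model, the fake data, the MSE loss) is illustrative and not taken from my project; if a stripped-down loop like this updates while the full one does not, the problem is more likely in compute_loss or in how imgs/targets reach the tape than in the optimizer calls themselves:

    # Minimal sanity check, NOT the project code: a toy model and random data
    # trained with the same tape/optimizer pattern. The printed loss should fall.
    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # toy stand-in for the YOLO model
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)

    x = np.random.rand(64, 3).astype(np.float32)         # fake batch
    y = x.sum(axis=1, keepdims=True).astype(np.float32)  # fake targets

    for step in range(100):
        with tf.GradientTape() as gt:
            pred = model(x, training=True)
            loss = tf.reduce_mean(tf.square(pred - y))   # stand-in for compute_loss
        grads = gt.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        if step % 20 == 0:
            print(step, float(loss))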

My training code:

    for epoch in range(start_epoch, epochs):
        # update image weights
        if opt.image_weights:
            # generate indices
            if rank in [-1, 0]:
                cw = model.class_weights * (1 - maps) ** 2 / nc  # class weights
                iw = model.class_weights * (1 - maps) ** 2 / nc  # image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)
            # in DDP mode (DDP is distributed training?) the indices need to be broadcast
            # if rank != -1:
            #     indices = np.array(dataset.indices) if rank == 0 else np.zeros(dataset.n, dtype=np.int8)

        mloss = np.zeros(5)
        logger.info(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'landmark', 'total', 'targets', 'img_size'))
        # dataset length
        pbar = range(nb)
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)

        for i in pbar:
            # number of integrated batches (since training start);
            # roughly, how much data this run has consumed so far
            ni = i + nb * epoch

            # read the data this batch needs, by batch index
            (imgs, targets, paths) = [], [], []
            batch_index = 0
            for imgi in range(i * batch_size, i * batch_size + batch_size):
                # indices start at 0
                if imgi < dl:
                    img, target, path = dataset.__getitem__(imgi)
                    # changing the image shape inside __getitem__ is awkward, so do it out here
                    if opt.format == 'NHWC':
                        img = tf.transpose(img, perm=[1, 2, 0]).numpy()
                    imgs.append(img)
                    targets.append(target)
                    paths.append(path)
            (imgs, targets, paths) = dataset.collate_fn(imgs, targets, paths)
            imgs = np.array(imgs, dtype=np.float32) / 255.0  # convert images from uint8 0-255 to float32 0.0-1.0
            # the images are preprocessed at this point

            if ni <= nw:
                xi = [0, nw]
                # linearly interpolate nbs / total_batch_size at step ni; at least 1, then rounded
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                # adjust lr and momentum via _set_hyper
                # optimizer._set_hyper("learning_rate", np.interp(ni, xi, [0.0, hyp['lr0'] * lf(epoch)]))
                optimizer._set_hyper("momentum", np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]))

            # multi-scale training?
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = tf.image.resize(imgs, ns, tf.image.ResizeMethod.BILINEAR, False)

            # Forward
            with tf.GradientTape() as gt:
                pred = model(imgs)
                loss, loss_items = compute_loss(pred, targets, model)
                print(loss_items, end='')
                print(loss)
                if rank != -1:
                    # gradient averaged between devices in DDP mode
                    loss *= opt.world_size

            grads = gt.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients((grad, var) for (grad, var) in zip(grads, model.trainable_variables) if grad is not None)  # the optimizer applies the gradients
            # optimizer.apply_gradients(zip(grads, model.trainable_variables))  # don't use trainable_variables

            # ? what is this
            # TensorFlow mixed-precision training... will look into it later, not now
            # scaler.scale(loss).backward()

            # Optimize
            # if ni % accumulate == 0:
            #     scaler.step(optimizer)
            #     scaler.update()
            #     optimizer.zero_grad()
            #     if ema:
            #         ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update the running mean loss
                mem = '%.3G' % (pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1E9 if useGpu else 0)
                s = ('%10s' * 2 + '%10.4g' * 7) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0],
                    imgs.shape[-1] if opt.format == 'NCHW' else imgs.shape[1])
                pbar.set_description(s)
                # Plot
                if plots and ni < 3:
                    f = save_dir / f'train_batch{ni}.jpg'  # filename
                    f = ''
                    Thread(target=plot_images, args=(imgs, targets, paths, f, opt.format), daemon=True).start()
                    # if tb_writer:
                    #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                    #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
                elif plots and ni == 3 and wandb:
                    wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]})

            # one batch done: clean up imgs, paths and targets
            del imgs, targets, paths
            # after an epoch, remember to call gc for a deep clean
            gc.collect()

        model.save("mask_detector")
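For debugging, here is a quick check of whether any gradients are flowing at all (a sketch using the variable names from the loop above, not code from my repository). tf.GradientTape.gradient returns None for every variable the loss is not connected to, for example if compute_loss drops out of the TensorFlow graph into NumPy somewhere; since apply_gradients above is fed a generator that skips None gradients, a fully disconnected graph turns the update into a silent no-op, which would look exactly like a loss that never changes:

    # right after grads = gt.gradient(loss, model.trainable_variables):
    # count how many variables actually received a gradient; if this prints
    # 0/N, apply_gradients silently does nothing and the loss cannot update
    n_ok = sum(g is not None for g in grads)
    print(f'{n_ok}/{len(grads)} variables received a gradient')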