I wrote a YOLO implementation in TensorFlow, but my loss never updates during training. Can someone check my code?

My full code (GitHub link):

My optimizer update code:
if ni <= nw:
    xi = [0, nw]
    # Linear warmup: interpolate nbs / total_batch_size at step ni,
    # round it, and keep it at least 1 (the gradient-accumulation count)
    accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
    # Adjust lr and momentum via _set_hyper during warmup
    # optimizer._set_hyper("learning_rate", np.interp(ni, xi, [0.0, hyp['lr0'] * lf(epoch)]))
    optimizer._set_hyper("momentum", np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]))

# Multi-scale training
if opt.multi_scale:
    sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs
    sf = sz / max(imgs.shape[2:])  # scale factor
    if sf != 1:
        ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
        imgs = tf.image.resize(imgs, ns, tf.image.ResizeMethod.BILINEAR, False)

# Forward
with tf.GradientTape() as gt:
    pred = model(imgs)
    loss, loss_items = compute_loss(pred, targets, model)
    print(loss_items, end='')
    print(loss)
    if rank != -1:
        # Gradients are averaged between devices in DDP mode
        loss *= opt.world_size

grads = gt.gradient(loss, model.trainable_variables)
# Apply the gradients, skipping variables that received no gradient
optimizer.apply_gradients((grad, var) for grad, var in zip(grads, model.trainable_variables) if grad is not None)
# optimizer.apply_gradients(zip(grads, model.trainable_variables))
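For reference, the update pattern I am trying to follow is the standard GradientTape loop. Here is a minimal standalone sketch of it with a toy model and made-up data (none of these names come from my repo), and this version does update its loss when I run it:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
x = tf.random.normal((16, 4))
y = tf.random.normal((16, 1))

for step in range(3):
    with tf.GradientTape() as tape:
        pred = model(x, training=True)  # training=True matters for BatchNorm/Dropout layers
        loss = tf.reduce_mean(tf.square(pred - y))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    print(step, float(loss))  # the printed loss should decrease step by step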
My training code:
for epoch in range(start_epoch, epochs):
    # Update image weights
    if opt.image_weights:
        # Generate indices
        if rank in [-1, 0]:
            cw = model.class_weights * (1 - maps) ** 2 / nc  # class weights
            iw = model.class_weights * (1 - maps) ** 2 / nc  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)
        # In DDP (distributed training) mode the indices would need to be broadcast
        # if rank != -1:
        #     indices = np.array(dataset.indices) if rank == 0 else np.zeros(dataset.n, dtype=np.int8)

    mloss = np.zeros(5)
    logger.info(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'landmark', 'total', 'targets', 'img_size'))
    pbar = range(nb)  # dataset length in batches
    if rank in [-1, 0]:
        pbar = tqdm(pbar, total=nb)
    for i in pbar:
        # Number of integrated batches since the start of training
        ni = i + nb * epoch

        # Read one batch of data by index
        (imgs, targets, paths) = [], [], []
        batch_index = 0
        for imgi in range(i * batch_size, i * batch_size + batch_size):
            if imgi < dl:  # indices start at 0
                img, target, path = dataset.__getitem__(imgi)
                # Changing the image shape inside __getitem__ is awkward, so do it here
                if opt.format == 'NHWC':
                    img = tf.transpose(img, perm=[1, 2, 0]).numpy()
                imgs.append(img)
                targets.append(target)
                paths.append(path)
        (imgs, targets, paths) = dataset.collate_fn(imgs, targets, paths)
        imgs = np.array(imgs, dtype=np.float32) / 255.0  # uint8 0-255 -> float32 0.0-1.0

        # Warmup
        if ni <= nw:
            xi = [0, nw]
            # Linear warmup: interpolate nbs / total_batch_size at step ni,
            # round it, and keep it at least 1 (the gradient-accumulation count)
            accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
            # Adjust lr and momentum via _set_hyper during warmup
            # optimizer._set_hyper("learning_rate", np.interp(ni, xi, [0.0, hyp['lr0'] * lf(epoch)]))
            optimizer._set_hyper("momentum", np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]))

        # Multi-scale training
        if opt.multi_scale:
            sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs
            sf = sz / max(imgs.shape[2:])  # scale factor
            if sf != 1:
                ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                imgs = tf.image.resize(imgs, ns, tf.image.ResizeMethod.BILINEAR, False)

        # Forward
        with tf.GradientTape() as gt:
            pred = model(imgs)
            loss, loss_items = compute_loss(pred, targets, model)
            print(loss_items, end='')
            print(loss)
            if rank != -1:
                # Gradients are averaged between devices in DDP mode
                loss *= opt.world_size

        grads = gt.gradient(loss, model.trainable_variables)
        # Apply the gradients, skipping variables that received no gradient
        optimizer.apply_gradients((grad, var) for grad, var in zip(grads, model.trainable_variables) if grad is not None)
        # optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # TensorFlow mixed-precision training; to be worked out later
        # scaler.scale(loss).backward()
        # Optimize
        # if ni % accumulate == 0:
        #     scaler.step(optimizer)
        #     scaler.update()
        #     optimizer.zero_grad()
        #     if ema:
        #         ema.update(model)

        # Print
        if rank in [-1, 0]:
            mloss = (mloss * i + loss_items) / (i + 1)  # update the running mean loss
            mem = '%.3G' % (pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1E9 if useGpu else 0)
            s = ('%10s' * 2 + '%10.4g' * 7) % (
                '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0],
                imgs.shape[-1] if opt.format == 'NCHW' else imgs.shape[1])
            pbar.set_description(s)

            # Plot
            if plots and ni < 3:
                f = save_dir / f'train_batch{ni}.jpg'  # filename
                f = ''
                Thread(target=plot_images, args=(imgs, targets, paths, f, opt.format), daemon=True).start()
                # if tb_writer:
                #     tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                #     tb_writer.add_graph(model, imgs)  # add model to tensorboard
            elif plots and ni == 3 and wandb:
                wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name)
                                       for x in save_dir.glob('train*.jpg')]})

        # Batch done; free imgs, paths and targets
        del imgs, targets, paths

    # After each epoch, call gc for a deep clean
    gc.collect()
    model.save("mask_detector")
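To narrow down why the loss never moves, one check I can add right after the gradient step (a sketch; grads, model, and optimizer are the objects from the loop above, everything else is a placeholder) is whether any gradient comes back as None and whether the weights actually change:

# List variables that received no gradient from the tape
none_grads = [v.name for g, v in zip(grads, model.trainable_variables) if g is None]
print('variables with no gradient:', none_grads)

# Snapshot weights, apply gradients, and see if anything moved
before = [v.numpy().copy() for v in model.trainable_variables]
optimizer.apply_gradients((g, v) for g, v in zip(grads, model.trainable_variables) if g is not None)
changed = any(not np.allclose(b, v.numpy()) for b, v in zip(before, model.trainable_variables))
print('weights changed after apply_gradients:', changed)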