Hello, I can train on a single GPU, but multi-GPU training errors out. Have you tried multi-GPU training? #9
Comments
Single-machine multi-GPU training has been tested on my end without problems: two RTX 2070 cards can train. From your error report, my guess is that GPU memory might be running out. Do you have a more detailed log?
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:60: operator(): block: [41,0,0], thread: [0,0,0] Assertion failed
@wuzhihao7788
The log looks like an out-of-bounds indexing problem. Check whether your dataset was built correctly; you can test on a single GPU first to see whether it still errors.
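Since the device-side assert points at an indexing kernel, one quick check is to scan the label files for out-of-range class indices or coordinates. A minimal sketch, assuming YOLO-format .txt labels with one `class cx cy w h` line per object; the helper name and the label format are assumptions, not this repo's actual API:

```python
from pathlib import Path

def check_labels(label_dir: str, num_classes: int) -> list:
    """Return (filename, line number, line) for every label line whose
    class index is outside [0, num_classes) or whose normalized box
    coordinates fall outside [0, 1]. Assumes YOLO txt format (assumption)."""
    bad = []
    for txt in Path(label_dir).glob("*.txt"):
        for lineno, line in enumerate(txt.read_text().splitlines(), 1):
            parts = line.split()
            if not parts:
                continue  # skip blank lines
            cls = int(float(parts[0]))
            coords = [float(x) for x in parts[1:5]]
            if not (0 <= cls < num_classes) or any(c < 0 or c > 1 for c in coords):
                bad.append((txt.name, lineno, line))
    return bad
```

Any hit from a check like this would explain an index-out-of-bounds assert during loss computation.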
@wuzhihao7788
Yes.
Since this error isn't very obvious, I'll find time to rerun the code and check whether it is consistent with the code on GitHub (my own experiments ran without problems). I'll try to give you an answer tomorrow.
@wuzhihao7788 Could you send me a copy of your COCO label files (train.txt, val.txt, and label.names)? I'd like to verify first whether my data preparation has any problems. My email is [email protected]
@wuzhihao7788
@wuzhihao7788 The problem is likely that during multi-GPU training the images in a batch are scattered across the cards, but the loss is computed against the labels of the entire batch size.
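A minimal sketch of the kind of fix this diagnosis calls for. In YOLO-style pipelines the targets tensor usually carries each box's image index within the batch in column 0; the function name and that column convention are assumptions here, not this repo's actual API. When images are scattered across replicas, each replica must see only the targets belonging to its slice of the batch, re-indexed to the local batch:

```python
import torch

def split_targets_for_replica(targets: torch.Tensor, start: int, end: int) -> torch.Tensor:
    """Keep only the targets whose image index falls in [start, end),
    and re-base that index so it matches the replica's local batch.
    targets[:, 0] is assumed to hold the image index within the full batch."""
    mask = (targets[:, 0] >= start) & (targets[:, 0] < end)
    local = targets[mask].clone()
    local[:, 0] -= start  # re-index into the replica's slice
    return local
```

Without a step like this, a replica holding images 0..N/2 would look up labels indexed against the full batch, which is exactly the out-of-bounds pattern in the assert above.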
Thanks for the feedback. I'm debugging this code now to find the exact cause; once the fix is finished and uploaded, I'll let you know.
The latest code has been updated. Please pull it again and try multi-GPU training to see whether the error is gone.
@wuzhihao7788 Hi, it works now.
@wuzhihao7788 I ran into an error during testing.
OK, I'll look at the validation code tomorrow.
The latest code has been updated; with multi-GPU training, the validation set no longer has problems.
@wuzhihao7788
More or less. My training runs have all been intermittent. In my single-GPU experience, whether on the COCO dataset or a small one, it takes around 24 epochs before you can see meaningful results.
Could you contribute your trained model later? My hardware is limited, and I haven't yet trained a sufficiently converged model.
@wuzhihao7788 I'll keep training on my end and will contribute a good model when I get one.
Thanks.
@wuzhihao7788 yolov5l, with total epochs set to 100; trained for 92 epochs.
Thanks for running this training. The result still lags the official one, since the official training runs 300 epochs on a V100. Could you keep training? The model should continue to converge.
Official result: mAP@0.5:0.95: 47.7, mAP@0.5: 66.5
{"mode": "val", "epoch": 92, "iter": 7329, "lr": 0.00022, "P": 0.33703, "R": 0.62344, "mAP@0.5": 0.51406, "mAP@0.5:0.95": 0.35654}
@wuzhihao7788 Training yolov5l is too slow; I'm going to try yolov5s, which trains faster.
OK. You can set the maximum epoch directly to 300 for training; the other parameters don't need adjusting.
@wuzhihao7788 yolov5s trained for 300 epochs; AP only reaches 30, which is still a gap from the official 37.7.
Traceback (most recent call last):
  File "tools/train.py", line 144, in <module>
    main()
  File "tools/train.py", line 140, in main
    train_detector(model, datasets, cfg, validate=args.validate, timestamp=timestamp, meta=meta)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/apis/train.py", line 161, in train_detector
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/apis/runner.py", line 331, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/apis/runner.py", line 220, in train
    self.model, data_batch, train_mode=True, **kwargs)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/apis/train.py", line 116, in batch_processor
    losses = model(**data)
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 155, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py", line 165, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in parallel_apply
    output.reraise()
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/_utils.py", line 395, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
    output = module(*input, **kwargs)
  File "/data4/xieyangyang/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/models/detectors/base.py", line 55, in forward
    return self.forward_train(img, img_metas, **kwargs)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/models/detectors/YOLOv5Detector.py", line 100, in forward_train
    head_loss = self.head.loss(*head_loss_inputs)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/models/heads/yolo.py", line 99, in loss
    bbox_loss, confidence_loss, class_loss = multi_apply(self.loss_single, pred, indices, tbox, tcls, ancher, self.conf_balances, ignore_mask)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/utils/util.py", line 32, in multi_apply
    return tuple(map(list, zip(*map_results)))
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/models/heads/yolo.py", line 91, in loss_single
    return self.yolov5_loss_single(pred, indices, tbox, tcls, anchors, conf_balances, ignore_mask)
  File "/data1/xieyangyang/yolodet-pytorch/yolodet/models/heads/yolo.py", line 210, in yolov5_loss_single
    pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * torch.from_numpy(anchors).to(device)  # 0-4x scaling, model.hyp['anchor_t'] = 4
RuntimeError: CUDA error: device-side assert triggered
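For context on the last frame: that line is the standard YOLOv5 width/height decoding, wh = (sigmoid(p) * 2)^2 * anchor, which bounds each predicted side to at most 4x its anchor (matching `model.hyp['anchor_t'] = 4`). A standalone sketch (the function name is illustrative, not the repo's API). Note that CUDA executes asynchronously, so a device-side assert is reported at whatever call next synchronizes; the frame shown is not necessarily where the out-of-bounds access actually happened.

```python
import torch

def decode_wh(pred_wh: torch.Tensor, anchors: torch.Tensor) -> torch.Tensor:
    """YOLOv5-style width/height decoding: sigmoid bounds the raw prediction
    to (0, 1), so the output is (0, 4) times the anchor dimensions."""
    return (pred_wh.sigmoid() * 2) ** 2 * anchors
```

At a raw prediction of 0 this returns exactly the anchor size (sigmoid(0) = 0.5, so the factor is (0.5 * 2)^2 = 1), and the factor saturates at 4 for large inputs.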