From 89022a260c6c7aeb10113986bacdaabe368dc605 Mon Sep 17 00:00:00 2001 From: Jiaqi Wang <1155098160@link.cuhk.edu.hk> Date: Thu, 23 May 2019 20:57:11 +0800 Subject: [PATCH] Code of CVPR 2019 Paper: Region Proposal by Guided Anchoring (#594) * add two stage w/o neck and w/ upperneck * add rpn r50 c4 * update c4 configs * fix * config update * update config * minor update * mask rcnn support c4 train and test * lr fix * cascade support upper_neck * add cascade c4 config * update config * update * update res_layer to new interface * refactoring * c4 configs update * refactoring * update rpn_c4 config * rename upper_neck as shared_head * update * update configs * update * update c4 configs * update according to commits * update * add ga rpn * test bug fix * test bug fix when loc_filter_thr is large * update configs * update configs * add ga_retinanet * ga test bug fix * update configs * update * init masked conv * update * update masked conv * update * support no ga_sampler * update * update * test with masked_conv * update comment * fix flake errors * fix flake8 errors * refactor bounded iou loss * refactor ga_retina_head * update configs * refactor masked conv * fix flake8 error * refactor guided_anchor_head and ga_rpn_head * update configs * use_sigmoid_cls -> cls_sigmoid_loss; use_focal_loss -> cls_focal_loss * refactoring * cls_sigmoid_loss -> use_sigmoid_cls * fix flake8 error * add some docs * rename normalize to norm_cfg * update configs * add readme * update ga_faster config * update readme * update readme * rename configs as r50_caffe * merge master * refactor guided anchor target * update readme * update approx max iou assigner * refactor guided anchor target * update docstring * refactor ga heads * fix flake8 error * update readme * update model url * update comments * refactor get anchors * update docstring * not use_loc_filter during training * add R-101 results * update to support build loss api * fix flake8 error * update readme with x-101 performances * update readme * add a link in project readme * refactor code about ga shape inside flags * update * update * add x101 config files * add ga_rpn r101 config * update some comments * add comments * add comments * update comments * fix flake8 error --- MODEL_ZOO.md | 4 + compile.sh | 7 + configs/guided_anchoring/README.md | 42 ++ .../ga_fast_r50_caffe_fpn_1x.py | 130 ++++ .../ga_faster_r50_caffe_fpn_1x.py | 193 ++++++ .../ga_faster_x101_32x4d_fpn_1x.py | 193 ++++++ .../ga_retinanet_r50_caffe_fpn_1x.py | 152 +++++ .../ga_retinanet_x101_32x4d_fpn_1x.py | 152 +++++ .../ga_rpn_r101_caffe_rpn_1x.py | 151 +++++ .../ga_rpn_r50_caffe_fpn_1x.py | 151 +++++ .../ga_rpn_x101_32x4d_fpn_1x.py | 151 +++++ mmdet/core/anchor/__init__.py | 8 +- mmdet/core/anchor/guided_anchor_target.py | 285 +++++++++ mmdet/core/bbox/assigners/__init__.py | 5 +- .../bbox/assigners/approx_max_iou_assigner.py | 116 ++++ mmdet/core/loss/__init__.py | 13 +- mmdet/core/loss/losses.py | 97 ++- mmdet/models/anchor_heads/__init__.py | 10 +- mmdet/models/anchor_heads/ga_retina_head.py | 107 ++++ mmdet/models/anchor_heads/ga_rpn_head.py | 127 ++++ .../models/anchor_heads/guided_anchor_head.py | 589 ++++++++++++++++++ mmdet/models/losses/__init__.py | 3 +- mmdet/models/losses/iou_loss.py | 26 + mmdet/ops/__init__.py | 4 +- mmdet/ops/masked_conv/__init__.py | 4 + mmdet/ops/masked_conv/functions/__init__.py | 0 .../ops/masked_conv/functions/masked_conv.py | 55 ++ mmdet/ops/masked_conv/modules/__Init__.py | 0 mmdet/ops/masked_conv/modules/masked_conv.py | 30 +
mmdet/ops/masked_conv/setup.py | 12 + .../masked_conv/src/masked_conv2d_cuda.cpp | 74 +++ .../masked_conv/src/masked_conv2d_kernel.cu | 113 ++++ 32 files changed, 2983 insertions(+), 21 deletions(-) create mode 100644 configs/guided_anchoring/README.md create mode 100644 configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py create mode 100644 configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py create mode 100644 configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py create mode 100644 configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py create mode 100644 configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py create mode 100644 configs/guided_anchoring/ga_rpn_r101_caffe_rpn_1x.py create mode 100644 configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py create mode 100644 configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py create mode 100644 mmdet/core/anchor/guided_anchor_target.py create mode 100644 mmdet/core/bbox/assigners/approx_max_iou_assigner.py create mode 100644 mmdet/models/anchor_heads/ga_retina_head.py create mode 100644 mmdet/models/anchor_heads/ga_rpn_head.py create mode 100644 mmdet/models/anchor_heads/guided_anchor_head.py create mode 100644 mmdet/models/losses/iou_loss.py create mode 100644 mmdet/ops/masked_conv/__init__.py create mode 100644 mmdet/ops/masked_conv/functions/__init__.py create mode 100644 mmdet/ops/masked_conv/functions/masked_conv.py create mode 100644 mmdet/ops/masked_conv/modules/__Init__.py create mode 100644 mmdet/ops/masked_conv/modules/masked_conv.py create mode 100644 mmdet/ops/masked_conv/setup.py create mode 100644 mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp create mode 100644 mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md index 9ba86ab..41480bd 100644 --- a/MODEL_ZOO.md +++ b/MODEL_ZOO.md @@ -214,6 +214,10 @@ Please refer to [Weight Standardization](configs/gn+ws/README.md) for details. Please refer to [Deformable Convolutional Networks](configs/dcn/README.md) for details. +### Guided Anchoring + +Please refer to [Guided Anchoring](configs/guided_anchoring/README.md) for details. + ## Comparison with Detectron and maskrcnn-benchmark diff --git a/compile.sh b/compile.sh index 335cf51..c3853f1 100755 --- a/compile.sh +++ b/compile.sh @@ -36,3 +36,10 @@ if [ -d "build" ]; then rm -r build fi $PYTHON setup.py build_ext --inplace + +echo "Building masked conv op..." +cd ../masked_conv +if [ -d "build" ]; then + rm -r build +fi +$PYTHON setup.py build_ext --inplace diff --git a/configs/guided_anchoring/README.md b/configs/guided_anchoring/README.md new file mode 100644 index 0000000..1d8bb00 --- /dev/null +++ b/configs/guided_anchoring/README.md @@ -0,0 +1,42 @@ +# Region Proposal by Guided Anchoring + +## Introduction + +We provide config files to reproduce the results of the CVPR 2019 paper [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278). + +``` +@inproceedings{wang2019region, + title={Region Proposal by Guided Anchoring}, + author={Jiaqi Wang and Kai Chen and Shuo Yang and Chen Change Loy and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` + +## Results and Models + +The results on COCO 2017 val are shown in the table below. (Results on test-dev are usually slightly higher than on val.)
+ +| Method | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR 1000 | Download | +| :----: | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :-----: | :-------------------------------------------------------------------------------------------------------------------------------------------: | +| GA-RPN | R-50-FPN | caffe | 1x | 5.0 | 0.55 | 13.3 | 68.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_20190513-95e91886.pth) | +| GA-RPN | R-101-FPN | caffe | 1x | - | - | - | 69.6 | - | +| GA-RPN | X-101-32x4d-FPN | pytorch | 1x | - | - | - | 70.0 | - | +| GA-RPN | X-101-64x4d-FPN | pytorch | 1x | - | - | - | 70.5 | - | + + +| Method | Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +| :------------: | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------------------------: | +| GA-Fast RCNN | R-50-FPN | caffe | 1x | 3.3 | 0.23 | 14.9 | 39.5 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_fast_r50_caffe_fpn_1x_20190513-c5af9f8b.pth) | +| GA-Faster RCNN | R-50-FPN | caffe | 1x | 5.1 | 0.64 | 9.6 | 39.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_faster_r50_caffe_fpn_1x_20190513-a52b31fa.pth) | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | - | - | - | 41.5 | - | +| GA-Faster RCNN | X-101-32x4d-FPN | pytorch | 1x | - | - | - | 42.9 | - | +| GA-Faster RCNN | X-101-64x4d-FPN | pytorch | 1x | - | - | - | 43.9 | - | +| GA-RetinaNet | R-50-FPN | caffe | 1x | 3.2 | 0.50 | 10.7 | 37.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_20190513-29905101.pth) | +| GA-RetinaNet | R-101-FPN | caffe | 1x | - | - | - | 38.9 | - | +| GA-RetinaNet | X-101-32x4d-FPN | pytorch | 1x | - | - | - | 40.3 | - | +| GA-RetinaNet | X-101-64x4d-FPN | pytorch | 1x | - | - | - | 40.8 | - | + + + +- In the Guided Anchoring paper, `score_thr` is set to 0.001 in Fast/Faster RCNN and 0.05 in RetinaNet for both baselines and Guided Anchoring. 
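As a minimal editorial sketch (not part of the patch itself): the GA heads in the configs that follow describe their anchor search space with `octave_base_scale`, `scales_per_octave` and `octave_ratios`. Assuming the usual octave convention `scale_i = base * 2**(i / scales_per_octave)`, the settings expand like this:

```python
# Illustrative only: expand the octave settings used by GARPNHead below.
octave_base_scale = 8
scales_per_octave = 3
octave_ratios = [0.5, 1.0, 2.0]

scales = [octave_base_scale * 2**(i / scales_per_octave)
          for i in range(scales_per_octave)]
print(scales)  # [8.0, ~10.08, ~12.70]

# 3 scales x 3 ratios = 9 "approx" anchors per location; this is the
# quantity the new target/assigner code refers to as approxs_per_octave.
approxs_per_octave = len(scales) * len(octave_ratios)
print(approxs_per_octave)  # 9
```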
\ No newline at end of file diff --git a/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py new file mode 100644 index 0000000..269967d --- /dev/null +++ b/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py @@ -0,0 +1,130 @@ +# model settings +model = dict( + type='FastRCNN', + pretrained='open-mmlab://resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rcnn=dict( + score_thr=1e-3, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + num_max_proposals=300, + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_train2017.pkl', + flip_ratio=0.5, + with_mask=False, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + num_max_proposals=300, + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl', + flip_ratio=0, + with_mask=False, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + num_max_proposals=300, + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl', + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 
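The `ga_fast_r50_caffe_fpn_1x` config above trains Fast R-CNN on precomputed GA-RPN proposals (`proposal_file`, capped at `num_max_proposals=300`). A hedged sketch for sanity-checking such a file; the exact pickle layout is an assumption, not stated in this patch:

```python
import pickle

# Assumption: the .pkl stores a list with one entry per image, aligned
# with the annotation order; each entry is an (n, 4) or (n, 5) float
# array of [x1, y1, x2, y2(, score)] boxes.
with open('data/coco/proposals/ga_rpn_r50_fpn_1x_train2017.pkl', 'rb') as f:
    proposals = pickle.load(f)
print(len(proposals), proposals[0].shape)  # num images, boxes of image 0
```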
+dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_fast_rcnn_r50_caffe_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py new file mode 100644 index 0000000..0b9f725 --- /dev/null +++ b/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py @@ -0,0 +1,193 @@ +# model settings +model = dict( + type='FasterRCNN', + pretrained='open-mmlab://resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='GARPNHead', + in_channels=256, + feat_channels=256, + octave_base_scale=8, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[0.07, 0.07, 0.14, 0.14], + target_means=(.0, .0, .0, .0), + target_stds=[0.07, 0.07, 0.11, 0.11], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=300, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=300, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=1e-3, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], 
std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_faster_rcnn_r50_caffe_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py b/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py new file mode 100644 index 0000000..dabdf6c --- /dev/null +++ b/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py @@ -0,0 +1,193 @@ +# model settings +model = dict( + type='FasterRCNN', + pretrained='open-mmlab://resnext101_32x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='GARPNHead', + in_channels=256, + feat_channels=256, + octave_base_scale=8, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[0.07, 0.07, 0.14, 0.14], + target_means=(.0, .0, .0, .0), + target_stds=[0.07, 0.07, 0.11, 0.11], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))) +# model training 
and testing settings +train_cfg = dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=300, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=300, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=1e-3, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_faster_rcnn_x101_32x4d_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py new file mode 100644 index 0000000..63ba9e7 --- /dev/null +++ b/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py @@ -0,0 +1,152 @@ +# model settings +model = dict( + type='RetinaNet', + pretrained='open-mmlab://resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', 
requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5), + bbox_head=dict( + type='GARetinaHead', + num_classes=81, + in_channels=256, + stacked_convs=4, + feat_channels=256, + octave_base_scale=4, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[8, 16, 32, 64, 128], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[1.0, 1.0, 1.0, 1.0], + target_means=(.0, .0, .0, .0), + target_stds=[1.0, 1.0, 1.0, 1.0], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0))) +# training and testing settings +train_cfg = dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_retinanet_r50_caffe_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py 
b/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py new file mode 100644 index 0000000..bd39bf1 --- /dev/null +++ b/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py @@ -0,0 +1,152 @@ +# model settings +model = dict( + type='RetinaNet', + pretrained='open-mmlab://resnext101_32x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5), + bbox_head=dict( + type='GARetinaHead', + num_classes=81, + in_channels=256, + stacked_convs=4, + feat_channels=256, + octave_base_scale=4, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[8, 16, 32, 64, 128], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[1.0, 1.0, 1.0, 1.0], + target_means=(.0, .0, .0, .0), + target_stds=[1.0, 1.0, 1.0, 1.0], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0))) +# training and testing settings +train_cfg = dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # 
dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_retinanet_x101_32x4d_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_rpn_r101_caffe_rpn_1x.py b/configs/guided_anchoring/ga_rpn_r101_caffe_rpn_1x.py new file mode 100644 index 0000000..d3acf87 --- /dev/null +++ b/configs/guided_anchoring/ga_rpn_r101_caffe_rpn_1x.py @@ -0,0 +1,151 @@ +# model settings +model = dict( + type='RPN', + pretrained='open-mmlab://resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='GARPNHead', + in_channels=256, + feat_channels=256, + octave_base_scale=8, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[0.07, 0.07, 0.14, 0.14], + target_means=(.0, .0, .0, .0), + target_stds=[0.07, 0.07, 0.11, 0.11], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=False), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer 
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +# runner configs +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_rpn_r101_caffe_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py new file mode 100644 index 0000000..cea9b76 --- /dev/null +++ b/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py @@ -0,0 +1,151 @@ +# model settings +model = dict( + type='RPN', + pretrained='open-mmlab://resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='GARPNHead', + in_channels=256, + feat_channels=256, + octave_base_scale=8, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[0.07, 0.07, 0.14, 0.14], + target_means=(.0, .0, .0, .0), + target_stds=[0.07, 0.07, 0.11, 0.11], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=False), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + 
flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +# runner configs +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_rpn_r50_caffe_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py b/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py new file mode 100644 index 0000000..c037254 --- /dev/null +++ b/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py @@ -0,0 +1,151 @@ +# model settings +model = dict( + type='RPN', + pretrained='open-mmlab://resnext101_32x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='GARPNHead', + in_channels=256, + feat_channels=256, + octave_base_scale=8, + scales_per_octave=3, + octave_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + anchor_base_sizes=None, + anchoring_means=[.0, .0, .0, .0], + anchoring_stds=[0.07, 0.07, 0.14, 0.14], + target_means=(.0, .0, .0, .0), + target_stds=[0.07, 0.07, 0.11, 0.11], + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict( + type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + center_ratio=0.2, + ignore_ratio=0.5, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 
800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=False), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +# runner configs +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/ga_rpn_x101_32x4d_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmdet/core/anchor/__init__.py b/mmdet/core/anchor/__init__.py index 0ff430a..304d493 100644 --- a/mmdet/core/anchor/__init__.py +++ b/mmdet/core/anchor/__init__.py @@ -1,4 +1,8 @@ from .anchor_generator import AnchorGenerator -from .anchor_target import anchor_target +from .anchor_target import anchor_target, anchor_inside_flags +from .guided_anchor_target import ga_loc_target, ga_shape_target -__all__ = ['AnchorGenerator', 'anchor_target'] +__all__ = [ + 'AnchorGenerator', 'anchor_target', 'anchor_inside_flags', 'ga_loc_target', + 'ga_shape_target' +] diff --git a/mmdet/core/anchor/guided_anchor_target.py b/mmdet/core/anchor/guided_anchor_target.py new file mode 100644 index 0000000..2e95406 --- /dev/null +++ b/mmdet/core/anchor/guided_anchor_target.py @@ -0,0 +1,285 @@ +import torch + +from ..bbox import build_assigner, build_sampler, PseudoSampler +from ..utils import unmap, multi_apply + + +def calc_region(bbox, ratio, featmap_size=None): + """Calculate a proportional bbox region. + + The bbox center is kept fixed, and the new h' and w' are + (1 - 2 * ratio) * h and (1 - 2 * ratio) * w. + + Args: + bbox (Tensor): The bbox to calculate the region for, shape (4, ). + ratio (float): Ratio of the output region. + featmap_size (tuple): Feature map size used for clipping the boundary. + + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1] - 1) + y1 = y1.clamp(min=0, max=featmap_size[0] - 1) + x2 = x2.clamp(min=0, max=featmap_size[1] - 1) + y2 = y2.clamp(min=0, max=featmap_size[0] - 1) + return (x1, y1, x2, y2) + + +def ga_loc_target(gt_bboxes_list, + featmap_sizes, + anchor_scale, + anchor_strides, + center_ratio=0.2, + ignore_ratio=0.5): + """Compute location targets for guided anchoring. + + Each feature map is divided into positive, negative and ignore regions.
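A minimal numeric sketch (editorial, not part of the patch) of the region arithmetic in `calc_region`: `ga_loc_target` passes r1 = (1 - center_ratio) / 2 and r2 = (1 - ignore_ratio) / 2, so with the defaults `center_ratio=0.2` and `ignore_ratio=0.5` the positive region is the central 20% of a gt box and the ignore region its central 50%:

```python
import torch

def calc_region(bbox, ratio):
    # Same arithmetic as the patch, minus the feature-map clipping.
    x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long()
    y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long()
    x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long()
    y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long()
    return x1.item(), y1.item(), x2.item(), y2.item()

bbox = torch.tensor([0., 0., 100., 100.])
r1 = (1 - 0.2) / 2  # 0.4
r2 = (1 - 0.5) / 2  # 0.25
print(calc_region(bbox, r1))  # (40, 40, 60, 60): the central 20%
print(calc_region(bbox, r2))  # (25, 25, 75, 75): the central 50%
```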
+ - positive regions: target 1, weight 1 + - ignore regions: target 0, weight 0 + - negative regions: target 0, weight 0.1 + + Args: + gt_bboxes_list (list[Tensor]): Gt bboxes of each image. + featmap_sizes (list[tuple]): Multi level sizes of each feature map. + anchor_scale (int): Anchor scale. + anchor_strides (list[int]): Multi level anchor strides. + center_ratio (float): Ratio of center region. + ignore_ratio (float): Ratio of ignore region. + + Returns: + tuple + """ + img_per_gpu = len(gt_bboxes_list) + num_lvls = len(featmap_sizes) + r1 = (1 - center_ratio) / 2 + r2 = (1 - ignore_ratio) / 2 + all_loc_targets = [] + all_loc_weights = [] + all_ignore_map = [] + for lvl_id in range(num_lvls): + h, w = featmap_sizes[lvl_id] + loc_targets = torch.zeros(img_per_gpu, + 1, + h, + w, + device=gt_bboxes_list[0].device, + dtype=torch.float32) + loc_weights = torch.full_like(loc_targets, -1) + ignore_map = torch.zeros_like(loc_targets) + all_loc_targets.append(loc_targets) + all_loc_weights.append(loc_weights) + all_ignore_map.append(ignore_map) + for img_id in range(img_per_gpu): + gt_bboxes = gt_bboxes_list[img_id] + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1)) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + # assign gt bboxes to different feature levels w.r.t. their scales + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + for gt_id in range(gt_bboxes.size(0)): + lvl = target_lvls[gt_id].item() + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl] + # calculate ignore regions + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[lvl]) + # calculate positive (center) regions + ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region( + gt_, r1, featmap_sizes[lvl]) + all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, ctr_x1:ctr_x2 + + 1] = 1 + all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + + 1, ignore_x1:ignore_x2 + 1] = 0 + all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, ctr_x1:ctr_x2 + + 1] = 1 + # calculate ignore map on nearby low level feature + if lvl > 0: + d_lvl = lvl - 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[d_lvl]) + all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + + 1, ignore_x1:ignore_x2 + 1] = 1 + # calculate ignore map on nearby high level feature + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[u_lvl]) + all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + + 1, ignore_x1:ignore_x2 + 1] = 1 + for lvl_id in range(num_lvls): + # ignore negative regions w.r.t.
ignore map + all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0) + & (all_ignore_map[lvl_id] > 0)] = 0 + # set negative regions with weight 0.1 + all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1 + # loc average factor to balance loss + loc_avg_factor = sum( + [t.size(0) * t.size(-1) * t.size(-2) for t in all_loc_targets]) / 200 + return all_loc_targets, all_loc_weights, loc_avg_factor + + +def ga_shape_target(approx_list, + inside_flag_list, + square_list, + gt_bboxes_list, + img_metas, + approxs_per_octave, + cfg, + gt_bboxes_ignore_list=None, + sampling=True, + unmap_outputs=True): + """Compute guided anchoring targets. + + Args: + approx_list (list[list]): Multi level approxs of each image. + inside_flag_list (list[list]): Multi level inside flags of each image. + square_list (list[list]): Multi level squares of each image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + approxs_per_octave (int): number of approxs per octave + cfg (dict): RPN train configs. + gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes. + sampling (bool): sampling or not. + unmap_outputs (bool): unmap outputs or not. + + Returns: + tuple + """ + num_imgs = len(img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list, + neg_inds_list) = multi_apply(ga_shape_target_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + img_metas, + approxs_per_octave=approxs_per_octave, + cfg=cfg, + sampling=sampling, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([bbox_anchors is None for bbox_anchors in all_bbox_anchors]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. multiple levels + bbox_anchors_list = images_to_levels(all_bbox_anchors, num_level_squares) + bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares) + bbox_weights_list = images_to_levels(all_bbox_weights, num_level_squares) + return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, num_total_pos, + num_total_neg) + + +def images_to_levels(target, num_level_anchors): + """Convert targets by image to targets by feature level. + + [target_img0, target_img1] -> [target_level0, target_level1, ...] 
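A small editorial illustration of the `images_to_levels` mapping that closes below: per-image targets are stacked, then the anchor axis is sliced into per-level chunks (the `num_level_squares` counts computed earlier). Toy shapes are assumptions for illustration:

```python
import torch

# Two images, two levels with 4 and 2 anchors respectively (toy numbers).
num_level_anchors = [4, 2]
target = torch.stack([torch.zeros(6, 4), torch.ones(6, 4)], 0)  # (2, 6, 4)

level_targets, start = [], 0
for n in num_level_anchors:
    # Same slicing as the patch; the patch additionally calls .squeeze(0),
    # which only collapses the batch dim when there is a single image.
    level_targets.append(target[:, start:start + n])
    start += n
print([t.shape for t in level_targets])
# [torch.Size([2, 4, 4]), torch.Size([2, 2, 4])]
```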
+ """ + target = torch.stack(target, 0) + level_targets = [] + start = 0 + for n in num_level_anchors: + end = start + n + level_targets.append(target[:, start:end].squeeze(0)) + start = end + return level_targets + + +def ga_shape_target_single(flat_approxs, + inside_flags, + flat_squares, + gt_bboxes, + gt_bboxes_ignore, + img_meta, + approxs_per_octave, + cfg, + sampling=True, + unmap_outputs=True): + """Compute guided anchoring targets. + + This function returns sampled anchors and gt bboxes directly + rather than calculates regression targets. + + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_bboxes (Tensor): Ground truth bboxes of a single image. + img_meta (dict): Meta info of a single image. + approxs_per_octave (int): number of approxs per octave + cfg (dict): RPN train configs. + sampling (bool): sampling or not. + unmap_outputs (bool): unmap outputs or not. + + Returns: + tuple + """ + if not inside_flags.any(): + return (None, ) * 6 + # assign gt and sample anchors + expand_inside_flags = inside_flags[:, None].expand( + -1, approxs_per_octave).reshape(-1) + approxs = flat_approxs[expand_inside_flags, :] + squares = flat_squares[inside_flags, :] + + bbox_assigner = build_assigner(cfg.ga_assigner) + assign_result = bbox_assigner.assign(approxs, squares, approxs_per_octave, + gt_bboxes, gt_bboxes_ignore) + if sampling: + bbox_sampler = build_sampler(cfg.ga_sampler) + else: + bbox_sampler = PseudoSampler() + sampling_result = bbox_sampler.sample(assign_result, squares, gt_bboxes) + + bbox_anchors = torch.zeros_like(squares) + bbox_gts = torch.zeros_like(squares) + bbox_weights = torch.zeros_like(squares) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes + bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes + bbox_weights[pos_inds, :] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags) + bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds) diff --git a/mmdet/core/bbox/assigners/__init__.py b/mmdet/core/bbox/assigners/__init__.py index 40a89e9..fafa3fa 100644 --- a/mmdet/core/bbox/assigners/__init__.py +++ b/mmdet/core/bbox/assigners/__init__.py @@ -1,5 +1,8 @@ from .base_assigner import BaseAssigner from .max_iou_assigner import MaxIoUAssigner +from .approx_max_iou_assigner import ApproxMaxIoUAssigner from .assign_result import AssignResult -__all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult'] +__all__ = [ + 'BaseAssigner', 'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult' +] diff --git a/mmdet/core/bbox/assigners/approx_max_iou_assigner.py b/mmdet/core/bbox/assigners/approx_max_iou_assigner.py new file mode 100644 index 0000000..1283f7f --- /dev/null +++ b/mmdet/core/bbox/assigners/approx_max_iou_assigner.py @@ -0,0 +1,116 @@ +import torch + +from .max_iou_assigner import MaxIoUAssigner +from ..geometry import bbox_overlaps + + +class ApproxMaxIoUAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. 
+ + Each proposal will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 5th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + """ + + def __init__(self, + pos_iou_thr, + neg_iou_thr, + min_pos_iou=.0, + gt_max_assign_all=True, + ignore_iof_thr=-1, + ignore_wrt_candidates=True): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + + def assign(self, + approxs, + squares, + approxs_per_octave, + gt_bboxes, + gt_bboxes_ignore=None, + gt_labels=None): + """Assign gt to approxs. + + This method assigns a gt bbox to each group of approxs (bboxes). + Each group of approxs is represented by a base approx (bbox) and + will be assigned with -1, 0, or a positive number. + -1 means don't care, 0 means negative sample, + a positive number is the index (1-based) of the assigned gt. + The assignment is done in the following steps (the order matters): + + 1. assign every bbox to -1 + 2. use the max IoU of each group of approxs to assign + 3. assign proposals whose iou with all gts < neg_iou_thr to 0 + 4. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that gt + 5. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + approxs (Tensor): Bounding boxes to be assigned, + shape (approxs_per_octave * n, 4). + squares (Tensor): Base bounding boxes to be assigned, + shape (n, 4). + approxs_per_octave (int): number of approxs per octave. + gt_bboxes (Tensor): Ground truth boxes, shape (k, 4). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result.
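A minimal sketch (editorial, toy shapes are assumptions) of the group-max reduction that `assign` performs below: per-approx IoUs are viewed as (approxs_per_octave, num_squares, num_gts) and reduced with a max over the octave dimension, leaving one IoU per (square, gt) pair for the inherited `assign_wrt_overlaps`:

```python
import torch

n, a, k = 3, 9, 2                    # squares, approxs per octave, gts
all_overlaps = torch.rand(n * a, k)  # stand-in for bbox_overlaps(approxs, gt_bboxes)

# assign() re-orders approxs octave-major before computing overlaps,
# which is what makes this view valid.
overlaps, _ = all_overlaps.view(a, n, k).max(dim=0)  # (n, k)
overlaps = torch.transpose(overlaps, 0, 1)           # (k, n), gt-major
print(overlaps.shape)  # torch.Size([2, 3])
```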
+ """ + + if squares.shape[0] == 0 or gt_bboxes.shape[0] == 0: + raise ValueError('No gt or approxs') + num_squares = squares.size(0) + num_gts = gt_bboxes.size(0) + # re-organize anchors by approxs_per_octave x num_squares + approxs = torch.transpose( + approxs.view(num_squares, approxs_per_octave, 4), 0, + 1).contiguous().view(-1, 4) + all_overlaps = bbox_overlaps(approxs, gt_bboxes) + + overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares, + num_gts).max(dim=0) + overlaps = torch.transpose(overlaps, 0, 1) + + bboxes = squares[:, :4] + + if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and ( + gt_bboxes_ignore.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = bbox_overlaps(bboxes, + gt_bboxes_ignore, + mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = bbox_overlaps(gt_bboxes_ignore, + bboxes, + mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result diff --git a/mmdet/core/loss/__init__.py b/mmdet/core/loss/__init__.py index 8880518..c73b221 100644 --- a/mmdet/core/loss/__init__.py +++ b/mmdet/core/loss/__init__.py @@ -1,12 +1,13 @@ -from .losses import ( - weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy, - sigmoid_focal_loss, py_sigmoid_focal_loss, weighted_sigmoid_focal_loss, - mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy, iou_loss) +from .losses import (weighted_nll_loss, weighted_cross_entropy, + weighted_binary_cross_entropy, sigmoid_focal_loss, + py_sigmoid_focal_loss, weighted_sigmoid_focal_loss, + mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, + bounded_iou_loss, weighted_iou_loss, iou_loss, accuracy) __all__ = [ 'weighted_nll_loss', 'weighted_cross_entropy', 'weighted_binary_cross_entropy', 'sigmoid_focal_loss', 'py_sigmoid_focal_loss', 'weighted_sigmoid_focal_loss', - 'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy', - 'iou_loss' + 'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', + 'bounded_iou_loss', 'weighted_iou_loss', 'iou_loss', 'accuracy' ] diff --git a/mmdet/core/loss/losses.py b/mmdet/core/loss/losses.py index e541ec4..6bb0954 100644 --- a/mmdet/core/loss/losses.py +++ b/mmdet/core/loss/losses.py @@ -44,8 +44,8 @@ def py_sigmoid_focal_loss(pred, pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) weight = (alpha * target + (1 - alpha) * (1 - target)) * weight weight = weight * pt.pow(gamma) - loss = F.binary_cross_entropy_with_logits( - pred, target, reduction='none') * weight + loss = F.binary_cross_entropy_with_logits(pred, target, + reduction='none') * weight reduction_enum = F._Reduction.get_enum(reduction) # none: 0, mean:1, sum: 2 if reduction_enum == 0: @@ -66,16 +66,17 @@ def weighted_sigmoid_focal_loss(pred, if avg_factor is None: avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6 return torch.sum( - sigmoid_focal_loss(pred, target, gamma, alpha, 'none') * weight.view( - -1, 1))[None] / avg_factor + sigmoid_focal_loss(pred, target, gamma, alpha, 'none') * + weight.view(-1, 1))[None] / avg_factor def mask_cross_entropy(pred, target, label): num_rois = pred.size()[0] inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) pred_slice = pred[inds, label].squeeze(1) - return F.binary_cross_entropy_with_logits( - pred_slice, target, reduction='mean')[None] + return 
 
 
 def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'):
@@ -101,6 +102,85 @@ def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None):
     return torch.sum(loss * weight)[None] / avg_factor
 
 
+def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3, reduction='mean'):
+    """Improving Object Localization with Fitness NMS and Bounded IoU Loss,
+    https://arxiv.org/abs/1711.00164.
+
+    Args:
+        pred (tensor): Predicted bboxes.
+        target (tensor): Target bboxes.
+        beta (float): Beta parameter in the smooth L1 term.
+        eps (float): Epsilon to avoid NaN.
+        reduction (str): Reduction type.
+    """
+    pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5
+    pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5
+    pred_w = pred[:, 2] - pred[:, 0] + 1
+    pred_h = pred[:, 3] - pred[:, 1] + 1
+    with torch.no_grad():
+        target_ctrx = (target[:, 0] + target[:, 2]) * 0.5
+        target_ctry = (target[:, 1] + target[:, 3]) * 0.5
+        target_w = target[:, 2] - target[:, 0] + 1
+        target_h = target[:, 3] - target[:, 1] + 1
+
+    dx = target_ctrx - pred_ctrx
+    dy = target_ctry - pred_ctry
+
+    loss_dx = 1 - torch.max(
+        (target_w - 2 * dx.abs()) /
+        (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx))
+    loss_dy = 1 - torch.max(
+        (target_h - 2 * dy.abs()) /
+        (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy))
+    loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w /
+                            (target_w + eps))
+    loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h /
+                            (target_h + eps))
+    loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh],
+                            dim=-1).view(loss_dx.size(0), -1)
+
+    loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta,
+                       loss_comb - 0.5 * beta)
+    reduction_enum = F._Reduction.get_enum(reduction)
+    # none: 0, mean:1, sum: 2
+    if reduction_enum == 0:
+        return loss
+    elif reduction_enum == 1:
+        return loss.sum() / pred.numel()
+    elif reduction_enum == 2:
+        return loss.sum()
+
+
+def weighted_iou_loss(pred,
+                      target,
+                      weight,
+                      style='naive',
+                      beta=0.2,
+                      eps=1e-3,
+                      avg_factor=None):
+    if style not in ['bounded', 'naive']:
+        raise ValueError(
+            'Only "bounded" and "naive" IoU loss styles are supported.')
+    inds = torch.nonzero(weight[:, 0] > 0)
+    if avg_factor is None:
+        avg_factor = inds.numel() + 1e-6
+
+    if inds.numel() > 0:
+        inds = inds.squeeze(1)
+    else:
+        return (pred * weight).sum()[None] / avg_factor
+
+    if style == 'bounded':
+        loss = bounded_iou_loss(pred[inds],
+                                target[inds],
+                                beta=beta,
+                                eps=eps,
+                                reduction='sum')
+    else:
+        loss = iou_loss(pred[inds], target[inds], reduction='sum')
+    loss = loss[None] / avg_factor
+    return loss
+
+
 def accuracy(pred, target, topk=1):
     if isinstance(topk, int):
         topk = (topk, )
@@ -125,8 +205,9 @@ def _expand_binary_labels(labels, label_weights, label_channels):
     inds = torch.nonzero(labels >= 1).squeeze()
     if inds.numel() > 0:
         bin_labels[inds, labels[inds] - 1] = 1
-    bin_label_weights = label_weights.view(-1, 1).expand(
-        label_weights.size(0), label_channels)
+    bin_label_weights = label_weights.view(-1,
+                                           1).expand(label_weights.size(0),
+                                                     label_channels)
     return bin_labels, bin_label_weights
diff --git a/mmdet/models/anchor_heads/__init__.py b/mmdet/models/anchor_heads/__init__.py
index 86877a2..798b1bc 100644
--- a/mmdet/models/anchor_heads/__init__.py
+++ b/mmdet/models/anchor_heads/__init__.py
@@ -1,7 +1,13 @@
 from .anchor_head import AnchorHead
+from .guided_anchor_head import GuidedAnchorHead, FeatureAdaption
 from .fcos_head import FCOSHead
-from .retina_head import RetinaHead
 from .rpn_head import
RPNHead +from .ga_rpn_head import GARPNHead +from .retina_head import RetinaHead +from .ga_retina_head import GARetinaHead from .ssd_head import SSDHead -__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead', 'FCOSHead'] +__all__ = [ + 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', 'RPNHead', + 'GARPNHead', 'RetinaHead', 'GARetinaHead', 'SSDHead', 'FCOSHead' +] diff --git a/mmdet/models/anchor_heads/ga_retina_head.py b/mmdet/models/anchor_heads/ga_retina_head.py new file mode 100644 index 0000000..c39ab8d --- /dev/null +++ b/mmdet/models/anchor_heads/ga_retina_head.py @@ -0,0 +1,107 @@ +import torch.nn as nn +from mmcv.cnn import normal_init + +from .guided_anchor_head import GuidedAnchorHead, FeatureAdaption +from ..registry import HEADS +from ..utils import bias_init_with_prob, ConvModule +from mmdet.ops import MaskedConv2d + + +@HEADS.register_module +class GARetinaHead(GuidedAnchorHead): + """Guided-Anchor-based RetinaNet head.""" + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=None, + **kwargs): + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(GARetinaHead, self).__init__(num_classes, in_channels, **kwargs) + + def _init_layers(self): + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule(chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule(chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + + self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1) + self.conv_shape = nn.Conv2d(self.feat_channels, self.num_anchors * 2, + 1) + self.feature_adaption_cls = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deformable_groups=self.deformable_groups) + self.feature_adaption_reg = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deformable_groups=self.deformable_groups) + self.retina_cls = MaskedConv2d(self.feat_channels, + self.num_anchors * + self.cls_out_channels, + 3, + padding=1) + self.retina_reg = MaskedConv2d(self.feat_channels, + self.num_anchors * 4, + 3, + padding=1) + + def init_weights(self): + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + + self.feature_adaption_cls.init_weights() + self.feature_adaption_reg.init_weights() + + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_loc, std=0.01, bias=bias_cls) + normal_init(self.conv_shape, std=0.01) + normal_init(self.retina_cls, std=0.01, bias=bias_cls) + normal_init(self.retina_reg, std=0.01) + + def forward_single(self, x): + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + + loc_pred = self.conv_loc(cls_feat) + shape_pred = self.conv_shape(reg_feat) + + cls_feat = self.feature_adaption_cls(cls_feat, shape_pred) + reg_feat = self.feature_adaption_reg(reg_feat, shape_pred) + + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.retina_cls(cls_feat, mask) + bbox_pred = self.retina_reg(reg_feat, mask) + return cls_score, bbox_pred, shape_pred, loc_pred diff --git 
a/mmdet/models/anchor_heads/ga_rpn_head.py b/mmdet/models/anchor_heads/ga_rpn_head.py new file mode 100644 index 0000000..b7788b6 --- /dev/null +++ b/mmdet/models/anchor_heads/ga_rpn_head.py @@ -0,0 +1,127 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import normal_init + +from mmdet.core import delta2bbox +from mmdet.ops import nms +from .guided_anchor_head import GuidedAnchorHead +from ..registry import HEADS + + +@HEADS.register_module +class GARPNHead(GuidedAnchorHead): + """Guided-Anchor-based RPN head.""" + + def __init__(self, in_channels, **kwargs): + super(GARPNHead, self).__init__(2, in_channels, **kwargs) + + def _init_layers(self): + self.rpn_conv = nn.Conv2d(self.in_channels, + self.feat_channels, + 3, + padding=1) + super(GARPNHead, self)._init_layers() + + def init_weights(self): + normal_init(self.rpn_conv, std=0.01) + super(GARPNHead, self).init_weights() + + def forward_single(self, x): + x = self.rpn_conv(x) + x = F.relu(x, inplace=True) + (cls_score, bbox_pred, shape_pred, + loc_pred) = super(GARPNHead, self).forward_single(x) + return cls_score, bbox_pred, shape_pred, loc_pred + + def loss(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + img_metas, + cfg, + gt_bboxes_ignore=None): + losses = super(GARPNHead, self).loss(cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + None, + img_metas, + cfg, + gt_bboxes_ignore=gt_bboxes_ignore) + return dict(loss_rpn_cls=losses['loss_cls'], + loss_rpn_bbox=losses['loss_bbox'], + loss_anchor_shape=losses['loss_shape'], + loss_anchor_loc=losses['loss_loc']) + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + mlvl_masks, + img_shape, + scale_factor, + cfg, + rescale=False): + mlvl_proposals = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + anchors = mlvl_anchors[idx] + mask = mlvl_masks[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + scores = rpn_cls_score.softmax(dim=1)[:, 1] + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, + 4)[mask, :] + if scores.dim() == 0: + rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0) + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. scores + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + _, topk_inds = scores.topk(cfg.nms_pre) + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + scores = scores[topk_inds] + # get proposals w.r.t. 
anchors and rpn_bbox_pred
+            proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
+                                   self.target_stds, img_shape)
+            # filter out too small bboxes
+            if cfg.min_bbox_size > 0:
+                w = proposals[:, 2] - proposals[:, 0] + 1
+                h = proposals[:, 3] - proposals[:, 1] + 1
+                valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
+                                           (h >= cfg.min_bbox_size)).squeeze()
+                proposals = proposals[valid_inds, :]
+                scores = scores[valid_inds]
+            proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1)
+            # NMS in current level
+            proposals, _ = nms(proposals, cfg.nms_thr)
+            proposals = proposals[:cfg.nms_post, :]
+            mlvl_proposals.append(proposals)
+        proposals = torch.cat(mlvl_proposals, 0)
+        if cfg.nms_across_levels:
+            # NMS across multi levels
+            proposals, _ = nms(proposals, cfg.nms_thr)
+            proposals = proposals[:cfg.max_num, :]
+        else:
+            scores = proposals[:, 4]
+            num = min(cfg.max_num, proposals.shape[0])
+            _, topk_inds = scores.topk(num)
+            proposals = proposals[topk_inds, :]
+        return proposals
diff --git a/mmdet/models/anchor_heads/guided_anchor_head.py b/mmdet/models/anchor_heads/guided_anchor_head.py
new file mode 100644
index 0000000..da43aa8
--- /dev/null
+++ b/mmdet/models/anchor_heads/guided_anchor_head.py
@@ -0,0 +1,589 @@
+from __future__ import division
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import normal_init
+
+from mmdet.core import (AnchorGenerator, anchor_target, anchor_inside_flags,
+                        ga_loc_target, ga_shape_target, delta2bbox,
+                        multi_apply, multiclass_nms)
+from mmdet.ops import DeformConv, MaskedConv2d
+from ..builder import build_loss
+from .anchor_head import AnchorHead
+from ..registry import HEADS
+from ..utils import bias_init_with_prob
+
+
+class FeatureAdaption(nn.Module):
+    """Feature Adaption Module.
+
+    Feature Adaption Module is implemented based on DCN v1.
+    It uses the anchor shape prediction instead of the feature map itself
+    to predict the offsets of the deformable conv layer.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        out_channels (int): Number of channels in the output feature map.
+        kernel_size (int): Deformable conv kernel size.
+        deformable_groups (int): Deformable conv group size.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 deformable_groups=4):
+        super(FeatureAdaption, self).__init__()
+        offset_channels = kernel_size * kernel_size * 2
+        self.conv_offset = nn.Conv2d(2,
+                                     deformable_groups * offset_channels,
+                                     1,
+                                     bias=False)
+        self.conv_adaption = DeformConv(in_channels,
+                                        out_channels,
+                                        kernel_size=kernel_size,
+                                        padding=(kernel_size - 1) // 2,
+                                        deformable_groups=deformable_groups)
+        self.relu = nn.ReLU(inplace=True)
+
+    def init_weights(self):
+        normal_init(self.conv_offset, std=0.1)
+        normal_init(self.conv_adaption, std=0.01)
+
+    def forward(self, x, shape):
+        offset = self.conv_offset(shape.detach())
+        x = self.relu(self.conv_adaption(x, offset))
+        return x
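To make the offset plumbing concrete, a small standalone sketch (not part of the patch) of the shapes involved: the 2-channel shape prediction is mapped by the 1x1 `conv_offset` to the offset channels that `DeformConv` consumes (kernel_size**2 * 2 per deformable group).

```python
import torch
import torch.nn as nn

deformable_groups, kernel_size = 4, 3
offset_channels = kernel_size * kernel_size * 2
conv_offset = nn.Conv2d(2, deformable_groups * offset_channels, 1, bias=False)

shape_pred = torch.randn(1, 2, 50, 68)     # (N, 2, H, W) from conv_shape
offset = conv_offset(shape_pred.detach())  # detached: no gradient to shapes
print(offset.shape)                        # torch.Size([1, 72, 50, 68])
```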
+
+
+@HEADS.register_module
+class GuidedAnchorHead(AnchorHead):
+    """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).
+
+    This GuidedAnchorHead will predict high-quality feature guided
+    anchors and locations where anchors will be kept in inference.
+    There are mainly 3 categories of bounding-boxes.
+
+    - Sampled candidate anchors of multiple scales and ratios
+      (9 per location by default), used for target assignment (approxes).
+    - The square boxes on which anchor shapes are predicted (squares).
+    - Guided anchors.
+
+    Please refer to https://arxiv.org/abs/1901.03278 for more details.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of channels of the feature map.
+        octave_base_scale (int): Base octave scale of each level of
+            feature map.
+        scales_per_octave (int): Number of octave scales in each level of
+            feature map.
+        octave_ratios (Iterable): Octave aspect ratios.
+        anchor_strides (Iterable): Anchor strides.
+        anchor_base_sizes (Iterable): Anchor base sizes.
+        anchoring_means (Iterable): Mean values of anchoring targets.
+        anchoring_stds (Iterable): Std values of anchoring targets.
+        target_means (Iterable): Mean values of regression targets.
+        target_stds (Iterable): Std values of regression targets.
+        deformable_groups (int): Group number of DCN in
+            FeatureAdaption module.
+        loc_filter_thr (float): Threshold to filter out unconcerned regions.
+        loss_loc (dict): Config of location loss.
+        loss_shape (dict): Config of anchor shape loss.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of bbox regression loss.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 feat_channels=256,
+                 octave_base_scale=8,
+                 scales_per_octave=3,
+                 octave_ratios=[0.5, 1.0, 2.0],
+                 anchor_strides=[4, 8, 16, 32, 64],
+                 anchor_base_sizes=None,
+                 anchoring_means=(.0, .0, .0, .0),
+                 anchoring_stds=(1.0, 1.0, 1.0, 1.0),
+                 target_means=(.0, .0, .0, .0),
+                 target_stds=(1.0, 1.0, 1.0, 1.0),
+                 deformable_groups=4,
+                 loc_filter_thr=0.01,
+                 loss_loc=dict(type='FocalLoss',
+                               use_sigmoid=True,
+                               gamma=2.0,
+                               alpha=0.25,
+                               loss_weight=1.0),
+                 loss_shape=dict(type='IoULoss', beta=0.2, loss_weight=1.0),
+                 loss_cls=dict(type='CrossEntropyLoss',
+                               use_sigmoid=True,
+                               loss_weight=1.0),
+                 loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                                loss_weight=1.0)):
+        super(AnchorHead, self).__init__()
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.octave_base_scale = octave_base_scale
+        self.scales_per_octave = scales_per_octave
+        self.octave_scales = octave_base_scale * np.array(
+            [2**(i / scales_per_octave) for i in range(scales_per_octave)])
+        self.approxs_per_octave = len(self.octave_scales) * len(octave_ratios)
+        self.octave_ratios = octave_ratios
+        self.anchor_strides = anchor_strides
+        self.anchor_base_sizes = list(
+            anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
+        self.anchoring_means = anchoring_means
+        self.anchoring_stds = anchoring_stds
+        self.target_means = target_means
+        self.target_stds = target_stds
+        self.deformable_groups = deformable_groups
+        self.loc_filter_thr = loc_filter_thr
+        self.approx_generators = []
+        self.square_generators = []
+        for anchor_base in self.anchor_base_sizes:
+            # Generators for approxs
+            self.approx_generators.append(
+                AnchorGenerator(anchor_base, self.octave_scales,
+                                self.octave_ratios))
+            # Generators for squares
+            self.square_generators.append(
+                AnchorGenerator(anchor_base, [self.octave_base_scale], [1.0]))
+        # one anchor per location
+        self.num_anchors = 1
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        self.cls_focal_loss = loss_cls['type'] in ['FocalLoss']
+        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes - 1
+        else:
+            self.cls_out_channels = self.num_classes
+
+        # build losses
+        self.loss_loc = build_loss(loss_loc)
+        self.loss_shape = build_loss(loss_shape)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+
+        self._init_layers()
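With the defaults above (octave_base_scale=8, scales_per_octave=3, three ratios), each location has 9 approxs but only a single square anchor. A quick check of the scales the constructor computes:

```python
import numpy as np

octave_base_scale, scales_per_octave = 8, 3
octave_scales = octave_base_scale * np.array(
    [2**(i / scales_per_octave) for i in range(scales_per_octave)])
print(octave_scales)  # [ 8.         10.0793684  12.69920842]
# 3 scales x 3 ratios = 9 approxs per location; the square generator uses
# only (octave_base_scale, ratio 1.0), i.e. one square per location
```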
+
+    def _init_layers(self):
+        self.relu = nn.ReLU(inplace=True)
+        self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1)
+        self.conv_shape = nn.Conv2d(self.feat_channels, self.num_anchors * 2,
+                                    1)
+        self.feature_adaption = FeatureAdaption(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deformable_groups=self.deformable_groups)
+        self.conv_cls = MaskedConv2d(self.feat_channels,
+                                     self.num_anchors * self.cls_out_channels,
+                                     1)
+        self.conv_reg = MaskedConv2d(self.feat_channels, self.num_anchors * 4,
+                                     1)
+
+    def init_weights(self):
+        normal_init(self.conv_cls, std=0.01)
+        normal_init(self.conv_reg, std=0.01)
+
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.conv_loc, std=0.01, bias=bias_cls)
+        normal_init(self.conv_shape, std=0.01)
+
+        self.feature_adaption.init_weights()
+
+    def forward_single(self, x):
+        loc_pred = self.conv_loc(x)
+        shape_pred = self.conv_shape(x)
+        x = self.feature_adaption(x, shape_pred)
+        # masked conv is only used during inference for speed-up
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        else:
+            mask = None
+        cls_score = self.conv_cls(x, mask)
+        bbox_pred = self.conv_reg(x, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def forward(self, feats):
+        return multi_apply(self.forward_single, feats)
+
+    def get_sampled_approxs(self, featmap_sizes, img_metas, cfg):
+        """Get sampled approxs and inside flags according to feature map
+        sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            cfg (dict): RPN train configs.
+
+        Returns:
+            tuple: approxes of each image, inside flags of each image
+        """
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since feature map sizes of all images are the same, we only compute
+        # approxes once
+        multi_level_approxs = []
+        for i in range(num_levels):
+            approxs = self.approx_generators[i].grid_anchors(
+                featmap_sizes[i], self.anchor_strides[i])
+            multi_level_approxs.append(approxs)
+        approxs_list = [multi_level_approxs for _ in range(num_imgs)]
+
+        # for each image, we compute inside flags of multi level approxes
+        inside_flag_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_flags = []
+            multi_level_approxs = approxs_list[img_id]
+            for i in range(num_levels):
+                approxs = multi_level_approxs[i]
+                anchor_stride = self.anchor_strides[i]
+                feat_h, feat_w = featmap_sizes[i]
+                h, w, _ = img_meta['pad_shape']
+                valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h)
+                valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w)
+                flags = self.approx_generators[i].valid_flags(
+                    (feat_h, feat_w), (valid_feat_h, valid_feat_w))
+                inside_flags_list = []
+                # use j here to avoid shadowing the level index i
+                for j in range(self.approxs_per_octave):
+                    split_valid_flags = flags[j::self.approxs_per_octave]
+                    split_approxs = approxs[j::self.approxs_per_octave, :]
+                    inside_flags = anchor_inside_flags(
+                        split_approxs, split_valid_flags,
+                        img_meta['img_shape'][:2], cfg.allowed_border)
+                    inside_flags_list.append(inside_flags)
+                # inside_flag for a position is true if any anchor in this
+                # position is true
+                inside_flags = (
+                    torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
+                multi_level_flags.append(inside_flags)
+            inside_flag_list.append(multi_level_flags)
+        return approxs_list, inside_flag_list
+
+    def get_anchors(self,
+                    featmap_sizes,
+                    shape_preds,
+                    loc_preds,
+                    img_metas,
+                    use_loc_filter=False):
+        """Get squares and guided anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            shape_preds (list[tensor]): Multi-level shape predictions.
+            loc_preds (list[tensor]): Multi-level location predictions.
+            img_metas (list[dict]): Image meta info.
+            use_loc_filter (bool): Use loc filter or not.
+
+        Returns:
+            tuple: square approxs of each image, guided anchors of each
+                image, loc masks of each image
+        """
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since feature map sizes of all images are the same, we only compute
+        # squares once
+        multi_level_squares = []
+        for i in range(num_levels):
+            squares = self.square_generators[i].grid_anchors(
+                featmap_sizes[i], self.anchor_strides[i])
+            multi_level_squares.append(squares)
+        squares_list = [multi_level_squares for _ in range(num_imgs)]
+
+        # for each image, we compute multi level guided anchors
+        guided_anchors_list = []
+        loc_mask_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_guided_anchors = []
+            multi_level_loc_mask = []
+            for i in range(num_levels):
+                squares = squares_list[img_id][i]
+                shape_pred = shape_preds[i][img_id]
+                loc_pred = loc_preds[i][img_id]
+                guided_anchors, loc_mask = self.get_guided_anchors_single(
+                    squares,
+                    shape_pred,
+                    loc_pred,
+                    use_loc_filter=use_loc_filter)
+                multi_level_guided_anchors.append(guided_anchors)
+                multi_level_loc_mask.append(loc_mask)
+            guided_anchors_list.append(multi_level_guided_anchors)
+            loc_mask_list.append(multi_level_loc_mask)
+        return squares_list, guided_anchors_list, loc_mask_list
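The decoding in `get_guided_anchors_single` below fills only the (dw, dh) half of the deltas, so each square's center stays put and just its width and height are regressed. A simplified re-implementation for illustration (the real code calls `delta2bbox` with `anchoring_means`/`anchoring_stds`):

```python
import math
import torch

def decode_guided_anchor(squares, anchor_deltas, wh_ratio_clip=1e-6):
    """Toy version of delta2bbox for (dw, dh)-only deltas."""
    w = squares[:, 2] - squares[:, 0] + 1
    h = squares[:, 3] - squares[:, 1] + 1
    cx = (squares[:, 0] + squares[:, 2]) * 0.5
    cy = (squares[:, 1] + squares[:, 3]) * 0.5
    max_ratio = abs(math.log(wh_ratio_clip))
    dw = anchor_deltas[:, 0].clamp(min=-max_ratio, max=max_ratio)
    dh = anchor_deltas[:, 1].clamp(min=-max_ratio, max=max_ratio)
    new_w = w * dw.exp()
    new_h = h * dh.exp()  # the center is left untouched
    return torch.stack([cx - (new_w - 1) * 0.5, cy - (new_h - 1) * 0.5,
                        cx + (new_w - 1) * 0.5, cy + (new_h - 1) * 0.5],
                       dim=-1)

squares = torch.tensor([[0., 0., 63., 63.]])
print(decode_guided_anchor(squares, torch.tensor([[0.5, -0.5]])))
# wider and flatter than the 64x64 square, same center (31.5, 31.5)
```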
+
+    def get_guided_anchors_single(self,
+                                  squares,
+                                  shape_pred,
+                                  loc_pred,
+                                  use_loc_filter=False):
+        """Get guided anchors and loc masks for a single level.
+
+        Args:
+            squares (tensor): Squares of a single level.
+            shape_pred (tensor): Shape predictions of a single level.
+            loc_pred (tensor): Loc predictions of a single level.
+            use_loc_filter (bool): Use loc filter or not.
+
+        Returns:
+            tuple: guided anchors, location masks
+        """
+        # calculate location filtering mask
+        loc_pred = loc_pred.sigmoid().detach()
+        if use_loc_filter:
+            loc_mask = loc_pred >= self.loc_filter_thr
+        else:
+            loc_mask = loc_pred >= 0.0
+        mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_anchors)
+        mask = mask.contiguous().view(-1)
+        # calculate guided anchors
+        squares = squares[mask]
+        anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
+            -1, 2).detach()[mask]
+        bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
+        bbox_deltas[:, 2:] = anchor_deltas
+        guided_anchors = delta2bbox(squares,
+                                    bbox_deltas,
+                                    self.anchoring_means,
+                                    self.anchoring_stds,
+                                    wh_ratio_clip=1e-6)
+        return guided_anchors, mask
+
+    def loss_shape_single(self, shape_pred, bbox_anchors, bbox_gts,
+                          anchor_weights, anchor_total_num):
+        shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
+        bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
+        bbox_gts = bbox_gts.contiguous().view(-1, 4)
+        anchor_weights = anchor_weights.contiguous().view(-1, 4)
+        bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
+        bbox_deltas[:, 2:] += shape_pred
+        # filter out negative samples to speed-up weighted_bounded_iou_loss
+        inds = torch.nonzero(anchor_weights[:, 0] > 0).squeeze(1)
+        bbox_deltas_ = bbox_deltas[inds]
+        bbox_anchors_ = bbox_anchors[inds]
+        bbox_gts_ = bbox_gts[inds]
+        anchor_weights_ = anchor_weights[inds]
+        pred_anchors_ = delta2bbox(bbox_anchors_,
+                                   bbox_deltas_,
+                                   self.anchoring_means,
+                                   self.anchoring_stds,
+                                   wh_ratio_clip=1e-6)
+        loss_shape = self.loss_shape(pred_anchors_,
+                                     bbox_gts_,
+                                     anchor_weights_,
+                                     avg_factor=anchor_total_num)
+        return loss_shape
+
+    def loss_loc_single(self, loc_pred, loc_target, loc_weight,
+                        loc_avg_factor, cfg):
+        loss_loc = self.loss_loc(loc_pred.reshape(-1, 1),
+                                 loc_target.reshape(-1, 1).long(),
+                                 loc_weight.reshape(-1, 1),
+                                 avg_factor=loc_avg_factor)
+        return loss_loc
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             shape_preds,
+             loc_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             cfg,
+             gt_bboxes_ignore=None):
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == len(self.approx_generators)
+
+        # get loc targets
+        loc_targets, loc_weights, loc_avg_factor = ga_loc_target(
+            gt_bboxes,
+            featmap_sizes,
+            self.octave_base_scale,
+            self.anchor_strides,
+            center_ratio=cfg.center_ratio,
+            ignore_ratio=cfg.ignore_ratio)
+
+        # get sampled approxes
+        approxs_list, inside_flag_list = self.get_sampled_approxs(
+            featmap_sizes, img_metas, cfg)
+        # get squares and guided anchors
+        squares_list, guided_anchors_list, _ = self.get_anchors(
+            featmap_sizes, shape_preds, loc_preds, img_metas)
+
+        # get shape targets
+        sampling = hasattr(cfg, 'ga_sampler')
+        shape_targets = ga_shape_target(approxs_list,
+                                        inside_flag_list,
+                                        squares_list,
+                                        gt_bboxes,
+                                        img_metas,
+                                        self.approxs_per_octave,
+                                        cfg,
+                                        sampling=sampling)
+        if shape_targets is None:
+            return None
+        (bbox_anchors_list, bbox_gts_list, anchor_weights_list, anchor_fg_num,
+         anchor_bg_num) = shape_targets
+        anchor_total_num = (anchor_fg_num if not sampling else anchor_fg_num +
+                            anchor_bg_num)
+
+        # get anchor targets
+        sampling = not self.cls_focal_loss
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = anchor_target(guided_anchors_list,
+                                        inside_flag_list,
+                                        gt_bboxes,
+                                        img_metas,
+                                        self.target_means,
+                                        self.target_stds,
+                                        cfg,
+                                        gt_bboxes_ignore_list=gt_bboxes_ignore,
+
gt_labels_list=gt_labels, + label_channels=label_channels, + sampling=sampling) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + num_total_samples = (num_total_pos if self.cls_focal_loss else + num_total_pos + num_total_neg) + + # get classification and bbox regression losses + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples, + cfg=cfg) + + # get anchor location loss + losses_loc, = multi_apply(self.loss_loc_single, + loc_preds, + loc_targets, + loc_weights, + loc_avg_factor=loc_avg_factor, + cfg=cfg) + + # get anchor shape loss + losses_shape, = multi_apply(self.loss_shape_single, + shape_preds, + bbox_anchors_list, + bbox_gts_list, + anchor_weights_list, + anchor_total_num=anchor_total_num) + return dict(loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_shape=losses_shape, + loss_loc=losses_loc) + + def get_bboxes(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + img_metas, + cfg, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len( + loc_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + # get guided anchors + _, guided_anchors, loc_masks = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + img_metas, + use_loc_filter=not self.training) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + guided_anchor_list = [ + guided_anchors[img_id][i].detach() for i in range(num_levels) + ] + loc_mask_list = [ + loc_masks[img_id][i].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + guided_anchor_list, + loc_mask_list, img_shape, + scale_factor, cfg, rescale) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + mlvl_masks, + img_shape, + scale_factor, + cfg, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds, + mlvl_anchors, + mlvl_masks): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + # reshape scores and bbox_pred + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask, :] + bbox_pred = bbox_pred[mask, :] + if scores.dim() == 0: + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + bbox_pred = bbox_pred.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. 
scores + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, 1:].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bboxes = delta2bbox(anchors, bbox_pred, self.target_means, + self.target_stds, img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + if self.use_sigmoid_cls: + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([padding, mlvl_scores], dim=1) + # multi class NMS + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels diff --git a/mmdet/models/losses/__init__.py b/mmdet/models/losses/__init__.py index efe40ea..3b00245 100644 --- a/mmdet/models/losses/__init__.py +++ b/mmdet/models/losses/__init__.py @@ -1,5 +1,6 @@ from .cross_entropy_loss import CrossEntropyLoss from .focal_loss import FocalLoss from .smooth_l1_loss import SmoothL1Loss +from .iou_loss import IoULoss -__all__ = ['CrossEntropyLoss', 'FocalLoss', 'SmoothL1Loss'] +__all__ = ['CrossEntropyLoss', 'FocalLoss', 'SmoothL1Loss', 'IoULoss'] diff --git a/mmdet/models/losses/iou_loss.py b/mmdet/models/losses/iou_loss.py new file mode 100644 index 0000000..8c9d602 --- /dev/null +++ b/mmdet/models/losses/iou_loss.py @@ -0,0 +1,26 @@ +import torch.nn as nn +from mmdet.core import weighted_iou_loss + +from ..registry import LOSSES + + +@LOSSES.register_module +class IoULoss(nn.Module): + + def __init__(self, style='naive', beta=0.2, eps=1e-3, loss_weight=1.0): + super(IoULoss, self).__init__() + self.style = style + self.beta = beta + self.eps = eps + self.loss_weight = loss_weight + + def forward(self, pred, target, weight, *args, **kwargs): + loss = self.loss_weight * weighted_iou_loss( + pred, + target, + weight, + beta=self.beta, + eps=self.eps, + *args, + **kwargs) + return loss diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py index b3cbc26..34467bf 100644 --- a/mmdet/ops/__init__.py +++ b/mmdet/ops/__init__.py @@ -6,11 +6,13 @@ from .roi_align import RoIAlign, roi_align from .roi_pool import RoIPool, roi_pool from .sigmoid_focal_loss import SigmoidFocalLoss, sigmoid_focal_loss +from .masked_conv import MaskedConv2d __all__ = [ 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'DeformConv', 'DeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', 'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv', - 'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss' + 'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss', + 'MaskedConv2d' ] diff --git a/mmdet/ops/masked_conv/__init__.py b/mmdet/ops/masked_conv/__init__.py new file mode 100644 index 0000000..feab953 --- /dev/null +++ b/mmdet/ops/masked_conv/__init__.py @@ -0,0 +1,4 @@ +from .functions.masked_conv import masked_conv2d +from .modules.masked_conv import MaskedConv2d + +__all__ = ['masked_conv2d', 'MaskedConv2d'] diff --git a/mmdet/ops/masked_conv/functions/__init__.py b/mmdet/ops/masked_conv/functions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mmdet/ops/masked_conv/functions/masked_conv.py 
b/mmdet/ops/masked_conv/functions/masked_conv.py
new file mode 100644
index 0000000..41ba5a7
--- /dev/null
+++ b/mmdet/ops/masked_conv/functions/masked_conv.py
@@ -0,0 +1,55 @@
+import math
+import torch
+from torch.autograd import Function
+from torch.nn.modules.utils import _pair
+from .. import masked_conv2d_cuda
+
+
+class MaskedConv2dFunction(Function):
+
+    @staticmethod
+    def forward(ctx, features, mask, weight, bias, padding=0, stride=1):
+        assert mask.dim() == 3 and mask.size(0) == 1
+        assert features.dim() == 4 and features.size(0) == 1
+        assert features.size()[2:] == mask.size()[1:]
+        pad_h, pad_w = _pair(padding)
+        stride_h, stride_w = _pair(stride)
+        if stride_h != 1 or stride_w != 1:
+            raise ValueError(
+                'Stride must be 1 in masked_conv2d currently.')
+        if not features.is_cuda:
+            raise NotImplementedError
+
+        out_channel, in_channel, kernel_h, kernel_w = weight.size()
+
+        batch_size = features.size(0)
+        out_h = int(
+            math.floor((features.size(2) + 2 * pad_h -
+                        (kernel_h - 1) - 1) / stride_h + 1))
+        out_w = int(
+            math.floor((features.size(3) + 2 * pad_w -
+                        (kernel_w - 1) - 1) / stride_w + 1))
+        mask_inds = torch.nonzero(mask[0] > 0)
+        mask_h_idx = mask_inds[:, 0].contiguous()
+        mask_w_idx = mask_inds[:, 1].contiguous()
+        data_col = features.new_zeros(in_channel * kernel_h * kernel_w,
+                                      mask_inds.size(0))
+        masked_conv2d_cuda.masked_im2col_forward(features, mask_h_idx,
+                                                 mask_w_idx, kernel_h,
+                                                 kernel_w, pad_h, pad_w,
+                                                 data_col)
+
+        masked_output = torch.addmm(1, bias[:, None], 1,
+                                    weight.view(out_channel, -1), data_col)
+        output = features.new_zeros(batch_size, out_channel, out_h, out_w)
+        masked_conv2d_cuda.masked_col2im_forward(masked_output, mask_h_idx,
+                                                 mask_w_idx, out_h, out_w,
+                                                 out_channel, output)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return (None, ) * 5
+
+
+masked_conv2d = MaskedConv2dFunction.apply
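As a sanity reference (not part of the patch), the same masked computation can be mimicked in pure PyTorch: gather im2col columns only at masked positions, multiply by the flattened weights, and scatter the results back, which is what the CUDA kernels below implement.

```python
import torch
import torch.nn.functional as F

def masked_conv2d_ref(features, mask, weight, bias, padding=1):
    # features: (1, C, H, W), mask: (1, H, W); stride fixed to 1
    out_c = weight.size(0)
    n, c, h, w = features.shape
    cols = F.unfold(features, weight.shape[2:], padding=padding)
    keep = mask.view(-1) > 0
    data_col = cols[0][:, keep]                 # im2col only where masked
    out_masked = weight.view(out_c, -1) @ data_col + bias[:, None]
    output = features.new_zeros(n, out_c, h * w)
    output[0][:, keep] = out_masked             # col2im scatter
    return output.view(n, out_c, h, w)

x = torch.randn(1, 4, 8, 8)
m = (torch.rand(1, 8, 8) > 0.7).float()
wgt, b = torch.randn(5, 4, 3, 3), torch.randn(5)
ref = masked_conv2d_ref(x, m, wgt, b)
full = F.conv2d(x, wgt, b, padding=1) * m.unsqueeze(1)
assert torch.allclose(ref, full, atol=1e-5)
```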
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super(MaskedConv2d, + self).__init__(in_channels, out_channels, kernel_size, stride, + padding, dilation, groups, bias) + + def forward(self, input, mask=None): + if mask is None: # fallback to the normal Conv2d + return super(MaskedConv2d, self).forward(input) + else: + return masked_conv2d(input, mask, self.weight, self.bias, + self.padding) diff --git a/mmdet/ops/masked_conv/setup.py b/mmdet/ops/masked_conv/setup.py new file mode 100644 index 0000000..fdff5f2 --- /dev/null +++ b/mmdet/ops/masked_conv/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='masked_conv2d_cuda', + ext_modules=[ + CUDAExtension('masked_conv2d_cuda', [ + 'src/masked_conv2d_cuda.cpp', + 'src/masked_conv2d_kernel.cu', + ]), + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp b/mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp new file mode 100644 index 0000000..f9d5373 --- /dev/null +++ b/mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp @@ -0,0 +1,74 @@ +#include + +#include +#include + +int MaskedIm2colForwardLaucher(const at::Tensor im, const int height, + const int width, const int channels, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const at::Tensor mask_h_idx, + const at::Tensor mask_w_idx, const int mask_cnt, + at::Tensor col); + +int MaskedCol2imForwardLaucher(const at::Tensor col, const int height, + const int width, const int channels, + const at::Tensor mask_h_idx, + const at::Tensor mask_w_idx, const int mask_cnt, + at::Tensor im); + +#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) \ + AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +int masked_im2col_forward_cuda(const at::Tensor im, const at::Tensor mask_h_idx, + const at::Tensor mask_w_idx, const int kernel_h, + const int kernel_w, const int pad_h, + const int pad_w, at::Tensor col) { + CHECK_INPUT(im); + CHECK_INPUT(mask_h_idx); + CHECK_INPUT(mask_w_idx); + CHECK_INPUT(col); + // im: (n, ic, h, w), kernel size (kh, kw) + // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) + + int channels = im.size(1); + int height = im.size(2); + int width = im.size(3); + int mask_cnt = mask_h_idx.size(0); + + MaskedIm2colForwardLaucher(im, height, width, channels, kernel_h, kernel_w, + pad_h, pad_w, mask_h_idx, mask_w_idx, mask_cnt, + col); + + return 1; +} + +int masked_col2im_forward_cuda(const at::Tensor col, + const at::Tensor mask_h_idx, + const at::Tensor mask_w_idx, int height, + int width, int channels, at::Tensor im) { + CHECK_INPUT(col); + CHECK_INPUT(mask_h_idx); + CHECK_INPUT(mask_w_idx); + CHECK_INPUT(im); + // im: (n, ic, h, w), kernel size (kh, kw) + // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) + + int mask_cnt = mask_h_idx.size(0); + + MaskedCol2imForwardLaucher(col, height, width, channels, mask_h_idx, + mask_w_idx, mask_cnt, im); + + return 1; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("masked_im2col_forward", &masked_im2col_forward_cuda, + "masked_im2col forward (CUDA)"); + m.def("masked_col2im_forward", &masked_col2im_forward_cuda, + "masked_col2im forward (CUDA)"); +} \ No newline at end of file diff --git 
a/mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu b/mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu new file mode 100644 index 0000000..394af13 --- /dev/null +++ b/mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu @@ -0,0 +1,113 @@ +#include +#include + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +#define THREADS_PER_BLOCK 1024 + +inline int GET_BLOCKS(const int N) { + int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + int max_block_num = 65000; + return min(optimal_block_num, max_block_num); +} + +template +__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const long *mask_h_idx, + const long *mask_w_idx, const int mask_cnt, + scalar_t *data_col) { + // mask_cnt * channels + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_col = mask_h_idx[m_index]; + const int w_col = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col - pad_h; + const int w_offset = w_col - pad_w; + scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; + for (int i = 0; i < kernel_h; ++i) { + int h_im = h_offset + i; + for (int j = 0; j < kernel_w; ++j) { + int w_im = w_offset + j; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + *data_col_ptr = + (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; + } else { + *data_col_ptr = 0.0; + } + data_col_ptr += mask_cnt; + } + } + } +} + +int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height, + const int width, const int channels, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const at::Tensor mask_h_idx, + const at::Tensor mask_w_idx, const int mask_cnt, + at::Tensor top_data) { + const int output_size = mask_cnt * channels; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + bottom_data.type(), "MaskedIm2colLaucherForward", ([&] { + const scalar_t *bottom_data_ = bottom_data.data(); + const long *mask_h_idx_ = mask_h_idx.data(); + const long *mask_w_idx_ = mask_w_idx.data(); + scalar_t *top_data_ = top_data.data(); + MaskedIm2colForward + <<>>( + output_size, bottom_data_, height, width, kernel_h, kernel_w, + pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_); + })); + THCudaCheck(cudaGetLastError()); + return 1; +} + +template +__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, + const int height, const int width, + const int channels, const long *mask_h_idx, + const long *mask_w_idx, const int mask_cnt, + scalar_t *data_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_im = mask_h_idx[m_index]; + const int w_im = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + // int kernel_extent_w = (kernel_w - 1) + 1; + // int kernel_extent_h = (kernel_h - 1) + 1; + // compute the start and end of the output + data_im[(c_im * height + h_im) * width + w_im] = data_col[index]; + } +} + +int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height, + const int width, const int channels, + const at::Tensor mask_h_idx, + const at::Tensor mask_w_idx, const int mask_cnt, + at::Tensor top_data) { + const int output_size = mask_cnt * channels; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + bottom_data.type(), "MaskedCol2imLaucherForward", ([&] 
{ + const scalar_t *bottom_data_ = bottom_data.data(); + const long *mask_h_idx_ = mask_h_idx.data(); + const long *mask_w_idx_ = mask_w_idx.data(); + scalar_t *top_data_ = top_data.data(); + + MaskedCol2imForward + <<>>( + output_size, bottom_data_, height, width, channels, mask_h_idx_, + mask_w_idx_, mask_cnt, top_data_); + })); + THCudaCheck(cudaGetLastError()); + return 1; +}
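Finally, a hedged smoke test of the compiled op (assumes the CUDA extension was built, e.g. via the updated compile.sh): with an all-ones mask, `MaskedConv2d` should reproduce the plain `nn.Conv2d` forward.

```python
import torch
from mmdet.ops import MaskedConv2d

conv = MaskedConv2d(16, 32, 3, padding=1).cuda().eval()
x = torch.randn(1, 16, 32, 32).cuda()
mask = torch.ones(1, 32, 32).cuda()   # keep every location
with torch.no_grad():
    y_masked = conv(x, mask)          # routed through the CUDA op
    y_plain = conv(x)                 # mask=None -> nn.Conv2d.forward
assert torch.allclose(y_masked, y_plain, atol=1e-4)
```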