Faster RCNN --> GeneralizedRCNN --> nn.Module
GeneralizedRCNN is composed of four parts: transform, backbone, rpn, and roi_heads.

The transform stage does two main things:
- normalization: image pixels go from [0, 255] uint8 to normalized float32 (mean/std normalization);
- resize: both images and targets are resized.

```python
# GeneralizedRCNN.forward(...)
original_image_sizes = []
for img in images:
    val = img.shape[-2:]
    assert len(val) == 2
    original_image_sizes.append((val[0], val[1]))
images, targets = self.transform(images, targets)
```

Only after this resizing does the actual network pipeline begin. It consists of 4 steps (a condensed sketch of the complete forward pass follows them):
1. The transformed images are fed into the backbone module to extract feature maps:
```python
# GeneralizedRCNN.forward(...)
features = self.backbone(images.tensors)
```

2. The rpn module then generates the proposals and proposal_losses:
```python
# GeneralizedRCNN.forward(...)
proposals, proposal_losses = self.rpn(images, features, targets)
```

3. Next comes the roi_heads module (i.e., RoI pooling + classification):
```python
# GeneralizedRCNN.forward(...)
detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
```

4. Finally, transform.postprocess maps the detected boxes back to the original images via original_image_sizes (NMS itself has already been applied inside roi_heads):
```python
# GeneralizedRCNN.forward(...)
detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)
```
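Putting the four steps together, here is a condensed sketch of GeneralizedRCNN.forward, simplified from torchvision (input validation and some bookkeeping are omitted):

```python
from collections import OrderedDict
import torch

# Condensed sketch of GeneralizedRCNN.forward.
def forward(self, images, targets=None):
    # remember the pre-resize sizes so boxes can be mapped back later
    original_image_sizes = [img.shape[-2:] for img in images]

    images, targets = self.transform(images, targets)   # resize + normalize
    features = self.backbone(images.tensors)            # feature maps
    if isinstance(features, torch.Tensor):
        features = OrderedDict([('0', features)])       # unify the non-FPN case
    proposals, proposal_losses = self.rpn(images, features, targets)
    detections, detector_losses = self.roi_heads(
        features, proposals, images.image_sizes, targets)
    detections = self.transform.postprocess(
        detections, images.image_sizes, original_image_sizes)

    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    # training returns the loss dict; inference returns the detections
    return losses if self.training else detections
```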
FasterRCNN subclasses GeneralizedRCNN and supplies the transform, backbone, rpn, and roi_heads components:

```python
class FasterRCNN(GeneralizedRCNN):
    def __init__(self, backbone, num_classes=None,
                 # transform parameters
                 min_size=800, max_size=1333,
                 image_mean=None, image_std=None,
                 # RPN parameters
                 rpn_anchor_generator=None, rpn_head=None,
                 rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
                 rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
                 rpn_nms_thresh=0.7,
                 rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
                 rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
                 # Box parameters
                 box_roi_pool=None, box_head=None, box_predictor=None,
                 box_score_thresh=0.05, box_nms_thresh=0.5,
                 box_detections_per_img=100,
                 box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
                 box_batch_size_per_image=512, box_positive_fraction=0.25,
                 bbox_reg_weights=None):
        out_channels = backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(
                anchor_sizes, aspect_ratios
            )
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
            )

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

        roi_heads = RoIHeads(
            # Box
            box_roi_pool, box_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh,
            box_batch_size_per_image, box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh, box_nms_thresh, box_detections_per_img)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)

        super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
```

The transform component is implemented by GeneralizedRCNNTransform. The argument names make its two concerns obvious:
- resize-related parameters: min_size + max_size
- normalization-related parameters: image_mean + image_std

```python
# FasterRCNN.__init__(...)
if image_mean is None:
    image_mean = [0.485, 0.456, 0.406]
if image_std is None:
    image_std = [0.229, 0.224, 0.225]
transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
```
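To make the resize rule concrete, here is a small sketch; compute_scale_factor is a hypothetical helper, but the rule mirrors GeneralizedRCNNTransform.resize: the shorter side is scaled to min_size unless that would push the longer side past max_size.

```python
def compute_scale_factor(image_shape, min_size=800, max_size=1333):
    # Scale so the shorter side becomes min_size; if the longer side would
    # then exceed max_size, let the longer side cap the scale instead.
    h, w = image_shape
    scale = min_size / min(h, w)
    if max(h, w) * scale > max_size:
        scale = max_size / max(h, w)
    return scale

# e.g. a 600x1400 image: 800/600 ≈ 1.33 would make the long side 1867 > 1333,
# so the factor becomes 1333/1400 ≈ 0.95.
print(compute_scale_factor((600, 1400)))
```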
For the backbone, Faster RCNN uses a ResNet50 + FPN structure:

```python
# detection/faster_rcnn.py (291...)
def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
                            num_classes=91, pretrained_backbone=True, **kwargs):
    if pretrained:
        # no need to download the backbone if pretrained is set
        pretrained_backbone = False
    backbone = resnet_fpn_backbone('resnet50', pretrained_backbone)
    model = FasterRCNN(backbone, num_classes, **kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['fasterrcnn_resnet50_fpn_coco'],
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model
```
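A minimal usage sketch of this builder (assuming a torchvision version where the pretrained flag is available):

```python
import torch
import torchvision

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()  # inference mode: forward returns detections, not losses

images = [torch.rand(3, 480, 640)]  # list of CHW tensors in [0, 1]
with torch.no_grad():
    outputs = model(images)

# one dict per image, with 'boxes' (N, 4), 'labels' (N,), 'scores' (N,)
print(outputs[0].keys())
```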
The backbone is the feature extraction network, generally VGG, ResNet, MobileNet, or similar:

| model | backbone |
| --- | --- |
| Faster RCNN | ResNet50 + FPN |
| YOLOv3 | Darknet53 |
| SSD | VGG |

Next, we focus on the implementation of the rpn component:
```python
# FasterRCNN.__init__(194)
rpn = RegionProposalNetwork(
    rpn_anchor_generator, rpn_head,
    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
    rpn_batch_size_per_image, rpn_positive_fraction,
    rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)
```

First, rpn_anchor_generator:
- Purpose: generate the base anchors.
- Input: three aspect ratios (0.5, 1.0, 2.0) and five anchor sizes.
- Output: with FPN, each of the five feature levels is given one anchor size and the three aspect ratios, i.e., 3 base anchors per location per level (15 anchor shapes across all levels).

```python
# FasterRCNN.__init__(...)
if rpn_anchor_generator is None:
    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(
        anchor_sizes, aspect_ratios
    )
```

As mentioned earlier, because of the FPN the rpn receives multiple feature maps. For clarity, the discussion below follows a single feature map; the others are handled the same way.
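To see what one level's base anchors look like, here is a sketch; base_anchors_for is a hypothetical helper, but the ratio math mirrors AnchorGenerator's anchor construction:

```python
import torch

def base_anchors_for(size, aspect_ratios=(0.5, 1.0, 2.0)):
    # For each ratio r, height/width = r while the anchor area
    # stays roughly size**2 (area-preserving reshaping).
    scales = torch.tensor([float(size)])
    ratios = torch.tensor(aspect_ratios)
    h_ratios = torch.sqrt(ratios)
    w_ratios = 1.0 / h_ratios
    ws = (w_ratios[:, None] * scales[None, :]).view(-1)
    hs = (h_ratios[:, None] * scales[None, :]).view(-1)
    # (x1, y1, x2, y2) anchors centered at the origin
    return (torch.stack([-ws, -hs, ws, hs], dim=1) / 2).round()

print(base_anchors_for(32))
# tensor([[-23., -11.,  23.,  11.],   # ratio 0.5: wide box
#         [-16., -16.,  16.,  16.],   # ratio 1.0: square
#         [-11., -23.,  11.,  23.]])  # ratio 2.0: tall box
```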
Suppose we have an $h \times w$ feature map. First, the downsampling factor (stride) of this feature map relative to the input image is computed:
$$stride = \frac{image\_size}{feature\_size}$$
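For example, an 800×800 input paired with a 200×200 feature map (the highest-resolution level of ResNet50-FPN) gives $stride = 800 / 200 = 4$; the sizes here are just for illustration.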
Then an $h \times w$ grid is laid over the input image, with each cell of side length stride, as the following code shows:
```python
# AnchorGenerator.grid_anchors(...)
shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
```

The centers of the base_anchors are then shifted from the origin (0, 0) to the grid points, with one full set of base_anchors placed at every grid point. This populates the current feature_map with a large number of anchors.
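Concretely, grid_anchors combines these shifts with the base anchors by broadcasting; the snippet below is condensed from the same function and continues the code above (shift_x, shift_y, base_anchors come from the preceding steps):

```python
# one shifted copy of the base anchors per grid point
shifts = torch.stack([shift_x.reshape(-1), shift_y.reshape(-1),
                      shift_x.reshape(-1), shift_y.reshape(-1)], dim=1)
# (num_grid_points, 1, 4) + (1, num_base_anchors, 4) -> all anchors
anchors = (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
```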
A special note: the stride reflects the network's receptive field, and the network cannot detect boxes packed more densely than the feature_map allows! That is why anchors are only placed at the grid points (conversely, placing an anchor between two grid points would correspond to half a cell in the feature_map, which clearly makes no sense).
With the anchors in place, the next step is to give the network output heads that can judge, for each anchor, whether it contains an object, and also produce the four values $(d_x, d_y, d_w, d_h)$ needed for bounding box regression.
```python
class RPNHead(nn.Module):
    def __init__(self, in_channels, num_anchors):
        super(RPNHead, self).__init__()
        # 3x3 conv shared by both heads
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )
        # objectness score: one logit per anchor per location
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        # box regression: 4 offsets per anchor per location
        self.bbox_pred = nn.Conv2d(
            in_channels, num_anchors * 4, kernel_size=1, stride=1
        )

    def forward(self, x):
        logits = []
        bbox_reg = []
        for feature in x:  # one entry per FPN level
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))
            bbox_reg.append(self.bbox_pred(t))
        return logits, bbox_reg
```
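For each feature map, cls_logits has shape (N, num_anchors, H, W) and bbox_pred has shape (N, num_anchors * 4, H, W). The four regression values are turned into boxes with the standard Faster R-CNN parameterization, the same rule torchvision's BoxCoder applies; a sketch (clamping of $d_w, d_h$ is omitted):

```python
import torch

def decode_single(rel_codes, anchors):
    # rel_codes: (N, 4) predicted (dx, dy, dw, dh); anchors: (N, 4) in xyxy.
    widths = anchors[:, 2] - anchors[:, 0]
    heights = anchors[:, 3] - anchors[:, 1]
    ctr_x = anchors[:, 0] + 0.5 * widths
    ctr_y = anchors[:, 1] + 0.5 * heights

    dx, dy, dw, dh = rel_codes.unbind(dim=1)
    pred_ctr_x = dx * widths + ctr_x      # shift the anchor center
    pred_ctr_y = dy * heights + ctr_y
    pred_w = torch.exp(dw) * widths       # rescale width/height
    pred_h = torch.exp(dh) * heights

    # back to (x1, y1, x2, y2)
    return torch.stack([pred_ctr_x - 0.5 * pred_w,
                        pred_ctr_y - 0.5 * pred_h,
                        pred_ctr_x + 0.5 * pred_w,
                        pred_ctr_y + 0.5 * pred_h], dim=1)
```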