paper-implementations/src/rcnn/main.py at main · 0xMukesh/paper-implementations · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import cv2
import torch
from typing import Sequence
from cv2.typing import Rect
from collections import Counter

"""
In R-CNN,

i) Selective search is used to generate ~2k region proposals
ii) IoU (intersection over union) is applied on each of these region proposals to find the appropriate GT values
iii) The region proposals are resized and passed into the convolutional layers
iv) The output of the convolutional layers is then passed into the classification head and regression head
v) Cross entropy loss is used in the classification head and L2 loss is used in the regression head and their weighted sum acts like the total loss for the network
vi) Within the regression head, instead of determining the coordinates of the bounding box from scratch, it tries to predict the delta (transform)

g_x = p_x + p_w * t_x => t_x = (g_x - p_x)/p_w
g_y = p_y + p_h * t_y => t_y = (g_y - p_y)/p_h
g_w = p_w * exp(t_w) => t_w = log(g_w/p_w)
g_h = p_h * exp(t_h) => t_h = log(g_h/p_h)

The regression head tries to learn the t_x, t_y, t_w and t_h values.

vii) When there are multiple bounding boxes pointing to the same object, NMS (non-maximum suppression) is applied (during post-processing) to remove the redundant bounding boxes.
viii) mAP (mean average precision) is used to evaluate the object detection model.
"""

# Bounding box as a 4-tuple of floats; compute_iou unpacks it as (x, y, w, h).
Bbox = tuple[float, float, float, float]


def rect_to_bbox(r: Rect) -> Bbox:
    """Copy the first four entries of an OpenCV rect into a plain tuple.

    The result is hashable, so it can be used as a dictionary key.
    """
    first, second, third, fourth = r[0], r[1], r[2], r[3]
    return (first, second, third, fourth)


def selective_search(img_path: str, num_region_proposals: int):
    """Generate region proposals for an image with OpenCV selective search.

    Args:
        img_path: Path of the image file to load.
        num_region_proposals: Maximum number of proposals to return.

    Returns:
        The first ``num_region_proposals`` rects produced by the fast
        selective-search mode, in the order OpenCV returned them.

    Raises:
        OSError: If the image could not be read from ``img_path``.
    """
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside setBaseImage.
        raise OSError(f"could not read image: {img_path!r}")

    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
    ss.setBaseImage(img)
    ss.switchToSelectiveSearchFast()
    rects = ss.process()
    return rects[:num_region_proposals]


def compute_iou(box1: Bbox, box2: Bbox):
    """Return the intersection-over-union of two (x, y, w, h) boxes.

    The result is 0 when the union area is zero (both boxes are degenerate).
    """
    left_a, top_a, width_a, height_a = box1
    left_b, top_b, width_b, height_b = box2

    # Extent of the overlap along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(left_a + width_a, left_b + width_b) - max(left_a, left_b))
    overlap_h = max(0, min(top_a + height_a, top_b + height_b) - max(top_a, top_b))
    overlap = overlap_w * overlap_h

    combined = (width_a * height_a) + (width_b * height_b) - overlap
    return overlap / combined if combined != 0 else 0


def apply_iou_to_ss_output(rects: Sequence[Rect], gt: dict[Bbox, str]):
    """Assign each region proposal to its best-overlapping ground-truth box.

    For every proposal the ground-truth box with the highest IoU is selected
    (the first one wins on ties). The proposal is then recorded as:

    * ``(gt_bbox, gt_label)`` when the best IoU is at least 0.5,
    * ``(gt_bbox, "__background__")`` when the best IoU is in [0.3, 0.5),
    * omitted from the result otherwise.

    Returns a dict keyed by the proposal converted with ``rect_to_bbox``.
    """
    assignments: dict[Bbox, tuple[Bbox, str]] = {}

    for rect in rects:
        proposal = rect_to_bbox(rect)
        top_iou, top_bbox, top_label = 0.0, None, None

        for gt_bbox, gt_label in gt.items():
            overlap = compute_iou(proposal, gt_bbox)
            if overlap > top_iou:
                top_iou, top_bbox, top_label = overlap, gt_bbox, gt_label

        # Nothing overlapped this proposal at all.
        if top_label is None or top_bbox is None:
            continue

        if top_iou >= 0.5:
            assignments[proposal] = (top_bbox, top_label)
        elif top_iou >= 0.3:
            assignments[proposal] = (top_bbox, "__background__")

    return assignments


def apply_nms(
    predictions: list[tuple[int, float, float, float, float, float]],
    prob_threshold=0.2,
    iou_threshold=0.5,
):
    """Apply per-class non-maximum suppression to a list of predictions.

    Args:
        predictions: Rows of ``[class, probability, b0, b1, b2, b3]`` where the
            last four values describe the box.
        prob_threshold: Predictions whose probability is not strictly greater
            than this value are discarded before suppression.
        iou_threshold: A lower-scored box of the same class is dropped when its
            IoU with an already-kept box is at least this value.

    Returns:
        The kept predictions, ordered by decreasing probability.
    """
    # NOTE(review): the four box values are forwarded unchanged to compute_iou,
    # which unpacks boxes as (x, y, w, h). The earlier comment here labelled
    # them x1, y1, x2, y2 - confirm which format callers actually pass.

    # Filter and rank on the probability (index 1), not the class id (index 0).
    candidates = sorted(
        (box for box in predictions if box[1] > prob_threshold),
        key=lambda box: box[1],
        reverse=True,
    )
    output = []

    while candidates:
        chosen_box = candidates[0]
        # Keep boxes of other classes, and same-class boxes that overlap the
        # chosen one by less than the IoU threshold.
        candidates = [
            box
            for box in candidates[1:]
            if box[0] != chosen_box[0]
            or compute_iou(box[2:], chosen_box[2:]) < iou_threshold
        ]
        output.append(chosen_box)

    return output


def compute_map(
    pred_bboxes, gt_bboxes, iou_threshold=0.5, prob_threshold=0.2, num_classes=20
):
    """Compute mean average precision (mAP) at a single IoU threshold.

    Args:
        pred_bboxes: Rows of ``[image_idx, class_pred, pred_score, b0, b1, b2, b3]``.
        gt_bboxes: Rows of ``[image_idx, class_idx, b0, b1, b2, b3]``.
        iou_threshold: A detection counts as a true positive only when its best
            IoU with a not-yet-matched ground-truth box of the same image and
            class is strictly greater than this value.
        prob_threshold: Detections scoring below this value are ignored.
        num_classes: Class ids ``0 .. num_classes - 1`` are evaluated.

    Returns:
        A scalar tensor holding the mean of the per-class average precisions.
        Classes without any ground-truth box are excluded from the mean; if no
        class has ground truth the result is ``tensor(0.)``.
    """
    # NOTE(review): the four box values are forwarded unchanged to compute_iou,
    # which unpacks boxes as (x, y, w, h). The earlier comment here labelled
    # them x1, x2, y1, y2 - confirm which format callers actually pass.

    average_precisions: list[torch.Tensor] = []

    for c in range(num_classes):
        detections = [
            pred for pred in pred_bboxes if pred[1] == c and pred[2] >= prob_threshold
        ]

        # Group this class's ground truths by image once, instead of
        # re-filtering the whole list for every detection.
        gts_by_image: dict = {}
        for gt in gt_bboxes:
            if gt[1] == c:
                gts_by_image.setdefault(gt[0], []).append(gt)

        total_true_bboxes = sum(len(gts) for gts in gts_by_image.values())
        if total_true_bboxes == 0:
            # Recall would be 0/0 = NaN and poison the mean; AP is undefined
            # for a class with no ground truth, so leave it out.
            continue

        # Tracks which ground-truth boxes have already been claimed.
        matched = {img: [False] * len(gts) for img, gts in gts_by_image.items()}

        detections.sort(
            key=lambda x: x[2], reverse=True
        )  # sort with decreasing order of predicted score

        true_positives = torch.zeros(len(detections))
        false_positives = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            pred_bbox = detection[3:]
            best_iou = 0
            best_gt_idx = -1

            for gt_idx, gt in enumerate(gts_by_image.get(detection[0], [])):
                iou = compute_iou(gt[2:], pred_bbox)

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = gt_idx

            is_match = (
                best_gt_idx >= 0
                and best_iou > iou_threshold
                and not matched[detection[0]][best_gt_idx]
            )
            if is_match:
                true_positives[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = True
            else:
                # Low overlap, no candidate, or a duplicate of a claimed box.
                false_positives[detection_idx] = 1

        true_positives_cumsum = torch.cumsum(true_positives, dim=0)
        false_positive_cumsum = torch.cumsum(false_positives, dim=0)

        # Every detection is either a TP or an FP, so the denominator is >= 1.
        precision = torch.divide(
            true_positives_cumsum, true_positives_cumsum + false_positive_cumsum
        )
        recall = torch.divide(true_positives_cumsum, total_true_bboxes)
        # adds the "initial" trapezoid while calculating AUC
        precision = torch.cat((torch.tensor([1.0]), precision), dim=0)
        recall = torch.cat((torch.tensor([0.0]), recall), dim=0)

        ap = torch.trapezoid(precision, recall)
        average_precisions.append(ap)

    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)


def compute_map_range(
    pred_bboxes, gt_bboxes, prob_threshold, num_classes, start_iou, stop_iou, step_size
):
    """Average ``compute_map`` over a grid of IoU thresholds.

    The thresholds are ``start_iou + i * step_size`` for every ``i`` that keeps
    the value before ``stop_iou``. The interval is half-open like the built-in
    ``range``, but fractional values such as 0.5 / 0.95 / 0.05 are accepted.
    Pass a ``stop_iou`` slightly past the last threshold you want included.

    Args:
        pred_bboxes: Predictions, forwarded to ``compute_map``.
        gt_bboxes: Ground-truth boxes, forwarded to ``compute_map``.
        prob_threshold: Score threshold, forwarded to ``compute_map``.
        num_classes: Number of classes, forwarded to ``compute_map``.
        start_iou: First IoU threshold.
        stop_iou: Exclusive end of the threshold grid.
        step_size: Spacing between consecutive thresholds (may be negative).

    Returns:
        The mean of the mAP values obtained at each threshold.

    Raises:
        ValueError: If ``step_size`` is zero or the grid contains no threshold.
    """
    import math

    if step_size == 0:
        raise ValueError("step_size must not be zero")

    # round() absorbs float noise so a ratio that is mathematically an integer
    # is not pushed up to the next one by ceil().
    num_ious = max(0, math.ceil(round((stop_iou - start_iou) / step_size, 9)))
    if num_ious == 0:
        raise ValueError(
            f"empty IoU threshold range: start={start_iou}, stop={stop_iou}, "
            f"step={step_size}"
        )

    # Multiply instead of accumulating to avoid drift in the thresholds.
    thresholds = [start_iou + i * step_size for i in range(num_ious)]

    total_map = 0
    for iou in thresholds:
        total_map += compute_map(
            pred_bboxes, gt_bboxes, iou, prob_threshold, num_classes
        )

    return total_map / num_ious