yjh0410 2 years ago
parent commit a0ccea1a60

+ 5 - 5
README.md

@@ -59,13 +59,13 @@ For example:
 python train.py --cuda -d voc --root path/to/VOCdevkit -v yolov1 -bs 16 --max_epoch 150 --wp_epoch 1 --eval_epoch 10 --fp16 --ema --multi_scale
 ```
 
-| Model  | Scale |  IP  | Epoch | mAP  | FPS<sup>3090<br>FP32-bs1 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
+| Model  | Scale |  IP  | Epoch | AP50 | FPS<sup>3090<br>FP32-bs1 | FLOPs<br><sup>(G) | Params<br><sup>(M) | Weight |
 |--------|-------|------|-------|------|--------------------------|-------------------|--------------------|--------|
 | YOLOv1 |  640  |  √   |  150  | 76.7 |                          |   37.8            |   21.3             | [ckpt](https://github.com/yjh0410/PyTorch_YOLO_Tutorial/releases/download/yolo_tutorial_ckpy/yolov1_voc.pth) |
-| YOLOv2 |  640  |  √   |  150  |      |                          |   53.9            |   30.9             |  |
-| YOLOv3 |  640  |  √   |  150  |      |                          |   167.4           |   54.9             |  |
+| YOLOv2 |  640  |  √   |  150  | 79.8 |                          |   53.9            |   30.9             | [ckpt](https://github.com/yjh0410/PyTorch_YOLO_Tutorial/releases/download/yolo_tutorial_ckpy/yolov2_voc.pth) |
+| YOLOv3 |  640  |  √   |  150  | 82.0 |                          |   167.4           |   54.9             | [ckpt](https://github.com/yjh0410/PyTorch_YOLO_Tutorial/releases/download/yolo_tutorial_ckpy/yolov3_voc.pth) |
 | YOLOv4 |  640  |  √   |  150  |      |                          |                   |                    |  |
-| YOLOX  |  640  |  ×   |  300  |      |                          |                   |                    |  |
+| YOLOX  |  640  |  √   |  150  |      |                          |                   |                    |  |
 
 *All models are trained with ImageNet pretrained weights (IP). All FLOPs are measured with a 640x640 image size on the VOC2007 test set. The FPS is measured with batch size 1 on a 3090 GPU, from model inference to the NMS operation.*
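
The FLOPs/Params columns are easiest to reproduce with an off-the-shelf profiler. A minimal sketch, assuming the third-party `thop` package and a stand-in model (the repo's own counter may differ; substitute the built YOLO model in eval mode):

```python
import torch
import torch.nn as nn
from thop import profile  # third-party counter; an assumption, not the repo's tool

# stand-in network; swap in the real detector in eval mode
model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU()).eval()
x = torch.randn(1, 3, 640, 640)            # 640x640 input, matching the table
flops, params = profile(model, inputs=(x,))
print('FLOPs : %.2f G' % (flops / 1e9))    # table reports GFLOPs
print('Params: %.2f M' % (params / 1e6))   # table reports params in millions
```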
 
@@ -97,7 +97,7 @@ python train.py --cuda -d coco --root path/to/COCO -v yolov1 -bs 16 --max_epoch
 | YOLOv2 |  640  |  √   |  150  |                        |                         |  |
 | YOLOv3 |  640  |  √   |  300  |                        |                         |  |
 | YOLOv4 |  640  |  √   |  300  |                        |                         |  |
-| YOLOX  |  640  |  ×   |  300  |                        |                         |  |
+| YOLOX  |  640  |     |  300  |                        |                         |  |
 
 *All models are trained with ImageNet pretrained weights (IP). All FLOPs are measured with a 640x640 image size on COCO val2017. The FPS is measured with batch size 1 on a 3090 GPU, from model inference to the NMS operation.*
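
For the FPS column, a minimal timing sketch under the stated conditions (batch size 1, CUDA, synchronized timing); the stand-in module below is hypothetical, and in practice the timed region would span model inference through NMS:

```python
import time
import torch
import torch.nn as nn

# stand-in for the detector; substitute the real model + NMS step
model = nn.Conv2d(3, 16, 3, padding=1).cuda().eval()
x = torch.randn(1, 3, 640, 640).cuda()     # batch size 1

with torch.no_grad():
    for _ in range(10):                    # warm-up iterations
        model(x)
    torch.cuda.synchronize()               # drain queued kernels before timing
    t0 = time.time()
    for _ in range(100):
        model(x)
    torch.cuda.synchronize()
print('FPS: %.1f' % (100 / (time.time() - t0)))
```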
 

+ 3 - 9
models/yolov2/yolov2.py

@@ -107,9 +107,9 @@ class YOLOv2(nn.Module):
     def postprocess(self, obj_pred, cls_pred, reg_pred, anchors):
         """
         Input:
-            conf_pred: (Tensor) [H*W*A, 1]
-            cls_pred:  (Tensor) [H*W*A, C]
-            reg_pred:  (Tensor) [H*W*A, 4]
+            obj_pred: (Tensor) [H*W*A, 1]
+            cls_pred: (Tensor) [H*W*A, C]
+            reg_pred: (Tensor) [H*W*A, 4]
         """
         # (H x W x A x C,)
         scores = torch.sqrt(obj_pred.sigmoid() * cls_pred.sigmoid()).flatten()
@@ -136,12 +136,6 @@ class YOLOv2(nn.Module):
 # decode the bounding boxes and normalize them: [H*W*A, 4]
         bboxes = self.decode_boxes(anchors, reg_pred)
 
-        # threshold
-        keep_idxs = scores.gt(self.conf_thresh)
-        scores = scores[keep_idxs]
-        labels = labels[keep_idxs]
-        bboxes = bboxes[keep_idxs]
-
         # to cpu & numpy
         scores = scores.cpu().numpy()
         labels = labels.cpu().numpy()
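
A note on the fused score in this hunk: `obj_pred` and `cls_pred` are combined by a geometric mean and flattened over both anchors and classes, so anchor index and class label must later be recovered from flat indices. A toy sketch of that bookkeeping (the repo's actual top-k selection may differ):

```python
import torch

HWA, C = 4, 3                              # toy sizes: H*W*A anchors, C classes
obj_pred = torch.randn(HWA, 1)
cls_pred = torch.randn(HWA, C)

# geometric mean of objectness and class confidence, as in the diff
scores = torch.sqrt(obj_pred.sigmoid() * cls_pred.sigmoid()).flatten()  # (HWA*C,)

topk_scores, topk_idxs = scores.topk(5)    # keep the 5 best detections
anchor_idxs = topk_idxs // C               # which anchor each score came from
labels = topk_idxs % C                     # which class each score belongs to
```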

+ 0 - 6
models/yolov3/yolov3.py

@@ -184,12 +184,6 @@ class YOLOv3(nn.Module):
         labels = torch.cat(all_labels)
         bboxes = torch.cat(all_bboxes)
 
-        # threshold
-        keep_idxs = scores.gt(self.conf_thresh)
-        scores = scores[keep_idxs]
-        labels = labels[keep_idxs]
-        bboxes = bboxes[keep_idxs]
-
         # to cpu & numpy
         scores = scores.cpu().numpy()
         labels = labels.cpu().numpy()

+ 0 - 6
models/yolov4/yolov4.py

@@ -184,12 +184,6 @@ class YOLOv4(nn.Module):
         labels = torch.cat(all_labels)
         bboxes = torch.cat(all_bboxes)
 
-        # threshold
-        keep_idxs = scores.gt(self.conf_thresh)
-        scores = scores[keep_idxs]
-        labels = labels[keep_idxs]
-        bboxes = bboxes[keep_idxs]
-
         # to cpu & numpy
         scores = scores.cpu().numpy()
         labels = labels.cpu().numpy()
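
The same confidence-threshold block is deleted from YOLOv2, YOLOv3, and YOLOv4 in this commit; presumably the filtering now happens earlier in `postprocess` (the surrounding code is not part of this diff), making a second pass after concatenation redundant. For reference, what the removed lines did, as a self-contained sketch with dummy tensors:

```python
import torch

conf_thresh = 0.1                          # typical default; an assumption here
scores = torch.rand(100)
labels = torch.randint(0, 20, (100,))
bboxes = torch.rand(100, 4)

# keep only detections whose score exceeds the threshold
keep_idxs = scores.gt(conf_thresh)         # boolean mask, same as scores > conf_thresh
scores = scores[keep_idxs]
labels = labels[keep_idxs]
bboxes = bboxes[keep_idxs]
```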

+ 230 - 0
utils/kmeans_anchor.py

@@ -0,0 +1,230 @@
+import numpy as np
+import random
+import argparse
+import os
+import sys
+sys.path.append('..')
+
+from dataset.voc import VOCDetection
+from dataset.coco import COCODataset
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='k-means for anchor boxes')
+    parser.add_argument('--root', default='/mnt/share/ssd2/dataset',
+                        help='data root')
+    parser.add_argument('-d', '--dataset', default='coco',
+                        help='voc, coco.')
+    parser.add_argument('-na', '--num_anchorbox', default=5, type=int,
+                        help='number of anchor boxes.')
+    parser.add_argument('-size', '--img_size', default=416, type=int,
+                        help='input size.')
+    parser.add_argument('-ab', '--absolute', action='store_true', default=False,
+                        help='absolute coords.')
+    return parser.parse_args()
+
+args = parse_args()
+
+
+class Box():
+    def __init__(self, x, y, w, h):
+        self.x = x
+        self.y = y
+        self.w = w
+        self.h = h
+
+
+def iou(box1, box2):
+    x1, y1, w1, h1 = box1.x, box1.y, box1.w, box1.h
+    x2, y2, w2, h2 = box2.x, box2.y, box2.w, box2.h
+
+    S_1 = w1 * h1
+    S_2 = w2 * h2
+
+    xmin_1, ymin_1 = x1 - w1 / 2, y1 - h1 / 2
+    xmax_1, ymax_1 = x1 + w1 / 2, y1 + h1 / 2
+    xmin_2, ymin_2 = x2 - w2 / 2, y2 - h2 / 2
+    xmax_2, ymax_2 = x2 + w2 / 2, y2 + h2 / 2
+
+    I_w = min(xmax_1, xmax_2) - max(xmin_1, xmin_2)
+    I_h = min(ymax_1, ymax_2) - max(ymin_1, ymin_2)
+    if I_w < 0 or I_h < 0:
+        return 0
+    I = I_w * I_h
+
+    IoU = I / (S_1 + S_2 - I)
+
+    return IoU
+
+
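+# quick check of iou(): co-centered boxes Box(0, 0, 2, 2) and Box(0, 0, 1, 4)
+# give I_w = 1, I_h = 2, so I = 2; union = 4 + 4 - 2 = 6; IoU = 2 / 6 ≈ 0.33
+
+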
+def init_centroids(boxes, n_anchors):
+    centroids = []
+    boxes_num = len(boxes)
+
+    centroid_index = int(np.random.choice(boxes_num, 1)[0])
+    centroids.append(boxes[centroid_index])
+    print(centroids[0].w, centroids[0].h)
+
+    for centroid_index in range(0, n_anchors-1):
+        sum_distance = 0
+        distance_thresh = 0
+        distance_list = []
+        cur_sum = 0
+
+        for box in boxes:
+            min_distance = 1
+            for centroid_i, centroid in enumerate(centroids):
+                distance = (1 - iou(box, centroid))
+                if distance < min_distance:
+                    min_distance = distance
+            sum_distance += min_distance
+            distance_list.append(min_distance)
+
+        distance_thresh = sum_distance * np.random.random()
+
+        for i in range(0, boxes_num):
+            cur_sum += distance_list[i]
+            if cur_sum > distance_thresh:
+                centroids.append(boxes[i])
+                print(boxes[i].w, boxes[i].h)
+                break
+    return centroids
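+
+# note: the distance-weighted pick above is k-means++-style seeding, where
+# boxes far (by 1 - IoU) from the already-chosen centroids are likelier picks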
+
+
+def do_kmeans(n_anchors, boxes, centroids):
+    loss = 0
+    groups = []
+    new_centroids = []
+
+    for i in range(n_anchors):
+        groups.append([])
+        new_centroids.append(Box(0, 0, 0, 0))
+    
+    for box in boxes:
+        min_distance = 1
+        group_index = 0
+        for centroid_index, centroid in enumerate(centroids):
+            distance = (1 - iou(box, centroid))
+            if distance < min_distance:
+                min_distance = distance
+                group_index = centroid_index
+        groups[group_index].append(box)
+        loss += min_distance
+        new_centroids[group_index].w += box.w
+        new_centroids[group_index].h += box.h
+
+    for i in range(n_anchors):
+        new_centroids[i].w /= max(len(groups[i]), 1)
+        new_centroids[i].h /= max(len(groups[i]), 1)
+
+    return new_centroids, groups, loss
+
+
+def anchor_box_kmeans(total_gt_boxes, n_anchors, loss_convergence, iters, plus=True):
+    """
+        Use k-means to find appropriate anchor boxes for the training dataset.
+        Input:
+            total_gt_boxes: list of Box -> all ground-truth boxes.
+            n_anchors : int -> the number of anchor boxes.
+            loss_convergence : float -> threshold of iterating convergence.
+            iters: int -> the maximum number of k-means iterations.
+        Output:
+            centroids : list of Box -> anchor sizes as (w, h) pairs.
+    """
+    boxes = total_gt_boxes
+    centroids = []
+    if plus:
+        centroids = init_centroids(boxes, n_anchors)
+    else:
+        total_indices = range(len(boxes))
+        sample_indices = random.sample(total_indices, n_anchors)
+        for i in sample_indices:
+            centroids.append(boxes[i])
+
+    # iterate k-means
+    centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids)
+    iterations = 1
+    while True:
+        centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids)
+        iterations += 1
+        print("Loss = %f" % loss)
+        if abs(old_loss - loss) < loss_convergence or iterations > iters:
+            break
+        old_loss = loss
+
+        for centroid in centroids:
+            print(centroid.w, centroid.h)
+    
+    print("k-means result : ") 
+    for centroid in centroids:
+        if args.absolute:
+            print("w, h: ", round(centroid.w, 2), round(centroid.h, 2), 
+                "area: ", round(centroid.w, 2) * round(centroid.h, 2))
+        else:
+            print("w, h: ", round(centroid.w / 32, 2), round(centroid.h / 32, 2), 
+                "area: ", round(centroid.w / 32, 2) * round(centroid.h / 32, 2))
+    
+    return centroids
+
+
+if __name__ == "__main__":
+
+    n_anchors = args.num_anchorbox
+    img_size = args.img_size
+    
+    loss_convergence = 1e-6
+    iters_n = 1000
+    
+    boxes = []
+    if args.dataset == 'voc':
+        data_root = os.path.join(args.root, 'VOCdevkit')
+        dataset = VOCDetection(data_dir=data_root)
+
+        # VOC
+        for i in range(len(dataset)):
+            if i % 5000 == 0:
+                print('Loading voc data [%d / %d]' % (i+1, len(dataset)))
+
+            # For VOC
+            img, _ = dataset.pull_image(i)
+            img_h, img_w = img.shape[:2]
+            _, annotation = dataset.pull_anno(i)
+
+            # prepare bbox data
+            for box_and_label in annotation:
+                box = box_and_label[:-1]
+                xmin, ymin, xmax, ymax = box
+                bw = (xmax - xmin) / max(img_w, img_h) * img_size
+                bh = (ymax - ymin) / max(img_w, img_h) * img_size
+                # check bbox
+                if bw < 1.0 or bh < 1.0:
+                    continue
+                boxes.append(Box(0, 0, bw, bh))
+
+    elif args.dataset == 'coco':
+        data_root = os.path.join(args.root, 'COCO')
+        dataset = COCODataset(data_dir=data_root, img_size=img_size)
+
+        for i in range(len(dataset)):
+            if i % 5000 == 0:
+                print('Loading coco data [%d / %d]' % (i+1, len(dataset)))
+
+            # For COCO
+            img, _ = dataset.pull_image(i)
+            img_h, img_w = img.shape[:2]
+            annotation = dataset.pull_anno(i)
+
+            # prepare bbox data
+            for box_and_label in annotation:
+                box = box_and_label[:-1]
+                xmin, ymin, xmax, ymax = box
+                bw = (xmax - xmin) / max(img_w, img_h) * img_size
+                bh = (ymax - ymin) / max(img_w, img_h) * img_size
+                # check bbox
+                if bw < 1.0 or bh < 1.0:
+                    continue
+                boxes.append(Box(0, 0, bw, bh))
+
+    print("Number of all bboxes: ", len(boxes))
+    print("Start k-means !")
+    centroids = anchor_box_kmeans(boxes, n_anchors, loss_convergence, iters_n, plus=True)
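
Typical invocation of the new script (run from inside `utils/`, since it does `sys.path.append('..')`; the script appends `VOCdevkit` or `COCO` to `--root`, so the placeholder path below is their parent directory, and all flags are as defined in `parse_args`):

```
cd utils
python kmeans_anchor.py --root /path/to/datasets -d voc -na 5 -size 416
```

Without `-ab`, the printed (w, h) pairs are divided by the stride 32 into grid units; with it, they stay in input-image pixels.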