import numpy as np
import random
import argparse
import os
import sys
sys.path.append('..')

from dataset.voc import VOCDetection
from dataset.coco import COCODataset
def parse_args():
    parser = argparse.ArgumentParser(description='k-means for anchor boxes')
    parser.add_argument('--root', default='/mnt/share/ssd2/dataset',
                        help='data root')
    parser.add_argument('-d', '--dataset', default='coco',
                        help='voc, coco')
    parser.add_argument('-na', '--num_anchorbox', default=5, type=int,
                        help='number of anchor boxes')
    parser.add_argument('-size', '--img_size', default=416, type=int,
                        help='input size')
    parser.add_argument('-ab', '--absolute', action='store_true', default=False,
                        help='print anchors in absolute pixels instead of stride-32 units')
    return parser.parse_args()


args = parse_args()

class Box:
    """A bounding box in center format: (center x, center y, width, height)."""
    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

def iou(box1, box2):
    """Intersection-over-union of two boxes given in center format."""
    x1, y1, w1, h1 = box1.x, box1.y, box1.w, box1.h
    x2, y2, w2, h2 = box2.x, box2.y, box2.w, box2.h

    S_1 = w1 * h1
    S_2 = w2 * h2

    # convert centers/sizes to corner coordinates
    xmin_1, ymin_1 = x1 - w1 / 2, y1 - h1 / 2
    xmax_1, ymax_1 = x1 + w1 / 2, y1 + h1 / 2
    xmin_2, ymin_2 = x2 - w2 / 2, y2 - h2 / 2
    xmax_2, ymax_2 = x2 + w2 / 2, y2 + h2 / 2

    I_w = min(xmax_1, xmax_2) - max(xmin_1, xmin_2)
    I_h = min(ymax_1, ymax_2) - max(ymin_1, ymin_2)
    if I_w < 0 or I_h < 0:
        # the boxes do not overlap
        return 0
    I = I_w * I_h

    IoU = I / (S_1 + S_2 - I)
    return IoU
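
# Quick sanity check (illustrative, not part of the original script): two 1x1
# boxes whose centers are half a unit apart overlap in a 0.5 x 1.0 region,
# so IoU = 0.5 / (1 + 1 - 0.5) = 1/3:
#
#   >>> iou(Box(0.0, 0.0, 1.0, 1.0), Box(0.5, 0.0, 1.0, 1.0))
#   0.3333333333333333
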
def init_centroids(boxes, n_anchors):
    """k-means++-style seeding: pick the first centroid uniformly at random,
    then pick each subsequent centroid with probability proportional to its
    (1 - IoU) distance from the nearest centroid chosen so far."""
    centroids = []
    boxes_num = len(boxes)

    centroid_index = int(np.random.choice(boxes_num, 1)[0])
    centroids.append(boxes[centroid_index])
    print(centroids[0].w, centroids[0].h)

    for _ in range(n_anchors - 1):
        sum_distance = 0
        distance_list = []
        cur_sum = 0

        # distance of every box to its nearest existing centroid
        for box in boxes:
            min_distance = 1
            for centroid in centroids:
                distance = 1 - iou(box, centroid)
                if distance < min_distance:
                    min_distance = distance
            sum_distance += min_distance
            distance_list.append(min_distance)

        # sample the next centroid with probability proportional to distance
        distance_thresh = sum_distance * np.random.random()
        for i in range(boxes_num):
            cur_sum += distance_list[i]
            if cur_sum > distance_thresh:
                centroids.append(boxes[i])
                print(boxes[i].w, boxes[i].h)
                break

    return centroids
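
# For instance (illustrative numbers): if the distances of two candidate boxes
# to their nearest chosen centroid are [0.1, 0.9], then sum_distance = 1.0 and
# a uniform draw exceeds 0.1 with probability 0.9, so the farther box is
# selected ~90% of the time -- the k-means++ bias toward spread-out seeds.
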
def do_kmeans(n_anchors, boxes, centroids):
    """One k-means iteration: assign every box to the nearest centroid under
    the (1 - IoU) distance, then recompute each centroid as the mean width and
    height of its group. Returns the new centroids, the groups, and the loss."""
    loss = 0
    groups = []
    new_centroids = []
    for i in range(n_anchors):
        groups.append([])
        new_centroids.append(Box(0, 0, 0, 0))

    # assignment step
    for box in boxes:
        min_distance = 1
        group_index = 0
        for centroid_index, centroid in enumerate(centroids):
            distance = 1 - iou(box, centroid)
            if distance < min_distance:
                min_distance = distance
                group_index = centroid_index
        groups[group_index].append(box)
        loss += min_distance
        new_centroids[group_index].w += box.w
        new_centroids[group_index].h += box.h

    # update step: average w/h per group, guarding against empty groups
    for i in range(n_anchors):
        new_centroids[i].w /= max(len(groups[i]), 1)
        new_centroids[i].h /= max(len(groups[i]), 1)

    return new_centroids, groups, loss  # optionally normalize: loss / len(boxes)
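
# For instance (illustrative numbers): if boxes of size (10, 10) and (12, 14)
# are the only members of group 0, the updated centroid 0 has
# w = (10 + 12) / 2 = 11 and h = (10 + 14) / 2 = 12.
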
def anchor_box_kmeans(total_gt_boxes, n_anchors, loss_convergence, iters, plus=True):
    """
    Use k-means to find appropriate anchor boxes for the training dataset.
    Input:
        total_gt_boxes : list of Box -> all ground-truth boxes.
        n_anchors : int -> the number of anchor boxes.
        loss_convergence : float -> convergence threshold on the loss.
        iters : int -> the maximum number of k-means iterations.
    Output:
        centroids : list of Box -> the anchors [(w1, h1), (w2, h2), ..., (wn, hn)].
    """
    boxes = total_gt_boxes
    centroids = []
    if plus:
        # k-means++-style seeding
        centroids = init_centroids(boxes, n_anchors)
    else:
        # plain uniform random seeding
        total_indexs = range(len(boxes))
        sample_indexs = random.sample(total_indexs, n_anchors)
        for i in sample_indexs:
            centroids.append(boxes[i])

    # iterate k-means until the loss converges or the iteration budget runs out
    centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids)
    iterations = 1
    while True:
        centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids)
        iterations += 1
        print("Loss = %f" % loss)
        if abs(old_loss - loss) < loss_convergence or iterations > iters:
            break
        old_loss = loss

        for centroid in centroids:
            print(centroid.w, centroid.h)

    print("k-means result : ")
    for centroid in centroids:
        if args.absolute:
            # anchor sizes in absolute pixels at the network input size
            print("w, h: ", round(centroid.w, 2), round(centroid.h, 2),
                  "area: ", round(centroid.w, 2) * round(centroid.h, 2))
        else:
            # anchor sizes in units of the stride-32 feature map
            print("w, h: ", round(centroid.w / 32, 2), round(centroid.h / 32, 2),
                  "area: ", round(centroid.w / 32, 2) * round(centroid.h / 32, 2))

    return centroids
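
# Minimal usage sketch (synthetic boxes, no dataset required; the numbers are
# placeholders): cluster four boxes into two anchors.
#
#   fake_boxes = [Box(0, 0, w, h) for w, h in [(30, 60), (32, 64), (96, 90), (100, 104)]]
#   anchors = anchor_box_kmeans(fake_boxes, n_anchors=2, loss_convergence=1e-6,
#                               iters=100, plus=True)
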
if __name__ == "__main__":
    n_anchors = args.num_anchorbox
    img_size = args.img_size

    loss_convergence = 1e-6
    iters_n = 1000

    boxes = []
    if args.dataset == 'voc':
        data_root = os.path.join(args.root, 'VOCdevkit')
        dataset = VOCDetection(data_dir=data_root)
        # VOC
        for i in range(len(dataset)):
            if i % 5000 == 0:
                print('Loading voc data [%d / %d]' % (i+1, len(dataset)))
            img, _ = dataset.pull_image(i)
            img_h, img_w = img.shape[:2]
            _, annotation = dataset.pull_anno(i)
            # prepare bbox data: rescale each box to the network input size,
            # preserving aspect ratio via the longer image side
            for box_and_label in annotation:
                box = box_and_label[:-1]
                xmin, ymin, xmax, ymax = box
                bw = (xmax - xmin) / max(img_w, img_h) * img_size
                bh = (ymax - ymin) / max(img_w, img_h) * img_size
                # skip degenerate boxes
                if bw < 1.0 or bh < 1.0:
                    continue
                boxes.append(Box(0, 0, bw, bh))

    elif args.dataset == 'coco':
        data_root = os.path.join(args.root, 'COCO')
        dataset = COCODataset(data_dir=data_root, img_size=img_size)
        # COCO
        for i in range(len(dataset)):
            if i % 5000 == 0:
                print('Loading coco data [%d / %d]' % (i+1, len(dataset)))
            img, _ = dataset.pull_image(i)
            img_h, img_w = img.shape[:2]
            annotation = dataset.pull_anno(i)
            # prepare bbox data (same rescaling as for VOC)
            for box_and_label in annotation:
                box = box_and_label[:-1]
                xmin, ymin, xmax, ymax = box
                bw = (xmax - xmin) / max(img_w, img_h) * img_size
                bh = (ymax - ymin) / max(img_w, img_h) * img_size
                # skip degenerate boxes
                if bw < 1.0 or bh < 1.0:
                    continue
                boxes.append(Box(0, 0, bw, bh))

    print("Number of all bboxes: ", len(boxes))
    print("Start k-means !")
    centroids = anchor_box_kmeans(boxes, n_anchors, loss_convergence, iters_n, plus=True)
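
# Example invocations (assuming this file is saved as kmeans_anchor.py; the
# --root path is a placeholder for wherever VOCdevkit/ or COCO/ lives):
#
#   python kmeans_anchor.py --root /path/to/datasets -d voc -na 5 -size 416
#   python kmeans_anchor.py --root /path/to/datasets -d coco -na 9 -size 608 -ab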