voc.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. """VOC Dataset Classes
  2. Original author: Francisco Massa
  3. https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
  4. Updated by: Ellis Brown, Max deGroot
  5. """
  6. import os.path as osp
  7. import random
  8. import torch.utils.data as data
  9. import cv2
  10. import numpy as np
  11. import xml.etree.ElementTree as ET
  12. try:
  13. from .data_augment import build_transform
  14. from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
  15. except:
  16. from data_augment import build_transform
  17. from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
  18. VOC_CLASSES = ( # always index 0
  19. 'aeroplane', 'bicycle', 'bird', 'boat',
  20. 'bottle', 'bus', 'car', 'cat', 'chair',
  21. 'cow', 'diningtable', 'dog', 'horse',
  22. 'motorbike', 'person', 'pottedplant',
  23. 'sheep', 'sofa', 'train', 'tvmonitor')
  24. class VOCAnnotationTransform(object):
  25. """Transforms a VOC annotation into a Tensor of bbox coords and label index
  26. Initilized with a dictionary lookup of classnames to indexes
  27. Arguments:
  28. class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
  29. (default: alphabetic indexing of VOC's 20 classes)
  30. keep_difficult (bool, optional): keep difficult instances or not
  31. (default: False)
  32. height (int): height
  33. width (int): width
  34. """
  35. def __init__(self, class_to_ind=None, keep_difficult=False):
  36. self.class_to_ind = class_to_ind or dict(
  37. zip(VOC_CLASSES, range(len(VOC_CLASSES))))
  38. self.keep_difficult = keep_difficult
  39. def __call__(self, target):
  40. """
  41. Arguments:
  42. target (annotation) : the target annotation to be made usable
  43. will be an ET.Element
  44. Returns:
  45. a list containing lists of bounding boxes [bbox coords, class name]
  46. """
  47. res = []
  48. for obj in target.iter('object'):
  49. difficult = int(obj.find('difficult').text) == 1
  50. if not self.keep_difficult and difficult:
  51. continue
  52. name = obj.find('name').text.lower().strip()
  53. bbox = obj.find('bndbox')
  54. pts = ['xmin', 'ymin', 'xmax', 'ymax']
  55. bndbox = []
  56. for i, pt in enumerate(pts):
  57. cur_pt = int(bbox.find(pt).text) - 1
  58. # scale height or width
  59. cur_pt = cur_pt if i % 2 == 0 else cur_pt
  60. bndbox.append(cur_pt)
  61. label_idx = self.class_to_ind[name]
  62. bndbox.append(label_idx)
  63. res += [bndbox] # [x1, y1, x2, y2, label_ind]
  64. return res # [[x1, y1, x2, y2, label_ind], ... ]
  65. class VOCDetection(data.Dataset):
  66. """VOC Detection Dataset Object
  67. input is image, target is annotation
  68. Arguments:
  69. root (string): filepath to VOCdevkit folder.
  70. image_set (string): imageset to use (eg. 'train', 'val', 'test')
  71. transform (callable, optional): transformation to perform on the
  72. input image
  73. target_transform (callable, optional): transformation to perform on the
  74. target `annotation`
  75. (eg: take in caption string, return tensor of word indices)
  76. dataset_name (string, optional): which dataset to load
  77. (default: 'VOC2007')
  78. """
  79. def __init__(self,
  80. img_size=640,
  81. data_dir=None,
  82. image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
  83. trans_config=None,
  84. transform=None,
  85. is_train=False):
  86. self.root = data_dir
  87. self.img_size = img_size
  88. self.image_set = image_sets
  89. self.target_transform = VOCAnnotationTransform()
  90. self._annopath = osp.join('%s', 'Annotations', '%s.xml')
  91. self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
  92. self.ids = list()
  93. self.is_train = is_train
  94. for (year, name) in image_sets:
  95. rootpath = osp.join(self.root, 'VOC' + year)
  96. for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')):
  97. self.ids.append((rootpath, line.strip()))
  98. # augmentation
  99. self.transform = transform
  100. self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
  101. self.mixup_prob = trans_config['mixup_prob'] if trans_config else 0.0
  102. self.trans_config = trans_config
  103. print('==============================')
  104. print('use Mosaic Augmentation: {}'.format(self.mosaic_prob))
  105. print('use Mixup Augmentation: {}'.format(self.mixup_prob))
  106. print('==============================')
  107. def __getitem__(self, index):
  108. image, target, deltas = self.pull_item(index)
  109. return image, target, deltas
  110. def __len__(self):
  111. return len(self.ids)
  112. def load_image_target(self, index):
  113. # load an image
  114. img_id = self.ids[index]
  115. image = cv2.imread(self._imgpath % img_id)
  116. height, width, channels = image.shape
  117. # laod an annotation
  118. anno = ET.parse(self._annopath % img_id).getroot()
  119. if self.target_transform is not None:
  120. anno = self.target_transform(anno)
  121. # guard against no boxes via resizing
  122. anno = np.array(anno).reshape(-1, 5)
  123. target = {
  124. "boxes": anno[:, :4],
  125. "labels": anno[:, 4],
  126. "orig_size": [height, width]
  127. }
  128. return image, target
  129. def load_mosaic(self, index):
  130. # load 4x mosaic image
  131. index_list = np.arange(index).tolist() + np.arange(index+1, len(self.ids)).tolist()
  132. id1 = index
  133. id2, id3, id4 = random.sample(index_list, 3)
  134. indexs = [id1, id2, id3, id4]
  135. # load images and targets
  136. image_list = []
  137. target_list = []
  138. for index in indexs:
  139. img_i, target_i = self.load_image_target(index)
  140. image_list.append(img_i)
  141. target_list.append(target_i)
  142. # Mosaic
  143. if self.trans_config['mosaic_type'] == 'yolov5_mosaic':
  144. image, target = yolov5_mosaic_augment(
  145. image_list, target_list, self.img_size, self.trans_config, self.is_train)
  146. return image, target
  147. def load_mixup(self, origin_image, origin_target):
  148. # YOLOv5 type Mixup
  149. if self.trans_config['mixup_type'] == 'yolov5_mixup':
  150. new_index = np.random.randint(0, len(self.ids))
  151. new_image, new_target = self.load_mosaic(new_index)
  152. image, target = yolov5_mixup_augment(
  153. origin_image, origin_target, new_image, new_target)
  154. # YOLOX type Mixup
  155. elif self.trans_config['mixup_type'] == 'yolox_mixup':
  156. new_index = np.random.randint(0, len(self.ids))
  157. new_image, new_target = self.load_image_target(new_index)
  158. image, target = yolox_mixup_augment(
  159. origin_image, origin_target, new_image, new_target, self.img_size, self.trans_config['mixup_scale'])
  160. return image, target
  161. def pull_item(self, index):
  162. if random.random() < self.mosaic_prob:
  163. # load a mosaic image
  164. mosaic = True
  165. image, target = self.load_mosaic(index)
  166. else:
  167. mosaic = False
  168. # load an image and target
  169. image, target = self.load_image_target(index)
  170. # MixUp
  171. if random.random() < self.mixup_prob:
  172. image, target = self.load_mixup(image, target)
  173. # augment
  174. image, target, deltas = self.transform(image, target, mosaic)
  175. return image, target, deltas
  176. def pull_image(self, index):
  177. '''Returns the original image object at index in PIL form
  178. Note: not using self.__getitem__(), as any transformations passed in
  179. could mess up this functionality.
  180. Argument:
  181. index (int): index of img to show
  182. Return:
  183. PIL img
  184. '''
  185. img_id = self.ids[index]
  186. return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id
  187. def pull_anno(self, index):
  188. '''Returns the original annotation of image at index
  189. Note: not using self.__getitem__(), as any transformations passed in
  190. could mess up this functionality.
  191. Argument:
  192. index (int): index of img to get annotation of
  193. Return:
  194. list: [img_id, [(label, bbox coords),...]]
  195. eg: ('001718', [('dog', (96, 13, 438, 332))])
  196. '''
  197. img_id = self.ids[index]
  198. anno = ET.parse(self._annopath % img_id).getroot()
  199. gt = self.target_transform(anno, 1, 1)
  200. return img_id[1], gt
  201. if __name__ == "__main__":
  202. import argparse
  203. from data_augment import build_transform
  204. parser = argparse.ArgumentParser(description='VOC-Dataset')
  205. # opt
  206. parser.add_argument('--root', default='D:\\python_work\\object-detection\\dataset\\VOCdevkit',
  207. help='data root')
  208. args = parser.parse_args()
  209. is_train = True
  210. img_size = 640
  211. yolov5_trans_config = {
  212. 'aug_type': 'yolov5',
  213. # Basic Augment
  214. 'degrees': 0.0,
  215. 'translate': 0.2,
  216. 'scale': 0.9,
  217. 'shear': 0.0,
  218. 'perspective': 0.0,
  219. 'hsv_h': 0.015,
  220. 'hsv_s': 0.7,
  221. 'hsv_v': 0.4,
  222. # Mosaic & Mixup
  223. 'mosaic_prob': 1.0,
  224. 'mixup_prob': 0.15,
  225. 'mosaic_type': 'yolov5_mosaic',
  226. 'mixup_type': 'yolov5_mixup',
  227. 'mixup_scale': [0.5, 1.5]
  228. }
  229. yolox_trans_config = {
  230. 'aug_type': 'yolov5',
  231. # Basic Augment
  232. 'degrees': 0.0,
  233. 'translate': 0.2,
  234. 'scale': 0.9,
  235. 'shear': 0.0,
  236. 'perspective': 0.0,
  237. 'hsv_h': 0.015,
  238. 'hsv_s': 0.7,
  239. 'hsv_v': 0.4,
  240. # Mosaic & Mixup
  241. 'mosaic_prob': 1.0,
  242. 'mixup_prob': 1.0,
  243. 'mosaic_type': 'yolov5_mosaic',
  244. 'mixup_type': 'yolox_mixup',
  245. 'mixup_scale': [0.5, 1.5]
  246. }
  247. ssd_trans_config = {
  248. 'aug_type': 'ssd',
  249. 'mosaic_prob': 0.0,
  250. 'mixup_prob': 0.0
  251. }
  252. transform = build_transform(img_size, yolox_trans_config, is_train)
  253. dataset = VOCDetection(
  254. img_size=img_size,
  255. data_dir=args.root,
  256. trans_config=yolox_trans_config,
  257. transform=transform,
  258. is_train=is_train
  259. )
  260. np.random.seed(0)
  261. class_colors = [(np.random.randint(255),
  262. np.random.randint(255),
  263. np.random.randint(255)) for _ in range(20)]
  264. print('Data length: ', len(dataset))
  265. for i in range(1000):
  266. image, target, deltas = dataset.pull_item(i)
  267. # to numpy
  268. image = image.permute(1, 2, 0).numpy()
  269. # to uint8
  270. image = image.astype(np.uint8)
  271. image = image.copy()
  272. img_h, img_w = image.shape[:2]
  273. boxes = target["boxes"]
  274. labels = target["labels"]
  275. for box, label in zip(boxes, labels):
  276. x1, y1, x2, y2 = box
  277. cls_id = int(label)
  278. color = class_colors[cls_id]
  279. # class name
  280. label = VOC_CLASSES[cls_id]
  281. image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2)
  282. # put the test on the bbox
  283. cv2.putText(image, label, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA)
  284. cv2.imshow('gt', image)
  285. # cv2.imwrite(str(i)+'.jpg', img)
  286. cv2.waitKey(0)