voc.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. """VOC Dataset Classes
  2. Original author: Francisco Massa
  3. https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
  4. Updated by: Ellis Brown, Max deGroot
  5. """
  6. import os.path as osp
  7. import random
  8. import torch.utils.data as data
  9. import cv2
  10. import numpy as np
  11. import xml.etree.ElementTree as ET
  try:
      # Package-relative import: used when this module is imported as part of its package.
      from .data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
  except ImportError:
      # Fallback for running this file directly as a script, where the relative
      # import above fails. Only ImportError is caught so that unrelated errors
      # (and KeyboardInterrupt) are not silently swallowed.
      from data_augment.yolov5_augment import yolov5_mosaic_augment, yolov5_mixup_augment, yolox_mixup_augment
  # The 20 VOC class names in alphabetical order. A name's position in this
  # tuple is the label index used by VOCAnnotationTransform's default lookup.
  VOC_CLASSES = (
      'aeroplane', 'bicycle', 'bird', 'boat',
      'bottle', 'bus', 'car', 'cat', 'chair',
      'cow', 'diningtable', 'dog', 'horse',
      'motorbike', 'person', 'pottedplant',
      'sheep', 'sofa', 'train', 'tvmonitor')
  class VOCAnnotationTransform(object):
      """Transforms a VOC annotation into a list of bbox coords and label index

      Initialized with a dictionary lookup of classnames to indexes

      Arguments:
          class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
              (default: position of each name in VOC_CLASSES)
          keep_difficult (bool, optional): keep difficult instances or not
              (default: False)
      """

      # Coordinate tags read from every <bndbox>, in output order.
      _BOX_TAGS = ('xmin', 'ymin', 'xmax', 'ymax')

      def __init__(self, class_to_ind=None, keep_difficult=False):
          # A falsy mapping (None or empty) selects the default VOC lookup.
          self.class_to_ind = class_to_ind or dict(
              zip(VOC_CLASSES, range(len(VOC_CLASSES))))
          self.keep_difficult = keep_difficult

      @staticmethod
      def _is_difficult(obj):
          """Return True when the object's <difficult> tag is present and equals 1."""
          tag = obj.find('difficult')
          # Some annotation files omit the tag; treat that as "not difficult"
          # instead of failing on `None.text`.
          if tag is None or tag.text is None or not tag.text.strip():
              return False
          return int(tag.text) == 1

      def __call__(self, target):
          """
          Arguments:
              target (annotation) : the target annotation to be made usable
                  will be an ET.Element
          Returns:
              a list containing one list per kept object:
              [x1, y1, x2, y2, label_ind]
          Raises:
              KeyError: if an object's class name is not in class_to_ind
          """
          res = []
          for obj in target.iter('object'):
              if not self.keep_difficult and self._is_difficult(obj):
                  continue
              name = obj.find('name').text.lower().strip()
              bbox = obj.find('bndbox')
              # Each coordinate is shifted by -1 (presumably converting VOC's
              # 1-based pixel indices to 0-based -- TODO confirm). Going through
              # float() also accepts values written as "12.0".
              bndbox = [int(float(bbox.find(pt).text)) - 1 for pt in self._BOX_TAGS]
              bndbox.append(self.class_to_ind[name])
              res.append(bndbox)  # [x1, y1, x2, y2, label_ind]
          return res  # [[x1, y1, x2, y2, label_ind], ... ]
  class VOCDetection(data.Dataset):
      """VOC Detection Dataset Object

      input is image, target is annotation

      Arguments:
          img_size (int): image size forwarded to the mosaic and YOLOX-mixup
              helpers (default: 640)
          data_dir (string): filepath to VOCdevkit folder, which holds the
              'VOC<year>' sub-folders.
          image_sets (iterable of (year, name)): image sets to use
              (eg. ('2007', 'trainval'))
          trans_config (dict, optional): augmentation settings. Keys read by
              this class: 'mosaic_prob', 'mixup_prob', 'mosaic_type',
              'mixup_type', 'mixup_scale'. When omitted, mosaic and mixup
              are disabled.
          transform (callable): called as transform(image, target, mosaic) and
              expected to return (image, target, deltas)
          is_train (bool): forwarded to the mosaic augmentation
      """

      def __init__(self,
                   img_size=640,
                   data_dir=None,
                   image_sets=(('2007', 'trainval'), ('2012', 'trainval')),
                   trans_config=None,
                   transform=None,
                   is_train=False):
          self.root = data_dir
          self.img_size = img_size
          self.image_set = image_sets
          self.target_transform = VOCAnnotationTransform()
          self._annopath = osp.join('%s', 'Annotations', '%s.xml')
          self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
          self.ids = list()
          self.is_train = is_train
          for (year, name) in image_sets:
              rootpath = osp.join(self.root, 'VOC' + year)
              list_file = osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')
              # `with` closes the file deterministically; blank lines are
              # skipped so they do not become empty image ids.
              with open(list_file) as f:
                  for line in f:
                      img_name = line.strip()
                      if img_name:
                          self.ids.append((rootpath, img_name))

          # augmentation
          self.transform = transform
          self.mosaic_prob = trans_config['mosaic_prob'] if trans_config else 0.0
          self.mixup_prob = trans_config['mixup_prob'] if trans_config else 0.0
          self.trans_config = trans_config

          print('==============================')
          print('use Mosaic Augmentation: {}'.format(self.mosaic_prob))
          print('use Mixup Augmentation: {}'.format(self.mixup_prob))
          print('==============================')

      def __getitem__(self, index):
          return self.pull_item(index)

      def __len__(self):
          return len(self.ids)

      def load_image_target(self, index):
          """Load the raw image and its un-augmented target for `index`.

          Returns:
              (image, target): image as read by cv2.imread; target is a dict
              with "boxes" (N x 4), "labels" (N,) and "orig_size" [height, width].
          Raises:
              FileNotFoundError: if the image cannot be read
          """
          # load an image
          img_id = self.ids[index]
          img_path = self._imgpath % img_id
          image = cv2.imread(img_path)
          # cv2.imread signals failure by returning None instead of raising.
          if image is None:
              raise FileNotFoundError('failed to read image: {}'.format(img_path))
          height, width = image.shape[:2]

          # load an annotation
          anno = ET.parse(self._annopath % img_id).getroot()
          if self.target_transform is not None:
              anno = self.target_transform(anno)

          # guard against no boxes via resizing
          anno = np.array(anno).reshape(-1, 5)
          target = {
              "boxes": anno[:, :4],
              "labels": anno[:, 4],
              "orig_size": [height, width]
          }

          return image, target

      def load_mosaic(self, index):
          """Build a 4-image mosaic from `index` plus three other random samples.

          Raises:
              ValueError: if trans_config['mosaic_type'] is not supported, or
                  if the dataset holds fewer than four images
          """
          # Draw three distinct positions among the "all indices except `index`"
          # sequence without materializing it: position j maps to j below
          # `index` and to j + 1 from `index` upwards.
          picks = random.sample(range(len(self.ids) - 1), 3)
          indexs = [index] + [j if j < index else j + 1 for j in picks]

          # load images and targets
          image_list = []
          target_list = []
          for idx in indexs:
              img_i, target_i = self.load_image_target(idx)
              image_list.append(img_i)
              target_list.append(target_i)

          # Mosaic
          mosaic_type = self.trans_config['mosaic_type']
          if mosaic_type == 'yolov5_mosaic':
              image, target = yolov5_mosaic_augment(
                  image_list, target_list, self.img_size, self.trans_config, self.is_train)
          else:
              raise ValueError('unsupported mosaic_type: {}'.format(mosaic_type))

          return image, target

      def load_mixup(self, origin_image, origin_target):
          """Mix `origin_image` with another randomly chosen sample.

          Raises:
              ValueError: if trans_config['mixup_type'] is not supported
          """
          mixup_type = self.trans_config['mixup_type']
          # YOLOv5 type Mixup: the partner is itself a mosaic image
          if mixup_type == 'yolov5_mixup':
              new_index = np.random.randint(0, len(self.ids))
              new_image, new_target = self.load_mosaic(new_index)
              image, target = yolov5_mixup_augment(
                  origin_image, origin_target, new_image, new_target)
          # YOLOX type Mixup: the partner is a plain image
          elif mixup_type == 'yolox_mixup':
              new_index = np.random.randint(0, len(self.ids))
              new_image, new_target = self.load_image_target(new_index)
              image, target = yolox_mixup_augment(
                  origin_image, origin_target, new_image, new_target,
                  self.img_size, self.trans_config['mixup_scale'])
          else:
              raise ValueError('unsupported mixup_type: {}'.format(mixup_type))

          return image, target

      def pull_item(self, index):
          """Load sample `index`, apply mosaic / mixup by probability, then transform.

          Returns:
              (image, target, deltas) as returned by self.transform
          """
          if random.random() < self.mosaic_prob:
              # load a mosaic image
              mosaic = True
              image, target = self.load_mosaic(index)
          else:
              mosaic = False
              # load an image and target
              image, target = self.load_image_target(index)

          # MixUp
          if random.random() < self.mixup_prob:
              image, target = self.load_mixup(image, target)

          # augment
          image, target, deltas = self.transform(image, target, mosaic)

          return image, target, deltas

      def pull_image(self, index):
          '''Returns the original image at index as read by cv2.imread

          Note: not using self.__getitem__(), as any transformations passed in
          could mess up this functionality.

          Argument:
              index (int): index of img to show
          Return:
              tuple: (image, img_id) where img_id is the (rootpath, name) pair
          '''
          img_id = self.ids[index]
          return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id

      def pull_anno(self, index):
          '''Returns the original annotation of image at index

          Note: not using self.__getitem__(), as any transformations passed in
          could mess up this functionality.

          Argument:
              index (int): index of img to get annotation of
          Return:
              tuple: (image name, output of self.target_transform)
          '''
          img_id = self.ids[index]
          anno = ET.parse(self._annopath % img_id).getroot()
          # The annotation transform takes the parsed XML root only; the former
          # extra (1, 1) arguments made this call raise TypeError.
          gt = self.target_transform(anno)
          return img_id[1], gt
  if __name__ == "__main__":
      # Visual smoke test: draws the ground-truth boxes of augmented samples
      # in an OpenCV window, one sample per key press.
      import argparse
      from build import build_transform

      parser = argparse.ArgumentParser(description='VOC-Dataset')

      # opt
      parser.add_argument('--root', default='D:\\python_work\\object-detection\\dataset\\VOCdevkit',
                          help='data root')

      args = parser.parse_args()

      is_train = False
      img_size = 640
      yolov5_trans_config = {
          'aug_type': 'yolov5',
          # Basic Augment
          'degrees': 0.0,
          'translate': 0.2,
          'scale': 0.9,
          'shear': 0.0,
          'perspective': 0.0,
          'hsv_h': 0.015,
          'hsv_s': 0.7,
          'hsv_v': 0.4,
          # Mosaic & Mixup
          'mosaic_prob': 1.0,
          'mixup_prob': 0.15,
          'mosaic_type': 'yolov5_mosaic',
          'mixup_type': 'yolov5_mixup',
          'mixup_scale': [0.5, 1.5]
      }
      # Alternative configurations; swap one in below to preview it.
      yolox_trans_config = {
          'aug_type': 'yolov5',
          # Basic Augment
          'degrees': 0.0,
          'translate': 0.2,
          'scale': 0.9,
          'shear': 0.0,
          'perspective': 0.0,
          'hsv_h': 0.015,
          'hsv_s': 0.7,
          'hsv_v': 0.4,
          # Mosaic & Mixup
          'mosaic_prob': 1.0,
          'mixup_prob': 1.0,
          'mosaic_type': 'yolov5_mosaic',
          'mixup_type': 'yolox_mixup',
          'mixup_scale': [0.5, 1.5]
      }
      ssd_trans_config = {
          'aug_type': 'ssd',
          'mosaic_prob': 0.0,
          'mixup_prob': 0.0
      }

      transform = build_transform(img_size, yolov5_trans_config, is_train)

      dataset = VOCDetection(
          img_size=img_size,
          data_dir=args.root,
          trans_config=yolov5_trans_config,
          transform=transform,
          is_train=is_train
      )

      # Fixed seed so every class keeps the same colour between runs.
      np.random.seed(0)
      class_colors = [(np.random.randint(255),
                       np.random.randint(255),
                       np.random.randint(255)) for _ in range(20)]
      print('Data length: ', len(dataset))

      # Show at most 1000 samples, and never index past the end of the dataset.
      for i in range(min(1000, len(dataset))):
          image, target, deltas = dataset.pull_item(i)
          # to numpy (channels moved to the last axis for OpenCV)
          image = image.permute(1, 2, 0).numpy()
          # to uint8
          image = image.astype(np.uint8)
          image = image.copy()

          boxes = target["boxes"]
          labels = target["labels"]

          for box, label in zip(boxes, labels):
              x1, y1, x2, y2 = box
              cls_id = int(label)
              color = class_colors[cls_id]
              # class name
              label = VOC_CLASSES[cls_id]
              image = cv2.rectangle(image, (int(x1), int(y1)), (int(x2), int(y2)), (0,0,255), 2)
              # put the text on the bbox
              cv2.putText(image, label, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA)
          cv2.imshow('gt', image)
          # cv2.imwrite(str(i)+'.jpg', img)
          cv2.waitKey(0)