123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299 |
- #!/usr/bin/env python3
- # -*- coding:utf-8 -*-
- # Copyright (c) Megvii, Inc. and its affiliates.
- """
- Data augmentation functionality. Passed as callable transformations to
- Dataset classes.
-
- The data augmentation procedures were interpreted from @weiliu89's SSD paper
- http://arxiv.org/abs/1512.02325
- """
-
- import cv2
- import numpy as np
-
- import torch
-
- from yolox.utils import xyxy2cxcywh
-
- import math
- import random
-
-
def augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4):
    """Randomly jitter hue, saturation and value of a BGR image, in place.

    Per-channel gains are drawn uniformly from ``1 +/- [hgain, sgain, vgain]``
    and applied through lookup tables; the result is written back into ``img``
    (no return value).
    """
    gains = 1 + np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain]
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
    dtype = img.dtype  # expected uint8

    base = np.arange(0, 256, dtype=np.int16)
    lut_h = ((base * gains[0]) % 180).astype(dtype)  # OpenCV hue range is [0, 180)
    lut_s = np.clip(base * gains[1], 0, 255).astype(dtype)
    lut_v = np.clip(base * gains[2], 0, 255).astype(dtype)

    jittered = cv2.merge(
        (cv2.LUT(hue, lut_h), cv2.LUT(sat, lut_s), cv2.LUT(val, lut_v))
    ).astype(dtype)
    # Convert back to BGR directly into the caller's buffer.
    cv2.cvtColor(jittered, cv2.COLOR_HSV2BGR, dst=img)
-
-
def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.2):
    """Select boxes that survived augmentation in a usable state.

    Args:
        box1: (4, n) xyxy boxes before augmentation.
        box2: (4, n) xyxy boxes after augmentation.
        wh_thr: minimum width/height in pixels after augmentation.
        ar_thr: maximum aspect ratio after augmentation.
        area_thr: minimum kept-area ratio (after / before).

    Returns:
        Boolean mask of length n, True for boxes to keep.
    """
    eps = 1e-16
    w_before = box1[2] - box1[0]
    h_before = box1[3] - box1[1]
    w_after = box2[2] - box2[0]
    h_after = box2[3] - box2[1]

    aspect = np.maximum(w_after / (h_after + eps), h_after / (w_after + eps))
    wide_enough = w_after > wh_thr
    tall_enough = h_after > wh_thr
    area_kept = w_after * h_after / (w_before * h_before + eps) > area_thr
    not_degenerate = aspect < ar_thr

    return wide_enough & tall_enough & area_kept & not_degenerate
-
-
def random_perspective(
    img,
    targets=(),
    degrees=10,
    translate=0.1,
    scale=0.1,
    shear=10,
    perspective=0.0,
    border=(0, 0),
):
    """Apply a random affine (or perspective) warp to ``img`` and its labels.

    Args:
        img: HWC image array.
        targets: (n, >=4) array whose first four columns are xyxy boxes.
        degrees: max absolute rotation angle, degrees.
        translate: max translation as a fraction of the output size.
        scale: either a ``(min, max)`` pair sampled uniformly, or a scalar
            ``s`` interpreted as the range ``(1 - s, 1 + s)``.  The previous
            implementation indexed ``scale[0]``/``scale[1]`` unconditionally,
            so calling with the scalar default 0.1 raised TypeError; scalars
            are now supported (tuple/list callers are unaffected).
        shear: max absolute shear angle, degrees.
        perspective: perspective distortion magnitude; 0 means pure affine.
        border: (h, w) margin already around ``img`` (e.g. mosaic margin);
            the output canvas is img size + 2 * border per axis.

    Returns:
        Tuple ``(warped_img, filtered_targets)``.
    """
    # Output canvas size (border entries may be negative to crop a mosaic).
    height = img.shape[0] + border[0] * 2
    width = img.shape[1] + border[1] * 2

    # Center: move the image center to the origin before rotate/shear.
    C = np.eye(3)
    C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
    C[1, 2] = -img.shape[0] / 2  # y translation (pixels)

    # Rotation and scale.
    if isinstance(scale, (tuple, list)):
        scale_range = (scale[0], scale[1])
    else:
        scale_range = (1 - scale, 1 + scale)
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    s = random.uniform(*scale_range)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)

    # Shear.
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear

    # Translation: re-centers the image inside the output canvas.
    T = np.eye(3)
    T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width
    T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height

    # Combined matrix; order of operations (right to left) is IMPORTANT.
    M = T @ S @ R @ C

    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        if perspective:
            img = cv2.warpPerspective(
                img, M, dsize=(width, height), borderValue=(114, 114, 114)
            )
        else:  # affine
            img = cv2.warpAffine(
                img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)
            )

    # Transform label coordinates.
    n = len(targets)
    if n:
        # Warp all four corners of each box: x1y1, x2y2, x1y2, x2y1.
        xy = np.ones((n * 4, 3))
        xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)
        xy = xy @ M.T  # transform
        if perspective:
            xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale by w
        else:  # affine
            xy = xy[:, :2].reshape(n, 8)

        # Re-box: axis-aligned bounding box of the warped corners.
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T

        # Boxes are deliberately NOT clipped to the canvas here; boxes that
        # end up entirely outside the image are removed below instead.

        # Keep boxes that survived the warp with reasonable size and aspect.
        i = box_candidates(box1=targets[:, :4].T * s, box2=xy.T)
        targets = targets[i]
        targets[:, :4] = xy[i]

        # Drop boxes lying completely outside the output canvas.
        targets = targets[targets[:, 0] < width]
        targets = targets[targets[:, 2] > 0]
        targets = targets[targets[:, 1] < height]
        targets = targets[targets[:, 3] > 0]

    return img, targets
-
-
def _distort(image):
    """Return a copy of a BGR image with random photometric jitter.

    Each of four perturbations (brightness shift, contrast scale, hue shift,
    saturation scale) is independently applied with probability 0.5.  The
    input array is not modified.
    """

    def _scale_shift(arr, alpha=1, beta=0):
        # Affine transform of pixel values, clipped to [0, 255], in place.
        tmp = arr.astype(float) * alpha + beta
        np.clip(tmp, 0, 255, out=tmp)
        arr[:] = tmp

    image = image.copy()

    if random.randrange(2):  # brightness
        _scale_shift(image, beta=random.uniform(-32, 32))

    if random.randrange(2):  # contrast
        _scale_shift(image, alpha=random.uniform(0.5, 1.5))

    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    if random.randrange(2):  # hue (OpenCV hue wraps at 180)
        shifted = image[:, :, 0].astype(int) + random.randint(-18, 18)
        image[:, :, 0] = shifted % 180

    if random.randrange(2):  # saturation (operates on a view of the S plane)
        _scale_shift(image[:, :, 1], alpha=random.uniform(0.5, 1.5))

    return cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
-
-
- def _mirror(image, boxes):
- _, width, _ = image.shape
- if random.randrange(2):
- image = image[:, ::-1]
- boxes = boxes.copy()
- boxes[:, 0::2] = width - boxes[:, 2::-2]
- return image, boxes
-
-
def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
    """Letterbox-preprocess a BGR image for the network.

    Resizes keeping aspect ratio, pads the bottom/right with 114, converts
    BGR -> RGB, scales to [0, 1], optionally normalizes with ``mean``/``std``,
    and reorders axes by ``swap`` (default HWC -> CHW).

    Returns:
        (preprocessed float32 array, resize ratio r)
    """
    if len(image.shape) == 3:
        canvas = np.ones((input_size[0], input_size[1], 3)) * 114.0
    else:
        # NOTE(review): the 2-D path would fail at the channel flip below;
        # callers appear to always pass 3-channel images -- confirm.
        canvas = np.ones(input_size) * 114.0

    img = np.array(image)
    ratio = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
    new_h = int(img.shape[0] * ratio)
    new_w = int(img.shape[1] * ratio)
    resized = cv2.resize(
        img, (new_w, new_h), interpolation=cv2.INTER_LINEAR
    ).astype(np.float32)
    canvas[:new_h, :new_w] = resized

    canvas = canvas[:, :, ::-1]  # BGR -> RGB
    canvas /= 255.0
    if mean is not None:
        canvas -= mean
    if std is not None:
        canvas /= std
    canvas = canvas.transpose(swap)
    return np.ascontiguousarray(canvas, dtype=np.float32), ratio
-
-
class TrainTransform:
    """Training-time transform: photometric distortion, random horizontal
    flip, letterbox preprocessing, and label padding to a fixed size.

    Targets are expected as an (n, 6) array laid out [x1, y1, x2, y2, cls, id];
    the returned labels are (max_labels, 6) rows of [cls, cx, cy, w, h, id],
    zero-padded.
    """

    def __init__(self, p=0.5, rgb_means=None, std=None, max_labels=100):
        # NOTE(review): p is stored but never read in __call__ (the flip
        # probability is hard-coded to 0.5 inside _mirror) -- kept for
        # interface compatibility.
        self.means = rgb_means
        self.std = std
        self.p = p
        self.max_labels = max_labels

    def __call__(self, image, targets, input_dim):
        boxes = targets[:, :4].copy()
        labels = targets[:, 4].copy()
        ids = targets[:, 5].copy()
        if len(boxes) == 0:
            # No objects: return the preprocessed image with all-zero labels.
            targets = np.zeros((self.max_labels, 6), dtype=np.float32)
            image, r_o = preproc(image, input_dim, self.means, self.std)
            image = np.ascontiguousarray(image, dtype=np.float32)
            return image, targets

        # Keep pristine copies so we can fall back to the undistorted
        # image/labels if augmentation filters out every box.
        image_o = image.copy()
        targets_o = targets.copy()
        height_o, width_o, _ = image_o.shape
        boxes_o = targets_o[:, :4]
        labels_o = targets_o[:, 4]
        ids_o = targets_o[:, 5]
        # bbox_o: [xyxy] to [c_x,c_y,w,h]
        # NOTE(review): boxes_o is a view into targets_o, and xyxy2cxcywh
        # looks like it converts in place -- verify against yolox.utils.
        boxes_o = xyxy2cxcywh(boxes_o)

        image_t = _distort(image)
        image_t, boxes = _mirror(image_t, boxes)
        height, width, _ = image_t.shape
        image_t, r_ = preproc(image_t, input_dim, self.means, self.std)
        # boxes [xyxy] 2 [cx,cy,w,h]
        boxes = xyxy2cxcywh(boxes)
        # Scale boxes by the letterbox resize ratio returned by preproc.
        boxes *= r_

        # Drop boxes whose shorter side shrank to <= 1 pixel.
        mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 1
        boxes_t = boxes[mask_b]
        labels_t = labels[mask_b]
        ids_t = ids[mask_b]

        if len(boxes_t) == 0:
            # Every box was filtered out: use the original (undistorted) image.
            image_t, r_o = preproc(image_o, input_dim, self.means, self.std)
            boxes_o *= r_o
            boxes_t = boxes_o
            labels_t = labels_o
            ids_t = ids_o

        labels_t = np.expand_dims(labels_t, 1)
        ids_t = np.expand_dims(ids_t, 1)

        # Final layout [cls, cx, cy, w, h, id], truncated/zero-padded to
        # exactly max_labels rows.
        targets_t = np.hstack((labels_t, boxes_t, ids_t))
        padded_labels = np.zeros((self.max_labels, 6))
        padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[
            : self.max_labels
        ]
        padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32)
        image_t = np.ascontiguousarray(image_t, dtype=np.float32)
        return image_t, padded_labels
-
-
class ValTransform:
    """Preprocessing transform for validation/test images.

    Runs the same letterbox pipeline as training (aspect-preserving resize,
    114-padding, BGR -> RGB, [0, 1] scaling, optional mean/std normalization,
    channel reorder) but applies no augmentation and returns a dummy label
    array.

    Arguments:
        rgb_means ((int,int,int)): per-channel mean to subtract, or None.
        std ((int,int,int)): per-channel std to divide by, or None.
        swap ((int,int,int)): final order of channels (default HWC -> CHW).

    Returns:
        transform (transform): callable applied to test/val data.
    """

    def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
        self.means = rgb_means
        self.std = std
        self.swap = swap

    # assume input is a cv2 (BGR, HWC) image for now
    def __call__(self, img, res, input_size):
        # `res` (ground-truth annotations) is intentionally ignored here.
        processed, _ = preproc(img, input_size, self.means, self.std, self.swap)
        return processed, np.zeros((1, 5))
|