diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..37ed2f4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,13 @@
+# Folders
+__pycache__/
+build/
+*.egg-info
+
+
+# Files
+*.weights
+*.t7
+*.mp4
+*.avi
+*.so
+*.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c2fccb5
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# deep_count
diff --git a/configs/deep_sort.yaml b/configs/deep_sort.yaml
new file mode 100644
index 0000000..8aa24ad
--- /dev/null
+++ b/configs/deep_sort.yaml
@@ -0,0 +1,11 @@
+DEEPSORT:
+  REID_CKPT: "./deep_sort/deep/checkpoint/ckpt.t7"
+  REID_CKPT_Car: "./deep_sort/deep/checkpoint/ckpt_car.t7"
+  MAX_DIST: 0.2
+  MIN_CONFIDENCE: 0.3
+  NMS_MAX_OVERLAP: 1.0
+  MAX_IOU_DISTANCE: 0.7
+  MAX_AGE: 70
+  N_INIT: 3
+  NN_BUDGET: 100
+
\ No newline at end of file
diff --git a/configs/yolov3.yaml b/configs/yolov3.yaml
new file mode 100644
index 0000000..a46e474
--- /dev/null
+++ b/configs/yolov3.yaml
@@ -0,0 +1,7 @@
+YOLOV3:
+  CFG: "./detector/YOLOv3/cfg/yolov4.cfg"
+  WEIGHT: "./detector/YOLOv3/weight/yolov4.weights"
+  CLASS_NAMES: "./detector/YOLOv3/cfg/coco.names"
+
+  SCORE_THRESH: 0.1
+  NMS_THRESH: 0.4
diff --git a/configs/yolov3_tiny.yaml b/configs/yolov3_tiny.yaml
new file mode 100644
index 0000000..1261e68
--- /dev/null
+++ b/configs/yolov3_tiny.yaml
@@ -0,0 +1,7 @@
+YOLOV3:
+  CFG: "./detector/YOLOv3/cfg/yolov3-tiny.cfg"
+  WEIGHT: "./detector/YOLOv3/weight/yolov3-tiny.weights"
+  CLASS_NAMES: "./detector/YOLOv3/cfg/coco.names"
+
+  SCORE_THRESH: 0.5
+  NMS_THRESH: 0.4
\ No newline at end of file
diff --git a/configs/yolov4_onnx.yaml b/configs/yolov4_onnx.yaml
new file mode 100644
index 0000000..d63eb08
--- /dev/null
+++ b/configs/yolov4_onnx.yaml
@@ -0,0 +1,7 @@
+YOLOV4:
+  CFG: "./detector/YOLOv3/cfg/yolov4.cfg"
+  WEIGHT: "./detector/YOLOv3/weight/yolov4_1_3_416_416_static.onnx"
+  CLASS_NAMES: "./detector/YOLOv3/cfg/coco.names"
+
+  SCORE_THRESH: 0.1
+  NMS_THRESH: 0.4
diff --git a/configs/yolov4_trt.yaml b/configs/yolov4_trt.yaml
new file mode 100644
index 0000000..367509d
--- /dev/null
+++ b/configs/yolov4_trt.yaml
@@ -0,0 +1,7 @@
+YOLOV4:
+  CFG: "./detector/YOLOv3/cfg/yolov4.cfg"
+  WEIGHT: "./detector/YOLOv3/weight/yolov4.engine"
+  CLASS_NAMES: "./detector/YOLOv3/cfg/coco.names"
+
+  SCORE_THRESH: 0.4
+  NMS_THRESH: 0.4
diff --git a/dataset.py b/dataset.py
new file mode 100644
index 0000000..5cfdf29
--- /dev/null
+++ b/dataset.py
@@ -0,0 +1,111 @@
+import cv2
+import numpy as np
+from threading import Thread
+import time
+import os
+class LoadStreams:  # multiple IP or RTSP cameras
+    def __init__(self, sources='streams.txt', img_size=640):
+        self.mode = 'images'
+        self.img_size = img_size
+        sources = [sources]
+
+        n = len(sources)
+        self.imgs = [None] * n
+        self.sources = sources
+        for i, s in enumerate(sources):
+            # Start the thread to read frames from the video stream
+            print('%g/%g: %s... ' % (i + 1, n, s), end='')
+            cap = cv2.VideoCapture(0 if s == '0' else s)
+            assert cap.isOpened(), 'Failed to open %s' % s
+            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            fps = cap.get(cv2.CAP_PROP_FPS) % 100
+            _, self.imgs[i] = cap.read()  # guarantee first frame
+            thread = Thread(target=self.update, args=(i, cap), daemon=True)
+            print(' success (%gx%g at %.2f FPS).' % (w, h, fps))
+            thread.start()
+        print('')  # newline
+
+    def update(self, index, cap):
+        # Read next stream frame in a daemon thread
+        n = 0
+        while cap.isOpened():
+            n += 1
+            # _, self.imgs[index] = cap.read()
+            cap.grab()
+            if n == 4:  # read every 4th frame
+                _, self.imgs[index] = cap.retrieve()
+                n = 0
+            time.sleep(0.01)  # wait time
+
+    def __iter__(self):
+        self.count = -1
+        return self
+
+    def __next__(self):
+        self.count += 1
+        img0 = self.imgs.copy()
+        if cv2.waitKey(1) == ord('q'):  # q to quit
+            cv2.destroyAllWindows()
+            raise StopIteration
+        return self.sources, img0, None
+
+    def __len__(self):
+        return 0  # 1E12 frames = 32 streams at 30 FPS for 30 years
+
+
+
+class datasets:  # multiple IP or RTSP cameras
+    def __init__(self, sources='streams.txt', img_size=640):
+        self.mode = 'images'
+        self.img_size = img_size
+
+        if os.path.isfile(sources):
+            with open(sources, 'r') as f:
+                sources = [x.strip() for x in f.read().splitlines() if len(x.strip())]
+        else:
+            sources = [sources]
+
+        n = len(sources)
+        self.imgs = [None] * n
+        self.sources = sources
+        for i, s in enumerate(sources):
+            # Start the thread to read frames from the video stream
+            print('%g/%g: %s... ' % (i + 1, n, s), end='')
+            cap = cv2.VideoCapture(0 if s == '0' else s)
+            assert cap.isOpened(), 'Failed to open %s' % s
+            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            fps = cap.get(cv2.CAP_PROP_FPS) % 100
+            _, self.imgs[i] = cap.read()  # guarantee first frame
+            thread = Thread(target=self.update, args=(i, cap), daemon=True)
+            print(' success (%gx%g at %.2f FPS).' % (w, h, fps))
+            thread.start()
+        print('')  # newline
+
+    def update(self, index, cap):
+        # Read next stream frame in a daemon thread
+        n = 0
+        while cap.isOpened():
+            n += 1
+            # _, self.imgs[index] = cap.read()
+            cap.grab()
+            if n == 1:  # retrieve every frame (LoadStreams above keeps only every 4th)
+                _, self.imgs[index] = cap.retrieve()
+                n = 0
+            time.sleep(0.01)  # wait time
+
+    def __iter__(self):
+        self.count = -1
+        return self
+
+    def __next__(self):
+        self.count += 1
+        img0 = self.imgs.copy()
+        if cv2.waitKey(1) == ord('q'):  # q to quit
+            cv2.destroyAllWindows()
+            raise StopIteration
+        return self.sources, img0, None
+
+    def __len__(self):
+        return 0  # 1E12 frames = 32 streams at 30 FPS for 30 years
\ No newline at end of file
diff --git a/deep_sort/README.md b/deep_sort/README.md
new file mode 100644
index 0000000..e89c9b3
--- /dev/null
+++ b/deep_sort/README.md
@@ -0,0 +1,3 @@
+# Deep Sort
+
+This is the implementation of Deep SORT with PyTorch.
\ No newline at end of file
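
Both loaders expose the same iterator protocol, returning `(sources, imgs, None)` per step, so downstream code can swap them freely. A minimal usage sketch (not part of the diff; assumes a local webcam at index 0 and `dataset.py` on the import path):

```python
import cv2

from dataset import datasets

loader = datasets(sources='0')          # a path to a streams.txt also works
for sources, imgs, _ in loader:         # imgs: one latest frame per stream
    for src, frame in zip(sources, imgs):
        if frame is not None:           # the background thread may lag briefly
            cv2.imshow(src, frame)
    # pressing 'q' raises StopIteration inside __next__ and ends the loop
```
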
diff --git a/deep_sort/__init__.py b/deep_sort/__init__.py
new file mode 100644
index 0000000..8bb81a7
--- /dev/null
+++ b/deep_sort/__init__.py
@@ -0,0 +1,24 @@
+from deep_sort.deep_sort import DeepSort  # the working directory does not change on import (even for files in sub-directories), so the full package path is required
+
+
+__all__ = ['DeepSort', 'build_tracker', 'build_tracker_car']  # restrict `from deep_sort import *` to these names so it does not pull in extra variables
+
+def build_tracker(cfg, use_cuda):
+    return DeepSort(cfg.DEEPSORT.REID_CKPT,
+                    max_dist=cfg.DEEPSORT.MAX_DIST, min_confidence=cfg.DEEPSORT.MIN_CONFIDENCE,
+                    nms_max_overlap=cfg.DEEPSORT.NMS_MAX_OVERLAP, max_iou_distance=cfg.DEEPSORT.MAX_IOU_DISTANCE,
+                    max_age=cfg.DEEPSORT.MAX_AGE, n_init=cfg.DEEPSORT.N_INIT, nn_budget=cfg.DEEPSORT.NN_BUDGET, use_cuda=use_cuda)
+
+
+def build_tracker_car(cfg, use_cuda):
+    return DeepSort(cfg.DEEPSORT.REID_CKPT_Car,
+                    max_dist=cfg.DEEPSORT.MAX_DIST, min_confidence=cfg.DEEPSORT.MIN_CONFIDENCE,
+                    nms_max_overlap=cfg.DEEPSORT.NMS_MAX_OVERLAP, max_iou_distance=cfg.DEEPSORT.MAX_IOU_DISTANCE,
+                    max_age=cfg.DEEPSORT.MAX_AGE, n_init=cfg.DEEPSORT.N_INIT, nn_budget=cfg.DEEPSORT.NN_BUDGET, use_cuda=use_cuda, num_class=685)
+
+
+
+
+
+
+
diff --git a/deep_sort/deep/__init__.py b/deep_sort/deep/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/deep_sort/deep/checkpoint/.gitkeep b/deep_sort/deep/checkpoint/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/deep_sort/deep/evaluate.py b/deep_sort/deep/evaluate.py
new file mode 100644
index 0000000..85eaa6f
--- /dev/null
+++ b/deep_sort/deep/evaluate.py
@@ -0,0 +1,15 @@
+import torch
+
+features = torch.load("features.pth")
+qf = features["qf"]
+ql = features["ql"]
+gf = features["gf"]
+gl = features["gl"]
+
+scores = qf.mm(gf.t())
+res = scores.topk(5, dim=1)[1][:, 0]
+top1correct = gl[res].eq(ql).sum().item()
+
+print("Acc top1:{:.3f}".format(top1correct / ql.size(0)))
+
+
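
For reference, a sketch of how `build_tracker` is meant to be driven by `configs/deep_sort.yaml`. The yacs loader below is an assumption; any attribute-style config object that exposes `cfg.DEEPSORT.*` would work:

```python
from yacs.config import CfgNode  # assumed config library, not pinned by this diff

from deep_sort import build_tracker

with open("configs/deep_sort.yaml") as f:
    cfg = CfgNode.load_cfg(f)                  # exposes cfg.DEEPSORT.MAX_DIST, etc.

tracker = build_tracker(cfg, use_cuda=False)   # requires ckpt.t7 to be in place
```
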
diff --git a/deep_sort/deep/feature_extractor.py b/deep_sort/deep/feature_extractor.py
new file mode 100644
index 0000000..1fb8ea2
--- /dev/null
+++ b/deep_sort/deep/feature_extractor.py
@@ -0,0 +1,55 @@
+import torch
+import torchvision.transforms as transforms
+import numpy as np
+import cv2
+import logging
+
+from deep_sort.deep.model import Net
+
+class Extractor(object):
+    def __init__(self, model_path, use_cuda=True, num_class=751):
+        self.net = Net(reid=True, num_classes=num_class)
+        self.device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
+        state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)['net_dict']
+        self.net.load_state_dict(state_dict)
+        logger = logging.getLogger("root.tracker")
+        logger.info("Loading weights from {}... Done!".format(model_path))
+        self.net.to(self.device)
+        self.size = (64, 128)
+        self.norm = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ])
+
+    def _preprocess(self, im_crops):
+        """
+        TODO:
+            1. to float with scale from 0 to 1
+            2. resize to (64, 128) as Market1501 dataset did
+            3. concatenate to a numpy array
+            4. to torch Tensor
+            5. normalize
+        """
+        def _resize(im, size):
+            return cv2.resize(im.astype(np.float32) / 255., size)
+
+        im_batch = torch.cat([self.norm(_resize(im, self.size)).unsqueeze(0) for im in im_crops], dim=0).float()
+        return im_batch
+
+    def __call__(self, im_crops):
+        im_batch = self._preprocess(im_crops)
+        with torch.no_grad():
+            im_batch = im_batch.to(self.device)
+            features = self.net(im_batch)
+        return features.cpu().numpy()
+
+
+if __name__ == '__main__':
+    img = cv2.imread("demo.jpg")[:, :, (2, 1, 0)]
+    extr = Extractor("checkpoint/ckpt.t7")
+    feature = extr([img])  # __call__ expects a list of crops, not a single image
+    print(feature.shape)
diff --git a/deep_sort/deep/model.py b/deep_sort/deep/model.py
new file mode 100644
index 0000000..0427f71
--- /dev/null
+++ b/deep_sort/deep/model.py
@@ -0,0 +1,104 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class BasicBlock(nn.Module):
+    def __init__(self, c_in, c_out, is_downsample=False):
+        super(BasicBlock, self).__init__()
+        self.is_downsample = is_downsample
+        if is_downsample:
+            self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=2, padding=1, bias=False)
+        else:
+            self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(c_out)
+        self.relu = nn.ReLU(True)
+        self.conv2 = nn.Conv2d(c_out, c_out, 3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(c_out)
+        if is_downsample:
+            self.downsample = nn.Sequential(
+                nn.Conv2d(c_in, c_out, 1, stride=2, bias=False),
+                nn.BatchNorm2d(c_out)
+            )
+        elif c_in != c_out:
+            self.downsample = nn.Sequential(
+                nn.Conv2d(c_in, c_out, 1, stride=1, bias=False),
+                nn.BatchNorm2d(c_out)
+            )
+            self.is_downsample = True
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.bn1(y)
+        y = self.relu(y)
+        y = self.conv2(y)
+        y = self.bn2(y)
+        if self.is_downsample:
+            x = self.downsample(x)
+        return F.relu(x.add(y), True)  # the residual connection
+
+def make_layers(c_in, c_out, repeat_times, is_downsample=False):
+    blocks = []
+    for i in range(repeat_times):
+        if i == 0:
+            blocks += [BasicBlock(c_in, c_out, is_downsample=is_downsample), ]
+        else:
+            blocks += [BasicBlock(c_out, c_out), ]
+    return nn.Sequential(*blocks)
+
+class Net(nn.Module):
+    def __init__(self, num_classes=751, reid=False):
+        super(Net, self).__init__()
+        # input: 3 x 128 x 64
+        self.conv = nn.Sequential(
+            nn.Conv2d(3, 64, 3, stride=1, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU(inplace=True),
+            # nn.Conv2d(32,32,3,stride=1,padding=1),
+            # nn.BatchNorm2d(32),
+            # nn.ReLU(inplace=True),
+            nn.MaxPool2d(3, 2, padding=1),
+        )
+        # 64 x 64 x 32
+        self.layer1 = make_layers(64, 64, 2, False)
+        # 64 x 64 x 32
+        self.layer2 = make_layers(64, 128, 2, True)
+        # 128 x 32 x 16
+        self.layer3 = make_layers(128, 256, 2, True)
+        # 256 x 16 x 8
+        self.layer4 = make_layers(256, 512, 2, True)
+        # 512 x 8 x 4
+        self.avgpool = nn.AvgPool2d((8, 4), 1)
+        # 512 x 1 x 1
+        self.reid = reid
+        self.classifier = nn.Sequential(
+            nn.Linear(512, 256),
+            nn.BatchNorm1d(256),
+            nn.ReLU(inplace=True),
+            nn.Dropout(),
+            nn.Linear(256, num_classes),
+        )
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        # B x 512
+        if self.reid:
+            x = x.div(x.norm(p=2, dim=1, keepdim=True))
+            return x
+        # classifier
+        x = self.classifier(x)
+        return x
+
+
+if __name__ == '__main__':
+    net = Net()
+    x = torch.randn(4, 3, 128, 64)
+    y = net(x)
+    # import ipdb; ipdb.set_trace()
+
+
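
A quick shape check of the reid branch (a sketch, not part of the diff): with `reid=True` the network returns L2-normalised 512-dim embeddings for 128x64 crops.

```python
import torch

from deep_sort.deep.model import Net

net = Net(reid=True).eval()
x = torch.randn(4, 3, 128, 64)        # a batch of 128x64 (HxW) crops
with torch.no_grad():
    emb = net(x)
print(emb.shape)                      # torch.Size([4, 512])
print(emb.norm(dim=1))                # each row has unit L2 norm
```
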
diff --git a/deep_sort/deep/original_model.py b/deep_sort/deep/original_model.py
new file mode 100644
index 0000000..72453a6
--- /dev/null
+++ b/deep_sort/deep/original_model.py
@@ -0,0 +1,106 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class BasicBlock(nn.Module):
+    def __init__(self, c_in, c_out, is_downsample=False):
+        super(BasicBlock, self).__init__()
+        self.is_downsample = is_downsample
+        if is_downsample:
+            self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=2, padding=1, bias=False)
+        else:
+            self.conv1 = nn.Conv2d(c_in, c_out, 3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(c_out)
+        self.relu = nn.ReLU(True)
+        self.conv2 = nn.Conv2d(c_out, c_out, 3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(c_out)
+        if is_downsample:
+            self.downsample = nn.Sequential(
+                nn.Conv2d(c_in, c_out, 1, stride=2, bias=False),
+                nn.BatchNorm2d(c_out)
+            )
+        elif c_in != c_out:
+            self.downsample = nn.Sequential(
+                nn.Conv2d(c_in, c_out, 1, stride=1, bias=False),
+                nn.BatchNorm2d(c_out)
+            )
+            self.is_downsample = True
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.bn1(y)
+        y = self.relu(y)
+        y = self.conv2(y)
+        y = self.bn2(y)
+        if self.is_downsample:
+            x = self.downsample(x)
+        return F.relu(x.add(y), True)
+
+def make_layers(c_in, c_out, repeat_times, is_downsample=False):
+    blocks = []
+    for i in range(repeat_times):
+        if i == 0:
+            blocks += [BasicBlock(c_in, c_out, is_downsample=is_downsample), ]
+        else:
+            blocks += [BasicBlock(c_out, c_out), ]
+    return nn.Sequential(*blocks)
+
+class Net(nn.Module):
+    def __init__(self, num_classes=625, reid=False):
+        super(Net, self).__init__()
+        # 3 128 64
+        self.conv = nn.Sequential(
+            nn.Conv2d(3, 32, 3, stride=1, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ELU(inplace=True),
+            nn.Conv2d(32, 32, 3, stride=1, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ELU(inplace=True),
+            nn.MaxPool2d(3, 2, padding=1),
+        )
+        # 32 64 32
+        self.layer1 = make_layers(32, 32, 2, False)
+        # 32 64 32
+        self.layer2 = make_layers(32, 64, 2, True)
+        # 64 32 16
+        self.layer3 = make_layers(64, 128, 2, True)
+        # 128 16 8
+        self.dense = nn.Sequential(
+            nn.Dropout(p=0.6),
+            nn.Linear(128 * 16 * 8, 128),
+            nn.BatchNorm1d(128),
+            nn.ELU(inplace=True)
+        )
+        # 128
+        self.reid = reid
+        self.batch_norm = nn.BatchNorm1d(128)
+        self.classifier = nn.Sequential(
+            nn.Linear(128, num_classes),
+        )
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        x = x.view(x.size(0), -1)
+        if self.reid:
+            x = self.dense[0](x)
+            x = self.dense[1](x)
+            x = x.div(x.norm(p=2, dim=1, keepdim=True))
+            return x
+        x = self.dense(x)
+        # B x 128
+        # classifier
+        x = self.classifier(x)
+        return x
+
+
+if __name__ == '__main__':
+    net = Net(reid=True)
+    x = torch.randn(4, 3, 128, 64)
+    y = net(x)
+    import ipdb; ipdb.set_trace()
+
+
diff --git a/deep_sort/deep/test.py b/deep_sort/deep/test.py
new file mode 100644
index 0000000..ecac0ad
--- /dev/null
+++ b/deep_sort/deep/test.py
@@ -0,0 +1,77 @@
+import torch
+import torch.backends.cudnn as cudnn
+import torchvision
+
+import argparse
+import os
+
+from model import Net
+
+parser = argparse.ArgumentParser(description="Test on market1501")
+parser.add_argument("--data-dir", default='data', type=str)
+parser.add_argument("--no-cuda", action="store_true")
+parser.add_argument("--gpu-id", default=0, type=int)
+args = parser.parse_args()
+
+# device
+device = "cuda:{}".format(args.gpu_id) if torch.cuda.is_available() and not args.no_cuda else "cpu"
+if torch.cuda.is_available() and not args.no_cuda:
+    cudnn.benchmark = True
+
+# data loader
+root = args.data_dir
+query_dir = os.path.join(root, "query")
+gallery_dir = os.path.join(root, "gallery")
+transform = torchvision.transforms.Compose([
+    torchvision.transforms.Resize((128, 64)),
+    torchvision.transforms.ToTensor(),
+    torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+])
+queryloader = torch.utils.data.DataLoader(
+    torchvision.datasets.ImageFolder(query_dir, transform=transform),
+    batch_size=64, shuffle=False
+)
+galleryloader = torch.utils.data.DataLoader(
+    torchvision.datasets.ImageFolder(gallery_dir, transform=transform),
+    batch_size=64, shuffle=False
+)
+
+# net definition
+net = Net(reid=True)
+assert os.path.isfile("./checkpoint/ckpt.t7"), "Error: no checkpoint file found!"
+print('Loading from checkpoint/ckpt.t7')
+checkpoint = torch.load("./checkpoint/ckpt.t7")
+net_dict = checkpoint['net_dict']
+net.load_state_dict(net_dict, strict=False)
+net.eval()
+net.to(device)
+
+# compute features
+query_features = torch.tensor([]).float()
+query_labels = torch.tensor([]).long()
+gallery_features = torch.tensor([]).float()
+gallery_labels = torch.tensor([]).long()
+
+with torch.no_grad():
+    for idx, (inputs, labels) in enumerate(queryloader):
+        inputs = inputs.to(device)
+        features = net(inputs).cpu()
+        query_features = torch.cat((query_features, features), dim=0)
+        query_labels = torch.cat((query_labels, labels))
+
+    for idx, (inputs, labels) in enumerate(galleryloader):
+        inputs = inputs.to(device)
+        features = net(inputs).cpu()
+        gallery_features = torch.cat((gallery_features, features), dim=0)
+        gallery_labels = torch.cat((gallery_labels, labels))
+
+gallery_labels -= 2
+
+# save features
+features = {
+    "qf": query_features,
+    "ql": query_labels,
+    "gf": gallery_features,
+    "gl": gallery_labels
+}
+torch.save(features, "features.pth")
\ No newline at end of file
diff --git a/deep_sort/deep/train.jpg b/deep_sort/deep/train.jpg
new file mode 100644
index 0000000..3635a61
Binary files /dev/null and b/deep_sort/deep/train.jpg differ
diff --git a/deep_sort/deep/train.py b/deep_sort/deep/train.py
new file mode 100644
index 0000000..5322af9
--- /dev/null
+++ b/deep_sort/deep/train.py
@@ -0,0 +1,189 @@
+import argparse
+import os
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+import torch.backends.cudnn as cudnn
+import torchvision
+
+from model import Net
+
+parser = argparse.ArgumentParser(description="Train on market1501")
+parser.add_argument("--data-dir", default='/home/hncr/workspace/MOT_TRACKING/deep_sort_pytorch-master/deep_sort/deep/data', type=str)
+parser.add_argument("--no-cuda", action="store_true")
+parser.add_argument("--gpu-id", default=0, type=int)
+parser.add_argument("--lr", default=0.01, type=float)
+parser.add_argument("--interval", '-i', default=20, type=int)
+parser.add_argument('--resume', '-r', default=False, action='store_true')
+args = parser.parse_args()
+
+# device
+device = "cuda:{}".format(args.gpu_id) if torch.cuda.is_available() and not args.no_cuda else "cpu"
+if torch.cuda.is_available() and not args.no_cuda:
+    cudnn.benchmark = True
+
+# data loading
+root = args.data_dir
+train_dir = os.path.join(root, "train/")
+test_dir = os.path.join(root, "test/")
+transform_train = torchvision.transforms.Compose([
+    torchvision.transforms.RandomCrop((128, 64), padding=4),
+    torchvision.transforms.RandomHorizontalFlip(),
+    torchvision.transforms.ToTensor(),
+    torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+])
+transform_test = torchvision.transforms.Compose([
+    torchvision.transforms.Resize((128, 64)),
+    torchvision.transforms.ToTensor(),
+    torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+])
+trainloader = torch.utils.data.DataLoader(
+    torchvision.datasets.ImageFolder(train_dir, transform=transform_train),
+    batch_size=64, shuffle=True
+)
+testloader = torch.utils.data.DataLoader(
+    torchvision.datasets.ImageFolder(test_dir, transform=transform_test),
+    batch_size=64, shuffle=True
+)
+num_classes = max(len(trainloader.dataset.classes), len(testloader.dataset.classes))
+
+# net definition
+start_epoch = 0
+best_acc = 0.  # initialised here so the accuracy restored from a checkpoint is not reset below
+net = Net(num_classes=num_classes)
+if args.resume:
+    assert os.path.isfile("/home/hncr/workspace/MOT_TRACKING/deep_sort_pytorch-master/checkpoint/ckpt.t7"), "Error: no checkpoint file found!"
+    print('Loading from checkpoint/ckpt.t7')
+    checkpoint = torch.load("/home/hncr/workspace/MOT_TRACKING/deep_sort_pytorch-master/checkpoint/ckpt.t7")
+    # import ipdb; ipdb.set_trace()
+    net_dict = checkpoint['net_dict']
+    net.load_state_dict(net_dict)
+    best_acc = checkpoint['acc']
+    start_epoch = checkpoint['epoch']
+net.to(device)
+
+# loss and optimizer
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(net.parameters(), args.lr, momentum=0.9, weight_decay=5e-4)
+
+# train function for each epoch
+def train(epoch):
+    print("\nEpoch : %d" % (epoch + 1))
+    net.train()
+    training_loss = 0.
+    train_loss = 0.
+    correct = 0
+    total = 0
+    interval = args.interval
+    start = time.time()
+    for idx, (inputs, labels) in enumerate(trainloader):
+        # forward
+        inputs, labels = inputs.to(device), labels.to(device)
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+
+        # backward
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # accumulating
+        training_loss += loss.item()
+        train_loss += loss.item()
+        correct += outputs.max(dim=1)[1].eq(labels).sum().item()
+        total += labels.size(0)
+
+        # print
+        if (idx + 1) % interval == 0:
+            end = time.time()
+            print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format(
+                100. * (idx + 1) / len(trainloader), end - start, training_loss / interval, correct, total, 100. * correct / total
+            ))
+            training_loss = 0.
+            start = time.time()
+
+    return train_loss / len(trainloader), 1. - correct / total
+
+def test(epoch):
+    global best_acc
+    net.eval()
+    test_loss = 0.
+    correct = 0
+    total = 0
+    start = time.time()
+    with torch.no_grad():
+        for idx, (inputs, labels) in enumerate(testloader):
+            inputs, labels = inputs.to(device), labels.to(device)
+            outputs = net(inputs)
+            loss = criterion(outputs, labels)
+
+            test_loss += loss.item()
+            correct += outputs.max(dim=1)[1].eq(labels).sum().item()
+            total += labels.size(0)
+
+        print("Testing ...")
+        end = time.time()
+        print("[progress:{:.1f}%]time:{:.2f}s Loss:{:.5f} Correct:{}/{} Acc:{:.3f}%".format(
+            100. * (idx + 1) / len(testloader), end - start, test_loss / len(testloader), correct, total, 100. * correct / total
+        ))
+
+    # saving checkpoint
+    acc = 100. * correct / total
+    if acc > best_acc:
+        best_acc = acc
+        print("Saving parameters to checkpoint/ckpt.t7")
+        checkpoint = {
+            'net_dict': net.state_dict(),
+            'acc': acc,
+            'epoch': epoch,
+        }
+        if not os.path.isdir('checkpoint'):
+            os.mkdir('checkpoint')
+        torch.save(checkpoint, './checkpoint/ckpt.t7')
+
+    return test_loss / len(testloader), 1. - correct / total
+
+# plot figure
+x_epoch = []
+record = {'train_loss': [], 'train_err': [], 'test_loss': [], 'test_err': []}
+fig = plt.figure()
+ax0 = fig.add_subplot(121, title="loss")
+ax1 = fig.add_subplot(122, title="top1err")
+def draw_curve(epoch, train_loss, train_err, test_loss, test_err):
+    global record
+    record['train_loss'].append(train_loss)
+    record['train_err'].append(train_err)
+    record['test_loss'].append(test_loss)
+    record['test_err'].append(test_err)
+
+    x_epoch.append(epoch)
+    ax0.plot(x_epoch, record['train_loss'], 'bo-', label='train')
+    ax0.plot(x_epoch, record['test_loss'], 'ro-', label='val')
+    ax1.plot(x_epoch, record['train_err'], 'bo-', label='train')
+    ax1.plot(x_epoch, record['test_err'], 'ro-', label='val')
+    if epoch == 0:
+        ax0.legend()
+        ax1.legend()
+    fig.savefig("train.jpg")
+
+# lr decay
+def lr_decay():
+    global optimizer
+    for params in optimizer.param_groups:
+        params['lr'] *= 0.1
+        lr = params['lr']
+        print("Learning rate adjusted to {}".format(lr))
+
+def main():
+    for epoch in range(start_epoch, start_epoch + 70):
+        train_loss, train_err = train(epoch)
+        test_loss, test_err = test(epoch)
+        draw_curve(epoch, train_loss, train_err, test_loss, test_err)
+        if (epoch + 1) % 20 == 0:
+            lr_decay()
+
+
+if __name__ == '__main__':
+    main()
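
The schedule in `main()` multiplies the learning rate by 0.1 every 20 epochs. A quick sanity check of the decay arithmetic (a sketch, not part of the diff):

```python
# With --lr 0.01 over 70 epochs, the decay fires after epochs 20, 40 and 60:
lr = 0.01
for epoch in range(70):
    if (epoch + 1) % 20 == 0:
        lr *= 0.1
print(lr)  # ~1e-05: three decays, i.e. 0.01 -> 1e-3 -> 1e-4 -> 1e-5
```
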
diff --git a/deep_sort/deep_sort.py b/deep_sort/deep_sort.py
new file mode 100644
index 0000000..e5476f6
--- /dev/null
+++ b/deep_sort/deep_sort.py
@@ -0,0 +1,118 @@
+import numpy as np
+import torch
+
+from deep_sort.deep.feature_extractor import Extractor
+from deep_sort.sort.nn_matching import NearestNeighborDistanceMetric
+from deep_sort.sort.preprocessing import non_max_suppression
+from deep_sort.sort.detection import Detection
+from deep_sort.sort.tracker import Tracker
+
+
+__all__ = ['DeepSort']
+
+
+class DeepSort(object):
+    def __init__(self, model_path, max_dist=0.2, min_confidence=0.3, nms_max_overlap=1.0, max_iou_distance=0.7, max_age=70, n_init=3, nn_budget=100, use_cuda=True, num_class=751):
+        self.min_confidence = min_confidence
+        self.nms_max_overlap = nms_max_overlap
+
+        self.extractor = Extractor(model_path, use_cuda=use_cuda, num_class=num_class)
+
+        max_cosine_distance = max_dist
+        metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)  # uses the nn_budget argument (it was previously re-hardcoded to 100 here)
+        self.tracker = Tracker(metric, max_iou_distance=max_iou_distance, max_age=max_age, n_init=n_init)
+
+    def update(self, bbox_xywh, confidences, ori_img, count):
+        self.height, self.width = ori_img.shape[:2]
+        # generate detections
+        features = self._get_features(bbox_xywh, ori_img)  # ReID network: one appearance embedding per crop
+        bbox_tlwh = self._xywh_to_tlwh(bbox_xywh)  # convert center (x, y, w, h) to top-left (x, y, w, h)
+        detections = [Detection(bbox_tlwh[i], conf, features[i]) for i, conf in enumerate(confidences) if conf > self.min_confidence]
+
+        # run non-maximum suppression
+        boxes = np.array([d.tlwh for d in detections])
+        scores = np.array([d.confidence for d in detections])
+        indices = non_max_suppression(boxes, self.nms_max_overlap, scores)
+        detections = [detections[i] for i in indices]
+
+        # update tracker
+        self.tracker.predict()
+        self.tracker.update(detections)
+
+        # output bbox identities
+        outputs = []
+        detection_id = len(bbox_xywh)
+
+        for track in self.tracker.tracks:
+            if not track.is_confirmed() or track.time_since_update > 1:
+                continue
+            box = track.to_tlwh()
+            x1, y1, x2, y2 = self._tlwh_to_xyxy(box)
+            track_id = track.track_id
+            count.append(int(track_id))
+            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))  # np.int is deprecated in recent NumPy
+        if len(outputs) > 0:
+            outputs = np.stack(outputs, axis=0)
+        return outputs, count, detection_id
+
+    """
+    TODO:
+        Convert bbox from xc_yc_w_h to xtl_ytl_w_h
+    Thanks JieChen91@github.com for reporting this bug!
+    """
+    @staticmethod  # no instance (and no `self`) is required to call this
+    def _xywh_to_tlwh(bbox_xywh):
+        if isinstance(bbox_xywh, np.ndarray):
+            bbox_tlwh = bbox_xywh.copy()
+        elif isinstance(bbox_xywh, torch.Tensor):
+            bbox_tlwh = bbox_xywh.clone()
+        bbox_tlwh[:, 0] = bbox_xywh[:, 0] - bbox_xywh[:, 2] / 2.
+        bbox_tlwh[:, 1] = bbox_xywh[:, 1] - bbox_xywh[:, 3] / 2.
+        return bbox_tlwh
+
+    def _xywh_to_xyxy(self, bbox_xywh):
+        x, y, w, h = bbox_xywh
+        x1 = max(int(x - w / 2), 0)
+        x2 = min(int(x + w / 2), self.width - 1)
+        y1 = max(int(y - h / 2), 0)
+        y2 = min(int(y + h / 2), self.height - 1)
+        return x1, y1, x2, y2
+
+    def _tlwh_to_xyxy(self, bbox_tlwh):
+        """
+        Convert bbox from xtl_ytl_w_h to x1_y1_x2_y2.
+        Thanks JieChen91@github.com for reporting this bug!
+        """
+        x, y, w, h = bbox_tlwh
+        x1 = max(int(x), 0)
+        x2 = min(int(x + w), self.width - 1)
+        y1 = max(int(y), 0)
+        y2 = min(int(y + h), self.height - 1)
+        return x1, y1, x2, y2
+
+    def _xyxy_to_tlwh(self, bbox_xyxy):
+        x1, y1, x2, y2 = bbox_xyxy
+
+        t = x1
+        l = y1
+        w = int(x2 - x1)
+        h = int(y2 - y1)
+        return t, l, w, h
+
+    def _get_features(self, bbox_xywh, ori_img):
+        im_crops = []
+        for box in bbox_xywh:
+            x1, y1, x2, y2 = self._xywh_to_xyxy(box)
+            im = ori_img[y1:y2, x1:x2]
+            im_crops.append(im)
+        if im_crops:
+            features = self.extractor(im_crops)
+        else:
+            features = np.array([])
+        return features
+
+
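
A sketch of driving `DeepSort.update` from a detector (not part of the diff; assumes the reid checkpoint exists and `frame.jpg` is any BGR image). Note that `outputs` stays empty until a track survives `n_init` consecutive frames:

```python
import cv2
import numpy as np

from deep_sort.deep_sort import DeepSort

deepsort = DeepSort("deep_sort/deep/checkpoint/ckpt.t7", use_cuda=False)
frame = cv2.imread("frame.jpg")                      # any BGR frame
bbox_xywh = np.array([[320., 240., 60., 160.]])      # detector boxes, center format
confidences = np.array([0.9])
count = []                                           # accumulates every track id seen

outputs, count, det_id = deepsort.update(bbox_xywh, confidences, frame, count)
# outputs: Nx5 rows of (x1, y1, x2, y2, track_id) for confirmed tracks
```
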
diff --git a/deep_sort/sort/__init__.py b/deep_sort/sort/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/deep_sort/sort/detection.py b/deep_sort/sort/detection.py
new file mode 100644
index 0000000..87fc5fd
--- /dev/null
+++ b/deep_sort/sort/detection.py
@@ -0,0 +1,49 @@
+# vim: expandtab:ts=4:sw=4
+import numpy as np
+
+
+class Detection(object):
+    """
+    This class represents a bounding box detection in a single image.
+
+    Parameters
+    ----------
+    tlwh : array_like
+        Bounding box in format `(x, y, w, h)`.
+    confidence : float
+        Detector confidence score.
+    feature : array_like
+        A feature vector that describes the object contained in this image.
+
+    Attributes
+    ----------
+    tlwh : ndarray
+        Bounding box in format `(top left x, top left y, width, height)`.
+    confidence : float
+        Detector confidence score.
+    feature : ndarray | NoneType
+        A feature vector that describes the object contained in this image.
+
+    """
+
+    def __init__(self, tlwh, confidence, feature):
+        self.tlwh = np.asarray(tlwh, dtype=float)  # np.float is deprecated in recent NumPy
+        self.confidence = float(confidence)
+        self.feature = np.asarray(feature, dtype=np.float32)
+
+    def to_tlbr(self):
+        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
+        `(top left, bottom right)`.
+        """
+        ret = self.tlwh.copy()
+        ret[2:] += ret[:2]
+        return ret
+
+    def to_xyah(self):
+        """Convert bounding box to format `(center x, center y, aspect ratio,
+        height)`, where the aspect ratio is `width / height`.
+        """
+        ret = self.tlwh.copy()
+        ret[:2] += ret[2:] / 2
+        ret[2] /= ret[3]
+        return ret
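
A quick check of the two conversions (a sketch, not part of the diff), for a 40x80 box whose top-left corner is at (10, 20):

```python
import numpy as np

from deep_sort.sort.detection import Detection

det = Detection([10, 20, 40, 80], 0.9, np.zeros(512))
print(det.to_tlbr())   # [10. 20. 50. 100.] -> top-left / bottom-right corners
print(det.to_xyah())   # [30. 60. 0.5 80.]  -> center x, center y, w/h, height
```
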
+ + """ + + def __init__(self, tlwh, confidence, feature): + self.tlwh = np.asarray(tlwh, dtype=np.float) + self.confidence = float(confidence) + self.feature = np.asarray(feature, dtype=np.float32) + + def to_tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + def to_xyah(self): + """Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. + """ + ret = self.tlwh.copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret diff --git a/deep_sort/sort/iou_matching.py b/deep_sort/sort/iou_matching.py new file mode 100644 index 0000000..481e930 --- /dev/null +++ b/deep_sort/sort/iou_matching.py @@ -0,0 +1,81 @@ +# vim: expandtab:ts=4:sw=4 +from __future__ import absolute_import +import numpy as np +from . import linear_assignment + + +def iou(bbox, candidates): + """Computer intersection over union. + + Parameters + ---------- + bbox : ndarray + A bounding box in format `(top left x, top left y, width, height)`. + candidates : ndarray + A matrix of candidate bounding boxes (one per row) in the same format + as `bbox`. + + Returns + ------- + ndarray + The intersection over union in [0, 1] between the `bbox` and each + candidate. A higher score means a larger fraction of the `bbox` is + occluded by the candidate. + + """ + bbox_tl, bbox_br = bbox[:2], bbox[:2] + bbox[2:] + candidates_tl = candidates[:, :2] + candidates_br = candidates[:, :2] + candidates[:, 2:] + + tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis], + np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]] + br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis], + np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]] + wh = np.maximum(0., br - tl) + + area_intersection = wh.prod(axis=1) + area_bbox = bbox[2:].prod() + area_candidates = candidates[:, 2:].prod(axis=1) + return area_intersection / (area_bbox + area_candidates - area_intersection) + + +def iou_cost(tracks, detections, track_indices=None, + detection_indices=None): + """An intersection over union distance metric. + + Parameters + ---------- + tracks : List[deep_sort.track.Track] + A list of tracks. + detections : List[deep_sort.detection.Detection] + A list of detections. + track_indices : Optional[List[int]] + A list of indices to tracks that should be matched. Defaults to + all `tracks`. + detection_indices : Optional[List[int]] + A list of indices to detections that should be matched. Defaults + to all `detections`. + + Returns + ------- + ndarray + Returns a cost matrix of shape + len(track_indices), len(detection_indices) where entry (i, j) is + `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. + + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + cost_matrix = np.zeros((len(track_indices), len(detection_indices))) + for row, track_idx in enumerate(track_indices): + if tracks[track_idx].time_since_update > 1: + cost_matrix[row, :] = linear_assignment.INFTY_COST + continue + + bbox = tracks[track_idx].to_tlwh() + candidates = np.asarray([detections[i].tlwh for i in detection_indices]) + cost_matrix[row, :] = 1. 
diff --git a/deep_sort/sort/kalman_filter.py b/deep_sort/sort/kalman_filter.py
new file mode 100644
index 0000000..1e50a7c
--- /dev/null
+++ b/deep_sort/sort/kalman_filter.py
@@ -0,0 +1,241 @@
+# vim: expandtab:ts=4:sw=4
+import numpy as np
+import scipy.linalg
+
+
+"""
+Table for the 0.95 quantile of the chi-square distribution with N degrees of
+freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
+function and used as Mahalanobis gating threshold.
+"""
+chi2inv95 = {
+    1: 3.8415,
+    2: 5.9915,
+    3: 7.8147,
+    4: 9.4877,
+    5: 11.070,
+    6: 12.592,
+    7: 14.067,
+    8: 15.507,
+    9: 16.919}
+
+
+class KalmanFilter(object):
+    """
+    A simple Kalman filter for tracking bounding boxes in image space.
+
+    The 8-dimensional state space
+
+        x, y, a, h, vx, vy, va, vh
+
+    contains the bounding box center position (x, y), aspect ratio a, height h,
+    and their respective velocities.
+
+    Object motion follows a constant velocity model. The bounding box location
+    (x, y, a, h) is taken as direct observation of the state space (linear
+    observation model).
+
+    """
+
+    def __init__(self):
+        ndim, dt = 4, 1.
+
+        # Create Kalman filter model matrices.
+        self._motion_mat = np.eye(2 * ndim, 2 * ndim)
+        for i in range(ndim):
+            self._motion_mat[i, ndim + i] = dt
+        self._update_mat = np.eye(ndim, 2 * ndim)
+
+        # Motion and observation uncertainty are chosen relative to the current
+        # state estimate. These weights control the amount of uncertainty in
+        # the model. This is a bit hacky.
+        self._std_weight_position = 1. / 20
+        self._std_weight_velocity = 1. / 160
+
+    def initiate(self, measurement):
+        """Create track from unassociated measurement.
+
+        Parameters
+        ----------
+        measurement : ndarray
+            Bounding box coordinates (x, y, a, h) with center position (x, y),
+            aspect ratio a, and height h.
+
+        Returns
+        -------
+        (ndarray, ndarray)
+            Returns the mean vector (8 dimensional) and covariance matrix (8x8
+            dimensional) of the new track. Unobserved velocities are initialized
+            to 0 mean.
+
+        """
+        mean_pos = measurement
+        mean_vel = np.zeros_like(mean_pos)
+        mean = np.r_[mean_pos, mean_vel]
+
+        std = [
+            2 * self._std_weight_position * measurement[3],
+            2 * self._std_weight_position * measurement[3],
+            1e-2,
+            2 * self._std_weight_position * measurement[3],
+            10 * self._std_weight_velocity * measurement[3],
+            10 * self._std_weight_velocity * measurement[3],
+            1e-5,
+            10 * self._std_weight_velocity * measurement[3]]
+        covariance = np.diag(np.square(std))
+        return mean, covariance
+ + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + # np.r_ 按列连接两个矩阵 + # 初始化噪声矩阵 Q 对角矩阵 + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + ## x' = Fx + mean = np.dot(self._motion_mat, mean) + # P' = FPF^T+Q + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + # 将均值向量映射到检测空间,即 Hx' + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + #y = z − Hx ′ + #S = HP ′ H T + R + #K = P ′ H T S −1 + #x = x ′ + Ky + #P = (I − KH)P ′ + # 将均值和协方差映射到检测空间,得到 Hx' 和 S + projected_mean, projected_cov = self.project(mean, covariance) + # 矩阵分解 + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + # 计算卡尔曼增益 K + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + # z - Hx' + innovation = measurement - projected_mean + #x = x ′ + Ky + new_mean = mean + np.dot(innovation, kalman_gain.T) + #P = (I − KH)P ′ ???? + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False): + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. 
+
+    def gating_distance(self, mean, covariance, measurements,
+                        only_position=False):
+        """Compute gating distance between state distribution and measurements.
+
+        A suitable distance threshold can be obtained from `chi2inv95`. If
+        `only_position` is False, the chi-square distribution has 4 degrees of
+        freedom, otherwise 2.
+
+        Parameters
+        ----------
+        mean : ndarray
+            Mean vector over the state distribution (8 dimensional).
+        covariance : ndarray
+            Covariance of the state distribution (8x8 dimensional).
+        measurements : ndarray
+            An Nx4 dimensional matrix of N measurements, each in
+            format (x, y, a, h) where (x, y) is the bounding box center
+            position, a the aspect ratio, and h the height.
+        only_position : Optional[bool]
+            If True, distance computation is done with respect to the bounding
+            box center position only.
+
+        Returns
+        -------
+        ndarray
+            Returns an array of length N, where the i-th element contains the
+            squared Mahalanobis distance between (mean, covariance) and
+            `measurements[i]`.
+
+        """
+        mean, covariance = self.project(mean, covariance)
+        if only_position:
+            mean, covariance = mean[:2], covariance[:2, :2]
+            measurements = measurements[:, :2]
+
+        cholesky_factor = np.linalg.cholesky(covariance)
+        d = measurements - mean
+        z = scipy.linalg.solve_triangular(
+            cholesky_factor, d.T, lower=True, check_finite=False,
+            overwrite_b=True)
+        squared_maha = np.sum(z * z, axis=0)
+        return squared_maha
diff --git a/deep_sort/sort/linear_assignment.py b/deep_sort/sort/linear_assignment.py
new file mode 100644
index 0000000..f1238dc
--- /dev/null
+++ b/deep_sort/sort/linear_assignment.py
@@ -0,0 +1,193 @@
+# vim: expandtab:ts=4:sw=4
+from __future__ import absolute_import
+import numpy as np
+# from sklearn.utils.linear_assignment_ import linear_assignment
+from scipy.optimize import linear_sum_assignment as linear_assignment
+from . import kalman_filter
+
+
+INFTY_COST = 1e+5
+
+
+def min_cost_matching(
+        distance_metric, max_distance, tracks, detections, track_indices=None,
+        detection_indices=None):
+    """Solve linear assignment problem.
+
+    Parameters
+    ----------
+    distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
+        The distance metric is given a list of tracks and detections as well as
+        a list of N track indices and M detection indices. The metric should
+        return the NxM dimensional cost matrix, where element (i, j) is the
+        association cost between the i-th track in the given track indices and
+        the j-th detection in the given detection_indices.
+    max_distance : float
+        Gating threshold. Associations with cost larger than this value are
+        disregarded.
+    tracks : List[track.Track]
+        A list of predicted tracks at the current time step.
+    detections : List[detection.Detection]
+        A list of detections at the current time step.
+    track_indices : List[int]
+        List of track indices that maps rows in `cost_matrix` to tracks in
+        `tracks` (see description above).
+    detection_indices : List[int]
+        List of detection indices that maps columns in `cost_matrix` to
+        detections in `detections` (see description above).
+
+    Returns
+    -------
+    (List[(int, int)], List[int], List[int])
+        Returns a tuple with the following three entries:
+        * A list of matched track and detection indices.
+        * A list of unmatched track indices.
+        * A list of unmatched detection indices.
+
+    """
+    if track_indices is None:
+        track_indices = np.arange(len(tracks))
+    if detection_indices is None:
+        detection_indices = np.arange(len(detections))
+
+    if len(detection_indices) == 0 or len(track_indices) == 0:
+        return [], track_indices, detection_indices  # Nothing to match.
+
+    cost_matrix = distance_metric(
+        tracks, detections, track_indices, detection_indices)
+    cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
+
+    # Hungarian (Kuhn-Munkres) assignment
+    row_indices, col_indices = linear_assignment(cost_matrix)
+
+    matches, unmatched_tracks, unmatched_detections = [], [], []
+    for col, detection_idx in enumerate(detection_indices):
+        if col not in col_indices:
+            unmatched_detections.append(detection_idx)
+    for row, track_idx in enumerate(track_indices):
+        if row not in row_indices:
+            unmatched_tracks.append(track_idx)
+    for row, col in zip(row_indices, col_indices):
+        track_idx = track_indices[row]
+        detection_idx = detection_indices[col]
+        if cost_matrix[row, col] > max_distance:
+            unmatched_tracks.append(track_idx)
+            unmatched_detections.append(detection_idx)
+        else:
+            matches.append((track_idx, detection_idx))
+    return matches, unmatched_tracks, unmatched_detections
+
+
+def matching_cascade(
+        distance_metric, max_distance, cascade_depth, tracks, detections,
+        track_indices=None, detection_indices=None):
+    """Run matching cascade.
+
+    Parameters
+    ----------
+    distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
+        The distance metric is given a list of tracks and detections as well as
+        a list of N track indices and M detection indices. The metric should
+        return the NxM dimensional cost matrix, where element (i, j) is the
+        association cost between the i-th track in the given track indices and
+        the j-th detection in the given detection indices.
+    max_distance : float
+        Gating threshold. Associations with cost larger than this value are
+        disregarded.
+    cascade_depth: int
+        The cascade depth, should be set to the maximum track age.
+    tracks : List[track.Track]
+        A list of predicted tracks at the current time step.
+    detections : List[detection.Detection]
+        A list of detections at the current time step.
+    track_indices : Optional[List[int]]
+        List of track indices that maps rows in `cost_matrix` to tracks in
+        `tracks` (see description above). Defaults to all tracks.
+    detection_indices : Optional[List[int]]
+        List of detection indices that maps columns in `cost_matrix` to
+        detections in `detections` (see description above). Defaults to all
+        detections.
+
+    Returns
+    -------
+    (List[(int, int)], List[int], List[int])
+        Returns a tuple with the following three entries:
+        * A list of matched track and detection indices.
+        * A list of unmatched track indices.
+        * A list of unmatched detection indices.
+
+    """
+    if track_indices is None:
+        track_indices = list(range(len(tracks)))
+    if detection_indices is None:
+        detection_indices = list(range(len(detections)))
+
+    unmatched_detections = detection_indices
+    matches = []
+    for level in range(cascade_depth):
+        if len(unmatched_detections) == 0:  # No detections left
+            break
+
+        track_indices_l = [
+            k for k in track_indices
+            if tracks[k].time_since_update == 1 + level  # match in age order, level 0 first: time_since_update resets to 0 on every update
+        ]
+        if len(track_indices_l) == 0:  # Nothing to match at this level; each level keeps a different track_indices_l
+            continue
+
+        matches_l, _, unmatched_detections = \
+            min_cost_matching(
+                distance_metric, max_distance, tracks, detections,
+                track_indices_l, unmatched_detections)
+        matches += matches_l
+    unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches))
+    return matches, unmatched_tracks, unmatched_detections
+
+
+def gate_cost_matrix(
+        kf, cost_matrix, tracks, detections, track_indices, detection_indices,
+        gated_cost=INFTY_COST, only_position=False):
+    """Invalidate infeasible entries in cost matrix based on the state
+    distributions obtained by Kalman filtering.
+
+    Parameters
+    ----------
+    kf : The Kalman filter.
+    cost_matrix : ndarray
+        The NxM dimensional cost matrix, where N is the number of track indices
+        and M is the number of detection indices, such that entry (i, j) is the
+        association cost between `tracks[track_indices[i]]` and
+        `detections[detection_indices[j]]`.
+    tracks : List[track.Track]
+        A list of predicted tracks at the current time step.
+    detections : List[detection.Detection]
+        A list of detections at the current time step.
+    track_indices : List[int]
+        List of track indices that maps rows in `cost_matrix` to tracks in
+        `tracks` (see description above).
+    detection_indices : List[int]
+        List of detection indices that maps columns in `cost_matrix` to
+        detections in `detections` (see description above).
+    gated_cost : Optional[float]
+        Entries in the cost matrix corresponding to infeasible associations are
+        set to this value. Defaults to a very large value.
+    only_position : Optional[bool]
+        If True, only the x, y position of the state distribution is considered
+        during gating. Defaults to False.
+
+    Returns
+    -------
+    ndarray
+        Returns the modified cost matrix.
+
+    """
+    gating_dim = 2 if only_position else 4
+    gating_threshold = kalman_filter.chi2inv95[gating_dim]
+    measurements = np.asarray(
+        [detections[i].to_xyah() for i in detection_indices])
+    for row, track_idx in enumerate(track_indices):
+        track = tracks[track_idx]
+        gating_distance = kf.gating_distance(
+            track.mean, track.covariance, measurements, only_position)
+        cost_matrix[row, gating_distance > gating_threshold] = gated_cost
+    return cost_matrix
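
At its core `min_cost_matching` delegates to SciPy's Hungarian solver; a toy example (a sketch, not part of the diff):

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.1, 0.9],    # track 0 is close to detection 0
                 [0.8, 0.2]])   # track 1 is close to detection 1
rows, cols = linear_sum_assignment(cost)
print([(int(r), int(c)) for r, c in zip(rows, cols)])  # [(0, 0), (1, 1)]
```
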
+ + """ + if track_indices is None: + track_indices = list(range(len(tracks))) + if detection_indices is None: + detection_indices = list(range(len(detections))) + + unmatched_detections = detection_indices + matches = [] + for level in range(cascade_depth): + if len(unmatched_detections) == 0: # No detections left + break + + track_indices_l = [ + k for k in track_indices + if tracks[k].time_since_update == 1 + level #按照匹配先后顺序进行匹配,首先是level=0,因为每次update time_since_update归0 + ] + if len(track_indices_l) == 0: # Nothing to match at this level 不同level 保留的track_indices_l不同 + continue + + matches_l, _, unmatched_detections = \ + min_cost_matching( + distance_metric, max_distance, tracks, detections, + track_indices_l, unmatched_detections) + matches += matches_l + unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) + return matches, unmatched_tracks, unmatched_detections + + +def gate_cost_matrix( + kf, cost_matrix, tracks, detections, track_indices, detection_indices, + gated_cost=INFTY_COST, only_position=False): + """Invalidate infeasible entries in cost matrix based on the state + distributions obtained by Kalman filtering. + + Parameters + ---------- + kf : The Kalman filter. + cost_matrix : ndarray + The NxM dimensional cost matrix, where N is the number of track indices + and M is the number of detection indices, such that entry (i, j) is the + association cost between `tracks[track_indices[i]]` and + `detections[detection_indices[j]]`. + tracks : List[track.Track] + A list of predicted tracks at the current time step. + detections : List[detection.Detection] + A list of detections at the current time step. + track_indices : List[int] + List of track indices that maps rows in `cost_matrix` to tracks in + `tracks` (see description above). + detection_indices : List[int] + List of detection indices that maps columns in `cost_matrix` to + detections in `detections` (see description above). + gated_cost : Optional[float] + Entries in the cost matrix corresponding to infeasible associations are + set this value. Defaults to a very large value. + only_position : Optional[bool] + If True, only the x, y position of the state distribution is considered + during gating. Defaults to False. + + Returns + ------- + ndarray + Returns the modified cost matrix. + + """ + gating_dim = 2 if only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray( + [detections[i].to_xyah() for i in detection_indices]) + for row, track_idx in enumerate(track_indices): + track = tracks[track_idx] + gating_distance = kf.gating_distance( + track.mean, track.covariance, measurements, only_position) + cost_matrix[row, gating_distance > gating_threshold] = gated_cost + return cost_matrix diff --git a/deep_sort/sort/nn_matching.py b/deep_sort/sort/nn_matching.py new file mode 100644 index 0000000..6c7d68c --- /dev/null +++ b/deep_sort/sort/nn_matching.py @@ -0,0 +1,178 @@ +# vim: expandtab:ts=4:sw=4 +import numpy as np + + +def _pdist(a, b): + """Compute pair-wise squared distance between points in `a` and `b`. + + Parameters + ---------- + a : array_like + An NxM matrix of N samples of dimensionality M. + b : array_like + An LxM matrix of L samples of dimensionality M. + + Returns + ------- + ndarray + Returns a matrix of size len(a), len(b) such that eleement (i, j) + contains the squared distance between `a[i]` and `b[j]`. 
+ + """ + a, b = np.asarray(a), np.asarray(b) + if len(a) == 0 or len(b) == 0: + return np.zeros((len(a), len(b))) + a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1) + r2 = -2. * np.dot(a, b.T) + a2[:, None] + b2[None, :] + r2 = np.clip(r2, 0., float(np.inf)) + return r2 + + +def _cosine_distance(a, b, data_is_normalized=False): + """Compute pair-wise cosine distance between points in `a` and `b`. + + Parameters + ---------- + a : array_like + An NxM matrix of N samples of dimensionality M. + b : array_like + An LxM matrix of L samples of dimensionality M. + data_is_normalized : Optional[bool] + If True, assumes rows in a and b are unit length vectors. + Otherwise, a and b are explicitly normalized to lenght 1. + + Returns + ------- + ndarray + Returns a matrix of size len(a), len(b) such that eleement (i, j) + contains the squared distance between `a[i]` and `b[j]`. + + """ + if not data_is_normalized: + a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True) ##求取二范数 + b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True) + return 1. - np.dot(a, b.T) + + +def _nn_euclidean_distance(x, y): + """ Helper function for nearest neighbor distance metric (Euclidean). + + Parameters + ---------- + x : ndarray + A matrix of N row-vectors (sample points). + y : ndarray + A matrix of M row-vectors (query points). + + Returns + ------- + ndarray + A vector of length M that contains for each entry in `y` the + smallest Euclidean distance to a sample in `x`. + + """ + distances = _pdist(x, y) + return np.maximum(0.0, distances.min(axis=0)) + + +def _nn_cosine_distance(x, y): + """ Helper function for nearest neighbor distance metric (cosine). + + Parameters + ---------- + x : ndarray + A matrix of N row-vectors (sample points). + y : ndarray + A matrix of M row-vectors (query points). + + Returns + ------- + ndarray + A vector of length M that contains for each entry in `y` the + smallest cosine distance to a sample in `x`. + + """ + distances = _cosine_distance(x, y) + #smallest cosine distance to a sample in `x` [1 , ...] + return distances.min(axis=0) + + +class NearestNeighborDistanceMetric(object): + """ + A nearest neighbor distance metric that, for each target, returns + the closest distance to any sample that has been observed so far. + + Parameters + ---------- + metric : str + Either "euclidean" or "cosine". + matching_threshold: float + The matching threshold. Samples with larger distance are considered an + invalid match. + budget : Optional[int] + If not None, fix samples per class to at most this number. Removes + the oldest samples when the budget is reached. + + Attributes + ---------- + samples : Dict[int -> List[ndarray]] + A dictionary that maps from target identities to the list of samples + that have been observed so far. + + """ + + def __init__(self, metric, matching_threshold, budget=None): + + + if metric == "euclidean": + self._metric = _nn_euclidean_distance + elif metric == "cosine": + self._metric = _nn_cosine_distance + else: + raise ValueError( + "Invalid metric; must be either 'euclidean' or 'cosine'") + self.matching_threshold = matching_threshold + self.budget = budget + self.samples = {} + + def partial_fit(self, features, targets, active_targets): + """Update the distance metric with new data. + + Parameters + ---------- + features : ndarray + An NxM matrix of N features of dimensionality M. + targets : ndarray + An integer array of associated target identities. 
diff --git a/deep_sort/sort/track.py b/deep_sort/sort/track.py
new file mode 100644
index 0000000..550f9ca
--- /dev/null
+++ b/deep_sort/sort/track.py
@@ -0,0 +1,172 @@
+# vim: expandtab:ts=4:sw=4
+
+
+class TrackState:
+    """
+    Enumeration type for the single target track state. Newly created tracks are
+    classified as `tentative` until enough evidence has been collected. Then,
+    the track state is changed to `confirmed`. Tracks that are no longer alive
+    are classified as `deleted` to mark them for removal from the set of active
+    tracks.
+
+    """
+
+    Tentative = 1
+    Confirmed = 2
+    Deleted = 3
+
+
+class Track:
+    """
+    A single target track with state space `(x, y, a, h)` and associated
+    velocities, where `(x, y)` is the center of the bounding box, `a` is the
+    aspect ratio and `h` is the height.
+
+    Parameters
+    ----------
+    mean : ndarray
+        Mean vector of the initial state distribution.
+    covariance : ndarray
+        Covariance matrix of the initial state distribution.
+    track_id : int
+        A unique track identifier.
+    n_init : int
+        Number of consecutive detections before the track is confirmed. The
+        track state is set to `Deleted` if a miss occurs within the first
+        `n_init` frames.
+    max_age : int
+        The maximum number of consecutive misses before the track state is
+        set to `Deleted`.
+    feature : Optional[ndarray]
+        Feature vector of the detection this track originates from. If not None,
+        this feature is added to the `features` cache.
+
+    Attributes
+    ----------
+    mean : ndarray
+        Mean vector of the initial state distribution.
+    covariance : ndarray
+        Covariance matrix of the initial state distribution.
+    track_id : int
+        A unique track identifier.
+    hits : int
+        Total number of measurement updates.
+    age : int
+        Total number of frames since first occurrence.
+    time_since_update : int
+        Total number of frames since last measurement update.
+    state : TrackState
+        The current track state.
+    features : List[ndarray]
+        A cache of features. On each measurement update, the associated feature
+        vector is added to this list.
+
+    """
+
+    def __init__(self, mean, covariance, track_id, n_init, max_age,
+                 feature=None):
+        self.mean = mean
+        self.covariance = covariance
+        self.track_id = track_id
+        # hits is compared against n_init: it is incremented on every update()
+        # call (updates only happen on a match), so hits counts successful
+        # matches; once hits reaches n_init the track becomes Confirmed
+        self.hits = 1
+        self.age = 1  # effectively unused; duplicates time_since_update bookkeeping
+        # incremented on every predict() call, reset to 0 on every update() call
+        self.time_since_update = 0
+
+        self.state = TrackState.Tentative  # newly created tracks are classified as `tentative` until enough evidence has been collected
+        # each track caches several features; every update appends the newest feature to the list
+        self.features = []
+        if feature is not None:
+            self.features.append(feature)
+
+        self._n_init = n_init
+        self._max_age = max_age
+
+    def to_tlwh(self):
+        """Get current position in bounding box format `(top left x, top left y,
+        width, height)`.
+
+        Returns
+        -------
+        ndarray
+            The bounding box.
+
+        """
+        ret = self.mean[:4].copy()
+        ret[2] *= ret[3]
+        ret[:2] -= ret[2:] / 2
+        return ret
+
+    def to_tlbr(self):
+        """Get current position in bounding box format `(min x, min y, max x,
+        max y)`.
+
+        Returns
+        -------
+        ndarray
+            The bounding box.
+
+        """
+        ret = self.to_tlwh()
+        ret[2:] = ret[:2] + ret[2:]
+        return ret
+
+    def predict(self, kf):
+        """Propagate the state distribution to the current time step using a
+        Kalman filter prediction step.
+
+        Parameters
+        ----------
+        kf : kalman_filter.KalmanFilter
+            The Kalman filter.
+
+        """
+        self.mean, self.covariance = kf.predict(self.mean, self.covariance)
+        self.age += 1
+        self.time_since_update += 1
+
+    def update(self, kf, detection):
+        """Perform Kalman filter measurement update step and update the feature
+        cache.
+
+        Parameters
+        ----------
+        kf : kalman_filter.KalmanFilter
+            The Kalman filter.
+        detection : Detection
+            The associated detection.
+
+        """
+        self.mean, self.covariance = kf.update(
+            self.mean, self.covariance, detection.to_xyah())
+        self.features.append(detection.feature)
+
+        self.hits += 1
+        self.time_since_update = 0
+        if self.state == TrackState.Tentative and self.hits >= self._n_init:
+            self.state = TrackState.Confirmed
+
+    def mark_missed(self):
+        """Mark this track as missed (no association at the current time step).
+        """
+        if self.state == TrackState.Tentative:  # a track that was never matched by a detection is deleted outright (rare)
+            self.state = TrackState.Deleted
+        elif self.time_since_update > self._max_age:  # a previously matched track that has gone unmatched for too long is deleted
+            self.state = TrackState.Deleted
+
+    def is_tentative(self):
+        """Returns True if this track is tentative (unconfirmed).
+        """
+        return self.state == TrackState.Tentative
+
+    def is_confirmed(self):
+        """Returns True if this track is confirmed."""
+        return self.state == TrackState.Confirmed
+
+    def is_deleted(self):
+        """Returns True if this track is dead and should be deleted."""
+        return self.state == TrackState.Deleted
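
The state machine above can be exercised directly. A sketch (not part of the diff) showing a track moving from Tentative to Confirmed after `n_init` matched frames:

```python
import numpy as np

from deep_sort.sort.detection import Detection
from deep_sort.sort.kalman_filter import KalmanFilter
from deep_sort.sort.track import Track

kf = KalmanFilter()
mean, cov = kf.initiate(np.array([30., 60., 0.5, 80.]))  # (x, y, a, h)
track = Track(mean, cov, track_id=1, n_init=3, max_age=70)
print(track.is_tentative())        # True: one hit so far

det = Detection([10, 20, 40, 80], 0.9, np.zeros(512))
for _ in range(2):                 # two more matched frames
    track.predict(kf)
    track.update(kf, det)
print(track.is_confirmed())        # True: hits reached n_init
```
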
+ + """ + + Tentative = 1 + Confirmed = 2 + Deleted = 3 + + +class Track: + """ + A single target track with state space `(x, y, a, h)` and associated + velocities, where `(x, y)` is the center of the bounding box, `a` is the + aspect ratio and `h` is the height. + + Parameters + ---------- + mean : ndarray + Mean vector of the initial state distribution. + covariance : ndarray + Covariance matrix of the initial state distribution. + track_id : int + A unique track identifier. + n_init : int + Number of consecutive detections before the track is confirmed. The + track state is set to `Deleted` if a miss occurs within the first + `n_init` frames. + max_age : int + The maximum number of consecutive misses before the track state is + set to `Deleted`. + feature : Optional[ndarray] + Feature vector of the detection this track originates from. If not None, + this feature is added to the `features` cache. + + Attributes + ---------- + mean : ndarray + Mean vector of the initial state distribution. + covariance : ndarray + Covariance matrix of the initial state distribution. + track_id : int + A unique track identifier. + hits : int + Total number of measurement updates. + age : int + Total number of frames since first occurance. + time_since_update : int + Total number of frames since last measurement update. + state : TrackState + The current track state. + features : List[ndarray] + A cache of features. On each measurement update, the associated feature + vector is added to this list. + + """ + + def __init__(self, mean, covariance, track_id, n_init, max_age, + feature=None): + self.mean = mean + self.covariance = covariance + self.track_id = track_id + # hits 和 n_init 进行比较 + # hits 每次 update 的时候进行一次更新(只有 match 的时候才进行 update ) + # hits 代表匹配上了多少次,匹配次数超过 n_init 就会设置为 confirmed 状态 + self.hits = 1 + self.age = 1 # 没有用到,和 time_since_update 功能重复 + # 每次调用 predict 函数的时候就会 +1 + # 每次调用 update 函数的时候就会设置为 0 + self.time_since_update = 0 + + self.state = TrackState.Tentative ##Newly created tracks areclassified as `tentative` until enough evidence has been collected. + # 每个 track 对应多个 features, 每次更新都将最新的 feature 添加到列表中 + self.features = [] + if feature is not None: + self.features.append(feature) + + self._n_init = n_init + self._max_age = max_age + + def to_tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + + Returns + ------- + ndarray + The bounding box. + + """ + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + def to_tlbr(self): + """Get current position in bounding box format `(min x, miny, max x, + max y)`. + + Returns + ------- + ndarray + The bounding box. + + """ + ret = self.to_tlwh() + ret[2:] = ret[:2] + ret[2:] + return ret + + def predict(self, kf): + """Propagate the state distribution to the current time step using a + Kalman filter prediction step. + + Parameters + ---------- + kf : kalman_filter.KalmanFilter + The Kalman filter. + + """ + self.mean, self.covariance = kf.predict(self.mean, self.covariance) + self.age += 1 + self.time_since_update += 1 + + def update(self, kf, detection): + """Perform Kalman filter measurement update step and update the feature + cache. + + Parameters + ---------- + kf : kalman_filter.KalmanFilter + The Kalman filter. + detection : Detection + The associated detection. 
+
+        """
+        self.mean, self.covariance = kf.update(
+            self.mean, self.covariance, detection.to_xyah())
+        self.features.append(detection.feature)
+
+        self.hits += 1
+        self.time_since_update = 0
+        if self.state == TrackState.Tentative and self.hits >= self._n_init:
+            self.state = TrackState.Confirmed
+
+    def mark_missed(self):
+        """Mark this track as missed (no association at the current time step).
+        """
+        if self.state == TrackState.Tentative:  # a tentative track that was never matched to a detection is deleted right away (a rare case)
+            self.state = TrackState.Deleted
+        elif self.time_since_update > self._max_age:  # a track that was matched before but has gone unmatched for too long is deleted
+            self.state = TrackState.Deleted
+
+    def is_tentative(self):
+        """Returns True if this track is tentative (unconfirmed).
+        """
+        return self.state == TrackState.Tentative
+
+    def is_confirmed(self):
+        """Returns True if this track is confirmed."""
+        return self.state == TrackState.Confirmed
+
+    def is_deleted(self):
+        """Returns True if this track is dead and should be deleted."""
+        return self.state == TrackState.Deleted
diff --git a/deep_sort/sort/tracker.py b/deep_sort/sort/tracker.py
new file mode 100644
index 0000000..6eb0a82
--- /dev/null
+++ b/deep_sort/sort/tracker.py
@@ -0,0 +1,141 @@
+# vim: expandtab:ts=4:sw=4
+from __future__ import absolute_import
+import numpy as np
+from . import kalman_filter
+from . import linear_assignment
+from . import iou_matching
+from .track import Track
+
+
+class Tracker:
+    """
+    This is the multi-target tracker.
+
+    Parameters
+    ----------
+    metric : nn_matching.NearestNeighborDistanceMetric
+        A distance metric for measurement-to-track association.
+    max_age : int
+        Maximum number of consecutive misses before a track is deleted.
+    n_init : int
+        Number of consecutive detections before the track is confirmed. The
+        track state is set to `Deleted` if a miss occurs within the first
+        `n_init` frames.
+
+    Attributes
+    ----------
+    metric : nn_matching.NearestNeighborDistanceMetric
+        The distance metric used for measurement-to-track association.
+    max_age : int
+        Maximum number of consecutive misses before a track is deleted.
+    n_init : int
+        Number of frames that a track remains in initialization phase.
+    kf : kalman_filter.KalmanFilter
+        A Kalman filter to filter target trajectories in image space.
+    tracks : List[Track]
+        The list of active tracks at the current time step.
+
+    """
+
+    def __init__(self, metric, max_iou_distance=0.7, max_age=70, n_init=3):
+        # `metric` is a distance-metric object (cosine or Mahalanobis
+        # distance) used for measurement-to-track association.
+        self.metric = metric
+        self.max_iou_distance = max_iou_distance
+        self.max_age = max_age
+        self.n_init = n_init
+
+        self.kf = kalman_filter.KalmanFilter()
+        self.tracks = []
+        self._next_id = 1
+
+    def predict(self):
+        """Propagate track state distributions one time step forward.
+
+        This function should be called once every time step, before `update`.
+        """
+        for track in self.tracks:
+            track.predict(self.kf)
+
+    def update(self, detections):
+        """Perform measurement update and track management.
+
+        Parameters
+        ----------
+        detections : List[deep_sort.detection.Detection]
+            A list of detections at the current time step.
+
+        """
+        # Run matching cascade.
+        matches, unmatched_tracks, unmatched_detections = \
+            self._match(detections)
+
+        # Update track set.
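+        # Matched tracks get a Kalman measurement update; unmatched tracks
+        # are marked missed (and possibly deleted); unmatched detections
+        # spawn new tentative tracks.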
+        for track_idx, detection_idx in matches:
+            self.tracks[track_idx].update(
+                self.kf, detections[detection_idx])
+        for track_idx in unmatched_tracks:
+            self.tracks[track_idx].mark_missed()
+        for detection_idx in unmatched_detections:
+            self._initiate_track(detections[detection_idx])
+        self.tracks = [t for t in self.tracks if not t.is_deleted()]
+
+        # Update distance metric.
+        active_targets = [t.track_id for t in self.tracks if t.is_confirmed()]
+        features, targets = [], []
+        for track in self.tracks:
+            if not track.is_confirmed():
+                continue
+            features += track.features
+            targets += [track.track_id for _ in track.features]
+            track.features = []  # reset the cache so the next update collects fresh features
+        self.metric.partial_fit(
+            np.asarray(features), np.asarray(targets), active_targets)
+
+    def _match(self, detections):
+
+        def gated_metric(tracks, dets, track_indices, detection_indices):
+            features = np.array([dets[i].feature for i in detection_indices])
+            targets = np.array([tracks[i].track_id for i in track_indices])
+            # 1. Appearance term: build the cost matrix from nearest-neighbor
+            #    cosine distance between detection embeddings and track features.
+            cost_matrix = self.metric.distance(features, targets)
+            # 2. Motion term: gate the cost matrix with the Mahalanobis
+            #    distance to the Kalman-predicted track state.
+            cost_matrix = linear_assignment.gate_cost_matrix(
+                self.kf, cost_matrix, tracks, dets, track_indices,
+                detection_indices)
+
+            return cost_matrix
+
+        # Split track set into confirmed and unconfirmed tracks.
+        confirmed_tracks = [
+            i for i, t in enumerate(self.tracks) if t.is_confirmed()]
+        unconfirmed_tracks = [
+            i for i, t in enumerate(self.tracks) if not t.is_confirmed()]
+
+        # Associate confirmed tracks using appearance features. Note that
+        # `gated_metric` is passed uncalled (no parentheses); it is invoked
+        # inside `matching_cascade`.
+        matches_a, unmatched_tracks_a, unmatched_detections = \
+            linear_assignment.matching_cascade(
+                gated_metric, self.metric.matching_threshold, self.max_age,
+                self.tracks, detections, confirmed_tracks)
+
+        # Associate remaining tracks together with unconfirmed tracks using IOU.
+        iou_track_candidates = unconfirmed_tracks + [
+            k for k in unmatched_tracks_a if
+            self.tracks[k].time_since_update == 1]
+        unmatched_tracks_a = [
+            k for k in unmatched_tracks_a if
+            self.tracks[k].time_since_update != 1]
+        matches_b, unmatched_tracks_b, unmatched_detections = \
+            linear_assignment.min_cost_matching(
+                iou_matching.iou_cost, self.max_iou_distance, self.tracks,
+                detections, iou_track_candidates, unmatched_detections)
+
+        matches = matches_a + matches_b
+        unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))
+        return matches, unmatched_tracks, unmatched_detections
+
+    def _initiate_track(self, detection):
+        mean, covariance = self.kf.initiate(detection.to_xyah())
+        self.tracks.append(Track(
+            mean, covariance, self._next_id, self.n_init, self.max_age,
+            detection.feature))
+        self._next_id += 1
diff --git a/detector/YOLOv3/README.md b/detector/YOLOv3/README.md
new file mode 100644
index 0000000..ef8e168
--- /dev/null
+++ b/detector/YOLOv3/README.md
@@ -0,0 +1,11 @@
+# YOLOv3 for detection
+
+This is an implementation of YOLOv3 with only the forward part.
+
+If you want to train YOLOv3 on your custom dataset, please search `YOLOv3` on GitHub.
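+
+For a quick sanity check of the forward pass, something like the sketch below
+should work from the repository root. The paths mirror `configs/yolov3.yaml`;
+the exact constructor signature and return convention are defined in
+`detector.py`, so the keyword names and outputs shown here are assumptions,
+not a reference:
+
+```python
+import cv2
+from detector.YOLOv3 import YOLOv3
+
+# Paths taken from configs/yolov3.yaml; the keyword names are assumed.
+detector = YOLOv3("detector/YOLOv3/cfg/yolov4.cfg",
+                  "detector/YOLOv3/weight/yolov4.weights",
+                  "detector/YOLOv3/cfg/coco.names",
+                  score_thresh=0.1, nms_thresh=0.4)
+
+img = cv2.imread("demo.jpg")  # any BGR test image (hypothetical file)
+bbox_xywh, cls_conf, cls_ids = detector(img)  # assumed return: boxes, scores, class ids
+```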
+
+## Quick forward
+```bash
+cd YOLOv3
+python
+```
\ No newline at end of file
diff --git a/detector/YOLOv3/__init__.py b/detector/YOLOv3/__init__.py
new file mode 100644
index 0000000..fff6a61
--- /dev/null
+++ b/detector/YOLOv3/__init__.py
@@ -0,0 +1,9 @@
+import sys
+sys.path.append("detector/YOLOv3")
+
+
+from .detector import YOLOv3
+__all__ = ['YOLOv3']
+
+
+
diff --git a/detector/YOLOv3/cfg.py b/detector/YOLOv3/cfg.py
new file mode 100644
index 0000000..9b2a0e7
--- /dev/null
+++ b/detector/YOLOv3/cfg.py
@@ -0,0 +1,248 @@
+import torch
+from .yolo_utils import convert2cpu
+
+
+def parse_cfg(cfgfile):
+    blocks = []
+    fp = open(cfgfile)
+    block = None
+    line = fp.readline()
+    while line != '':
+        line = line.rstrip()
+        if line == '' or line[0] == '#':
+            line = fp.readline()
+            continue
+        elif line[0] == '[':
+            if block:
+                blocks.append(block)
+            block = dict()
+            block['type'] = line.lstrip('[').rstrip(']')
+            # set default value
+            if block['type'] == 'convolutional':
+                block['batch_normalize'] = 0
+        else:
+            key, value = line.split('=')
+            key = key.strip()
+            if key == 'type':
+                key = '_type'
+            value = value.strip()
+            block[key] = value
+        line = fp.readline()
+
+    if block:
+        blocks.append(block)
+    fp.close()
+    return blocks
+
+
+def print_cfg(blocks):
+    print('layer filters size input output')
+    prev_width = 416
+    prev_height = 416
+    prev_filters = 3
+    out_filters = []
+    out_widths = []
+    out_heights = []
+    ind = -2
+    for block in blocks:
+        ind += 1
+        if block['type'] == 'net':
+            prev_width = int(block['width'])
+            prev_height = int(block['height'])
+            continue
+        elif block['type'] == 'convolutional':
+            filters = int(block['filters'])
+            kernel_size = int(block['size'])
+            stride = int(block['stride'])
+            is_pad = int(block['pad'])
+            pad = (kernel_size - 1) // 2 if is_pad else 0
+            width = (prev_width + 2 * pad - kernel_size) // stride + 1
+            height = (prev_height + 2 * pad - kernel_size) // stride + 1
+            print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
+                ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width,
+                height, filters))
+            prev_width = width
+            prev_height = height
+            prev_filters = filters
+            out_widths.append(prev_width)
+            out_heights.append(prev_height)
+            out_filters.append(prev_filters)
+        elif block['type'] == 'maxpool':
+            pool_size = int(block['size'])
+            stride = int(block['stride'])
+            width = prev_width // stride
+            height = prev_height // stride
+            # a maxpool layer preserves the channel count
+            print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % (
+                ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, prev_filters))
+            prev_width = width
+            prev_height = height
+            out_widths.append(prev_width)
+            out_heights.append(prev_height)
+            out_filters.append(prev_filters)
+        elif block['type'] == 'avgpool':
+            width = 1
+            height = 1
+            # global average pooling also preserves the channel count
+            print('%5d %-6s %3d x %3d x%4d -> %3d' % (
+                ind, 'avg', prev_width, prev_height, prev_filters, prev_filters))
+            prev_width = width
+            prev_height = height
+            out_widths.append(prev_width)
+            out_heights.append(prev_height)
+            out_filters.append(prev_filters)
+        elif block['type'] == 'softmax':
+            print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters))
+            out_widths.append(prev_width)
+            out_heights.append(prev_height)
+            out_filters.append(prev_filters)
+        elif block['type'] == 'cost':
+            print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters))
+            out_widths.append(prev_width)
+            out_heights.append(prev_height)
+            out_filters.append(prev_filters)
+        elif
block['type'] == 'reorg': + stride = int(block['stride']) + filters = stride * stride * prev_filters + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'upsample': + stride = int(block['stride']) + filters = prev_filters + width = prev_width * stride + height = prev_height * stride + print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + print('%5d %-6s %d' % (ind, 'route', layers[0])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + prev_filters = out_filters[layers[0]] + elif len(layers) == 2: + print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]]) + assert (prev_height == out_heights[layers[1]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] in ['region', 'yolo']: + print('%5d %-6s' % (ind, 'detection')) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'shortcut': + from_id = int(block['from']) + from_id = from_id if from_id > 0 else from_id + ind + print('%5d %-6s %d' % (ind, 'shortcut', from_id)) + prev_width = out_widths[from_id] + prev_height = out_heights[from_id] + prev_filters = out_filters[from_id] + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'connected': + filters = int(block['output']) + print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) + prev_filters = filters + out_widths.append(1) + out_heights.append(1) + out_filters.append(prev_filters) + else: + print('unknown type %s' % (block['type'])) + + +def load_conv(buf, start, conv_model): + num_w = conv_model.weight.numel() + num_b = conv_model.bias.numel() + # print("start: {}, num_w: {}, num_b: {}".format(start, num_w, num_b)) + # by ysyun, use .view_as() + conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b]).view_as(conv_model.bias.data)); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).view_as(conv_model.weight.data)); + start = start + num_w + return start + + +def save_conv(fp, conv_model): + if conv_model.bias.is_cuda: + convert2cpu(conv_model.bias.data).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + conv_model.bias.data.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_conv_bn(buf, start, conv_model, bn_model): + num_w = conv_model.weight.numel() + num_b = bn_model.bias.numel() + bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + 
start = start + num_b + bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + # conv_model.weight.data.copy_(torch.from_numpy(buf[start:start+num_w])); start = start + num_w + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).view_as(conv_model.weight.data)); + start = start + num_w + return start + + +def save_conv_bn(fp, conv_model, bn_model): + if bn_model.bias.is_cuda: + convert2cpu(bn_model.bias.data).numpy().tofile(fp) + convert2cpu(bn_model.weight.data).numpy().tofile(fp) + convert2cpu(bn_model.running_mean).numpy().tofile(fp) + convert2cpu(bn_model.running_var).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + bn_model.bias.data.numpy().tofile(fp) + bn_model.weight.data.numpy().tofile(fp) + bn_model.running_mean.numpy().tofile(fp) + bn_model.running_var.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_fc(buf, start, fc_model): + num_w = fc_model.weight.numel() + num_b = fc_model.bias.numel() + fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); + start = start + num_w + return start + + +def save_fc(fp, fc_model): + fc_model.bias.data.numpy().tofile(fp) + fc_model.weight.data.numpy().tofile(fp) + + +if __name__ == '__main__': + import sys + + blocks = parse_cfg('cfg/yolo.cfg') + if len(sys.argv) == 2: + blocks = parse_cfg(sys.argv[1]) + print_cfg(blocks) diff --git a/detector/YOLOv3/cfg/coco.data b/detector/YOLOv3/cfg/coco.data new file mode 100644 index 0000000..b7e31be --- /dev/null +++ b/detector/YOLOv3/cfg/coco.data @@ -0,0 +1,5 @@ +train = coco_train.txt +valid = coco_test.txt +names = data/coco.names +backup = backup +gpus = 0,1,2,3 diff --git a/detector/YOLOv3/cfg/coco.names b/detector/YOLOv3/cfg/coco.names new file mode 100644 index 0000000..ca76c80 --- /dev/null +++ b/detector/YOLOv3/cfg/coco.names @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/detector/YOLOv3/cfg/darknet19_448.cfg b/detector/YOLOv3/cfg/darknet19_448.cfg new file mode 100644 index 0000000..133c688 --- /dev/null +++ b/detector/YOLOv3/cfg/darknet19_448.cfg @@ -0,0 +1,200 @@ +[net] +batch=128 +subdivisions=4 +height=448 +width=448 +max_crop=512 +channels=3 +momentum=0.9 +decay=0.0005 + +learning_rate=0.001 +policy=poly +power=4 +max_batches=100000 + +angle=7 +hue = .1 +saturation=.75 +exposure=.75 +aspect=.75 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 
+pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +filters=1000 +size=1 +stride=1 +pad=1 +activation=linear + +[avgpool] + +[softmax] +groups=1 + +[cost] +type=sse + diff --git a/detector/YOLOv3/cfg/tiny-yolo-voc.cfg b/detector/YOLOv3/cfg/tiny-yolo-voc.cfg new file mode 100644 index 0000000..ab2c066 --- /dev/null +++ b/detector/YOLOv3/cfg/tiny-yolo-voc.cfg @@ -0,0 +1,134 @@ +[net] +batch=64 +subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +max_batches = 40200 +policy=steps +steps=-1,100,20000,30000 +scales=.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + +[region] +anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 +bias_match=1 +classes=20 +coords=4 +num=5 +softmax=1 +jitter=.2 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git 
a/detector/YOLOv3/cfg/tiny-yolo.cfg b/detector/YOLOv3/cfg/tiny-yolo.cfg new file mode 100644 index 0000000..ac5770e --- /dev/null +++ b/detector/YOLOv3/cfg/tiny-yolo.cfg @@ -0,0 +1,140 @@ +[net] +# Training +# batch=64 +# subdivisions=2 +# Testing +batch=1 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=425 +activation=linear + +[region] +anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 +bias_match=1 +classes=80 +coords=4 +num=5 +softmax=1 +jitter=.2 +rescore=0 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 + diff --git a/detector/YOLOv3/cfg/voc.data b/detector/YOLOv3/cfg/voc.data new file mode 100644 index 0000000..3329357 --- /dev/null +++ b/detector/YOLOv3/cfg/voc.data @@ -0,0 +1,5 @@ +train = data/voc_train.txt +valid = data/2007_test.txt +names = data/voc.names +backup = backup +gpus = 3 diff --git a/detector/YOLOv3/cfg/voc.names b/detector/YOLOv3/cfg/voc.names new file mode 100644 index 0000000..8420ab3 --- /dev/null +++ b/detector/YOLOv3/cfg/voc.names @@ -0,0 +1,20 @@ +aeroplane +bicycle +bird +boat +bottle +bus +car +cat +chair +cow +diningtable +dog +horse +motorbike +person +pottedplant +sheep +sofa +train +tvmonitor diff --git a/detector/YOLOv3/cfg/voc_gaotie.data b/detector/YOLOv3/cfg/voc_gaotie.data new file mode 100644 index 0000000..66495ec --- /dev/null +++ b/detector/YOLOv3/cfg/voc_gaotie.data @@ -0,0 +1,5 @@ +train = data/gaotie_trainval.txt +valid = data/gaotie_test.txt +names = data/voc.names +backup = backup +gpus = 3 \ No newline at end of file diff --git a/detector/YOLOv3/cfg/yolo-voc.cfg b/detector/YOLOv3/cfg/yolo-voc.cfg new file mode 100644 index 0000000..d5bdfc1 --- /dev/null +++ b/detector/YOLOv3/cfg/yolo-voc.cfg @@ -0,0 +1,258 @@ +[net] +# Testing +batch=64 +subdivisions=8 +# Training +# batch=64 +# subdivisions=8 +height=416 +width=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 80200 +policy=steps +steps=-1,500,40000,60000 +scales=0.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 
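+# Each 2x2/2 maxpool halves the feature map; the five of them in this
+# backbone take the 416x416 input down to the 13x13 [region] grid.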
+ +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + +[reorg] +stride=2 + +[route] +layers=-1,-4 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + + +[region] +anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 +bias_match=1 +classes=20 +coords=4 +num=5 +softmax=1 +jitter=.3 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/detector/YOLOv3/cfg/yolo.cfg b/detector/YOLOv3/cfg/yolo.cfg new file mode 100644 index 0000000..2a0cd98 --- /dev/null +++ b/detector/YOLOv3/cfg/yolo.cfg @@ -0,0 +1,258 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + +[reorg] +stride=2 + +[route] +layers=-1,-4 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=425 +activation=linear + + +[region] +anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 +bias_match=1 +classes=80 +coords=4 +num=5 +softmax=1 +jitter=.3 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=1 diff --git a/detector/YOLOv3/cfg/yolo_v3.cfg b/detector/YOLOv3/cfg/yolo_v3.cfg new file mode 100644 index 0000000..f6a3d22 --- /dev/null +++ b/detector/YOLOv3/cfg/yolo_v3.cfg @@ -0,0 +1,789 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=16 +subdivisions=4 +width=608 +height=608 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=20,25 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + 
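+# YOLOv3 has no pooling layers; every downsampling step is a stride-2
+# 3x3 convolution, and each residual unit that follows is a 1x1 -> 3x3
+# pair merged back via [shortcut].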
+[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 
+stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +random=1 + diff --git a/detector/YOLOv3/cfg/yolov3-tiny.cfg b/detector/YOLOv3/cfg/yolov3-tiny.cfg new file mode 100644 index 0000000..cfca3cf --- /dev/null +++ b/detector/YOLOv3/cfg/yolov3-tiny.cfg @@ -0,0 +1,182 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +# batch=64 +# subdivisions=2 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 8 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +[yolo] +mask = 0,1,2 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/detector/YOLOv3/cfg/yolov4-tiny.cfg b/detector/YOLOv3/cfg/yolov4-tiny.cfg new file mode 100644 index 0000000..dc6f5bf --- /dev/null +++ b/detector/YOLOv3/cfg/yolov4-tiny.cfg @@ -0,0 +1,281 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.00261 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 
+stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +################################## + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +scale_x_y = 1.05 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +ignore_thresh = .7 +truth_thresh = 1 +random=0 +resize=1.5 +nms_kind=greedynms +beta_nms=0.6 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 23 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +scale_x_y = 1.05 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +ignore_thresh = .7 +truth_thresh = 1 +random=0 +resize=1.5 +nms_kind=greedynms +beta_nms=0.6 diff --git a/detector/YOLOv3/cfg/yolov4.cfg b/detector/YOLOv3/cfg/yolov4.cfg new file mode 100644 index 0000000..2a1d171 --- /dev/null +++ b/detector/YOLOv3/cfg/yolov4.cfg @@ -0,0 +1,1157 @@ +[net] +batch=64 +subdivisions=8 +# Training +#width=512 +#height=512 +width=416 +height=416 +channels=3 +momentum=0.949 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0013 +burn_in=1000 +max_batches = 500500 +policy=steps +steps=400000,450000 +scales=.1,.1 + +#cutmix=1 +mosaic=1 + +#:104x104 54:52x52 85:26x26 104:13x13 for 416 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 
+stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-7 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-10 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + 
+[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 
+stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-16 + +[convolutional] +batch_normalize=1 +filters=1024 +size=1 +stride=1 +pad=1 +activation=mish + +########################## + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +### SPP ### +[maxpool] +stride=1 +size=5 + +[route] +layers=-2 + +[maxpool] +stride=1 +size=9 + +[route] +layers=-4 + +[maxpool] +stride=1 +size=13 + +[route] +layers=-1,-3,-5,-6 +### End SPP ### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 85 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 54 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +########################## + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.2 +iou_thresh=0.213 +cls_normalizer=1.0 
+iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=256 +activation=leaky + +[route] +layers = -1, -16 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.1 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=512 +activation=leaky + +[route] +layers = -1, -37 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 +scale_x_y = 1.05 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 diff --git a/detector/YOLOv3/darknet.py b/detector/YOLOv3/darknet.py new file mode 100644 index 0000000..9cef048 --- /dev/null +++ b/detector/YOLOv3/darknet.py @@ -0,0 +1,453 @@ +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from .cfg import * +from .region_layer import RegionLayer +from .yolo_layer import YoloLayer + +class MaxPoolStride1(nn.Module): + def __init__(self): + super(MaxPoolStride1, self).__init__() + + def forward(self, x): + x = F.max_pool2d(F.pad(x, (0,1,0,1), mode='replicate'), 2, stride=1) + return x + +class Upsample(nn.Module): + def __init__(self, stride=2): + super(Upsample, self).__init__() + self.stride = stride + def forward(self, x): + stride = self.stride + assert(x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + ws = stride + hs = stride + x = x.view(B, C, H, 1, W, 1).expand(B, C, H, hs, W, ws).contiguous().view(B, C, H*hs, W*ws) + return x + +class Reorg(nn.Module): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + def forward(self, x): + stride = self.stride + assert(x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = 
x.data.size(3)
+        assert(H % stride == 0)
+        assert(W % stride == 0)
+        ws = stride
+        hs = stride
+        x = x.view(B, C, H//hs, hs, W//ws, ws).transpose(3,4).contiguous()
+        x = x.view(B, C, (H//hs)*(W//ws), hs*ws).transpose(2,3).contiguous()
+        x = x.view(B, C, hs*ws, H//hs, W//ws).transpose(1,2).contiguous()
+        x = x.view(B, hs*ws*C, H//hs, W//ws)
+        return x
+
+class GlobalAvgPool2d(nn.Module):
+    def __init__(self):
+        super(GlobalAvgPool2d, self).__init__()
+
+    def forward(self, x):
+        N = x.data.size(0)
+        C = x.data.size(1)
+        H = x.data.size(2)
+        W = x.data.size(3)
+        x = F.avg_pool2d(x, (H, W))
+        x = x.view(N, C)
+        return x
+
+# for route and shortcut
+class EmptyModule(nn.Module):
+    def __init__(self):
+        super(EmptyModule, self).__init__()
+
+    def forward(self, x):
+        return x
+
+class Mish(nn.Module):
+    def __init__(self):
+        super(Mish, self).__init__()
+
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+# supports route, shortcut and reorg
+class Darknet(nn.Module):
+    def getLossLayers(self):
+        loss_layers = []
+        for m in self.models:
+            if isinstance(m, RegionLayer) or isinstance(m, YoloLayer):
+                loss_layers.append(m)
+        return loss_layers
+
+    def __init__(self, cfgfile, use_cuda=True):
+        super(Darknet, self).__init__()
+        self.use_cuda = use_cuda
+        self.blocks = parse_cfg(cfgfile)
+        self.models = self.create_network(self.blocks)  # merges conv, bn, leaky
+        self.loss_layers = self.getLossLayers()
+
+        #self.width = int(self.blocks[0]['width'])
+        #self.height = int(self.blocks[0]['height'])
+
+        if len(self.loss_layers) > 0:
+            last = len(self.loss_layers)-1
+            self.anchors = self.loss_layers[last].anchors
+            self.num_anchors = self.loss_layers[last].num_anchors
+            self.anchor_step = self.loss_layers[last].anchor_step
+            self.num_classes = self.loss_layers[last].num_classes
+
+        # default format : major=0, minor=1
+        self.header = torch.IntTensor([0,1,0,0])
+        self.seen = 0
+
+    def forward(self, x):
+        ind = -2
+        self.loss_layers = None
+        outputs = dict()
+        out_boxes = dict()
+        outno = 0
+        for block in self.blocks:
+            ind = ind + 1
+
+            if block['type'] == 'net':
+                continue
+            elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']:
+                x = self.models[ind](x)
+                outputs[ind] = x
+            elif block['type'] == 'route':
+                layers = block['layers'].split(',')
+                layers = [int(i) if int(i) > 0 else int(i)+ind for i in layers]
+                if len(layers) == 1:
+                    x = outputs[layers[0]]
+                elif len(layers) == 2:
+                    x1 = outputs[layers[0]]
+                    x2 = outputs[layers[1]]
+                    x = torch.cat((x1, x2), 1)
+                elif len(layers) == 4:
+                    x1 = outputs[layers[0]]
+                    x2 = outputs[layers[1]]
+                    x3 = outputs[layers[2]]
+                    x4 = outputs[layers[3]]
+                    x = torch.cat((x1, x2, x3, x4), 1)  # concatenate along the channel dimension
+                outputs[ind] = x
+            elif block['type'] == 'shortcut':
+                from_layer = int(block['from'])
+                activation = block['activation']
+                from_layer = from_layer if from_layer > 0 else from_layer + ind
+                x1 = outputs[from_layer]
+                x2 = outputs[ind-1]
+                x = x1 + x2
+                if activation == 'leaky':
+                    x = F.leaky_relu(x, 0.1, inplace=True)
+                elif activation == 'relu':
+                    x = F.relu(x, inplace=True)
+                outputs[ind] = x
+            elif block['type'] in ['region', 'yolo']:
+                boxes = self.models[ind].get_mask_boxes(x)
+                out_boxes[outno] = boxes
+                outno += 1
+                outputs[ind] = None
+            elif block['type'] == 'cost':
+                continue
+            else:
+                print('unknown type %s' % (block['type']))
+        return x if outno == 0 else out_boxes
+
+    def print_network(self):
+        print_cfg(self.blocks)
+
+    def create_network(self, blocks):
+        models =
nn.ModuleList()
+
+        prev_filters = 3
+        out_filters = []
+        prev_stride = 1
+        out_strides = []
+        conv_id = 0
+        ind = -2
+        for block in blocks:
+            ind += 1
+            if block['type'] == 'net':
+                prev_filters = int(block['channels'])
+                self.width = int(block['width'])
+                self.height = int(block['height'])
+                continue
+            elif block['type'] == 'convolutional':
+                conv_id = conv_id + 1
+                batch_normalize = int(block['batch_normalize'])
+                filters = int(block['filters'])
+                kernel_size = int(block['size'])
+                stride = int(block['stride'])
+                is_pad = int(block['pad'])
+                pad = (kernel_size-1)//2 if is_pad else 0
+                activation = block['activation']
+                model = nn.Sequential()
+                if batch_normalize:
+                    model.add_module('conv{0}'.format(conv_id), nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False))
+                    model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters))
+                    #model.add_module('bn{0}'.format(conv_id), BN2d(filters))
+                else:
+                    model.add_module('conv{0}'.format(conv_id), nn.Conv2d(prev_filters, filters, kernel_size, stride, pad))
+                if activation == 'leaky':
+                    model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True))
+                elif activation == 'relu':
+                    model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True))
+                elif activation == 'mish':
+                    model.add_module("mish{0}".format(conv_id), Mish())
+                prev_filters = filters
+                out_filters.append(prev_filters)
+                prev_stride = stride * prev_stride
+                out_strides.append(prev_stride)
+                models.append(model)
+            elif block['type'] == 'maxpool':
+                pool_size = int(block['size'])
+                stride = int(block['stride'])
+                if stride > 1:
+                    model = nn.MaxPool2d(pool_size, stride)
+                else:
+                    model = MaxPoolStride1()
+                out_filters.append(prev_filters)
+                prev_stride = stride * prev_stride
+                out_strides.append(prev_stride)
+                models.append(model)
+            elif block['type'] == 'avgpool':
+                model = GlobalAvgPool2d()
+                out_filters.append(prev_filters)
+                models.append(model)
+            elif block['type'] == 'softmax':
+                model = nn.Softmax()
+                out_strides.append(prev_stride)
+                out_filters.append(prev_filters)
+                models.append(model)
+            elif block['type'] == 'cost':
+                if block['_type'] == 'sse':
+                    model = nn.MSELoss(size_average=True)
+                elif block['_type'] == 'L1':
+                    model = nn.L1Loss(size_average=True)
+                elif block['_type'] == 'smooth':
+                    model = nn.SmoothL1Loss(size_average=True)
+                out_filters.append(1)
+                out_strides.append(prev_stride)
+                models.append(model)
+            elif block['type'] == 'reorg':
+                stride = int(block['stride'])
+                prev_filters = stride * stride * prev_filters
+                out_filters.append(prev_filters)
+                prev_stride = prev_stride * stride
+                out_strides.append(prev_stride)
+                models.append(Reorg(stride))
+            elif block['type'] == 'upsample':
+                stride = int(block['stride'])
+                out_filters.append(prev_filters)
+                prev_stride = prev_stride / stride
+                out_strides.append(prev_stride)
+                #models.append(nn.Upsample(scale_factor=stride, mode='nearest'))
+                models.append(Upsample(stride))
+            elif block['type'] == 'route':
+                layers = block['layers'].split(',')
+                ind = len(models)
+                layers = [int(i) if int(i) > 0 else int(i)+ind for i in layers]
+                if len(layers) == 1:
+                    prev_filters = out_filters[layers[0]]
+                    prev_stride = out_strides[layers[0]]
+                elif len(layers) == 2:
+                    assert(layers[0] == ind - 1)
+                    prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
+                    prev_stride = out_strides[layers[0]]
+                elif len(layers) == 4:
+                    prev_filters = out_filters[layers[0]] + \
+                        out_filters[layers[1]] + out_filters[layers[2]] + out_filters[layers[3]]  # channel counts of the routed layers are summed
+                    prev_stride = out_strides[ind-1]
+
out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'shortcut': + ind = len(models) + prev_filters = out_filters[ind-1] + out_filters.append(prev_filters) + prev_stride = out_strides[ind-1] + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'connected': + filters = int(block['output']) + if block['activation'] == 'linear': + model = nn.Linear(prev_filters, filters) + elif block['activation'] == 'leaky': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.LeakyReLU(0.1, inplace=True)) + elif block['activation'] == 'relu': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.ReLU(inplace=True)) + prev_filters = filters + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'region': + region_layer = RegionLayer(use_cuda=self.use_cuda) + anchors = block['anchors'].split(',') + region_layer.anchors = [float(i) for i in anchors] + region_layer.num_classes = int(block['classes']) + region_layer.num_anchors = int(block['num']) + region_layer.anchor_step = len(region_layer.anchors)//region_layer.num_anchors + region_layer.rescore = int(block['rescore']) + region_layer.object_scale = float(block['object_scale']) + region_layer.noobject_scale = float(block['noobject_scale']) + region_layer.class_scale = float(block['class_scale']) + region_layer.coord_scale = float(block['coord_scale']) + region_layer.thresh = float(block['thresh']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(region_layer) + elif block['type'] == 'yolo': + yolo_layer = YoloLayer(use_cuda=self.use_cuda) + anchors = block['anchors'].split(',') + anchor_mask = block['mask'].split(',') + yolo_layer.anchor_mask = [int(i) for i in anchor_mask] + yolo_layer.anchors = [float(i) for i in anchors] + yolo_layer.num_classes = int(block['classes']) + yolo_layer.num_anchors = int(block['num']) + yolo_layer.anchor_step = len(yolo_layer.anchors)//yolo_layer.num_anchors + try: + yolo_layer.rescore = int(block['rescore']) + except: + pass + yolo_layer.ignore_thresh = float(block['ignore_thresh']) + yolo_layer.truth_thresh = float(block['truth_thresh']) + yolo_layer.stride = prev_stride + yolo_layer.nth_layer = ind + yolo_layer.net_width = self.width + yolo_layer.net_height = self.height + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(yolo_layer) + else: + print('unknown type %s' % (block['type'])) + + return models + + def load_binfile(self, weightfile): + fp = open(weightfile, 'rb') + + version = np.fromfile(fp, count=3, dtype=np.int32) + version = [int(i) for i in version] + if version[0]*10+version[1] >=2 and version[0] < 1000 and version[1] < 1000: + seen = np.fromfile(fp, count=1, dtype=np.int64) + else: + seen = np.fromfile(fp, count=1, dtype=np.int32) + self.header = torch.from_numpy(np.concatenate((version, seen), axis=0)) + self.seen = int(seen) + body = np.fromfile(fp, dtype=np.float32) + fp.close() + return body + + def load_weights(self, weightfile): + buf = self.load_binfile(weightfile) + + start = 0 + ind = -2 + for block in self.blocks: + if start >= buf.size: + break + ind = ind + 1 + if block['type'] == 'net': + continue + elif block['type'] == 'convolutional': + model = self.models[ind] + batch_normalize = int(block['batch_normalize']) + if batch_normalize: + start = load_conv_bn(buf, start, model[0], model[1]) + else: + start = load_conv(buf, start, model[0]) + 
elif block['type'] == 'connected':
+                model = self.models[ind]
+                if block['activation'] != 'linear':
+                    start = load_fc(buf, start, model[0])
+                else:
+                    start = load_fc(buf, start, model)
+            elif block['type'] == 'maxpool':
+                pass
+            elif block['type'] == 'reorg':
+                pass
+            elif block['type'] == 'upsample':
+                pass
+            elif block['type'] == 'route':
+                pass
+            elif block['type'] == 'shortcut':
+                pass
+            elif block['type'] == 'region':
+                pass
+            elif block['type'] == 'yolo':
+                pass
+            elif block['type'] == 'avgpool':
+                pass
+            elif block['type'] == 'softmax':
+                pass
+            elif block['type'] == 'cost':
+                pass
+            else:
+                print('unknown type %s' % (block['type']))
+
+    def save_weights(self, outfile, cutoff=0):
+        if cutoff <= 0:
+            cutoff = len(self.blocks)-1
+
+        fp = open(outfile, 'wb')
+        self.header[3] = self.seen
+        header = np.array(self.header[0:3].numpy(), np.int32)
+        header.tofile(fp)
+        if (self.header[0]*10+self.header[1]) >= 2:
+            seen = np.array(self.seen, np.int64)
+        else:
+            seen = np.array(self.seen, np.int32)
+        seen.tofile(fp)
+
+        ind = -1
+        for blockId in range(1, cutoff+1):
+            ind = ind + 1
+            block = self.blocks[blockId]
+            if block['type'] == 'convolutional':
+                model = self.models[ind]
+                batch_normalize = int(block['batch_normalize'])
+                if batch_normalize:
+                    save_conv_bn(fp, model[0], model[1])
+                else:
+                    save_conv(fp, model[0])
+            elif block['type'] == 'connected':
+                model = self.models[ind]
+                if block['activation'] != 'linear':
+                    save_fc(fp, model[0])
+                else:
+                    save_fc(fp, model)
+            elif block['type'] == 'maxpool':
+                pass
+            elif block['type'] == 'reorg':
+                pass
+            elif block['type'] == 'upsample':
+                pass
+            elif block['type'] == 'route':
+                pass
+            elif block['type'] == 'shortcut':
+                pass
+            elif block['type'] == 'region':
+                pass
+            elif block['type'] == 'yolo':
+                pass
+            elif block['type'] == 'avgpool':
+                pass
+            elif block['type'] == 'softmax':
+                pass
+            elif block['type'] == 'cost':
+                pass
+            else:
+                print('unknown type %s' % (block['type']))
+        fp.close()
diff --git a/detector/YOLOv3/demo/004545.jpg b/detector/YOLOv3/demo/004545.jpg
new file mode 100644
index 0000000..4e06c20
Binary files /dev/null and b/detector/YOLOv3/demo/004545.jpg differ
diff --git a/detector/YOLOv3/demo/results/004545.jpg b/detector/YOLOv3/demo/results/004545.jpg
new file mode 100644
index 0000000..4f8f75b
Binary files /dev/null and b/detector/YOLOv3/demo/results/004545.jpg differ
diff --git a/detector/YOLOv3/detect.py b/detector/YOLOv3/detect.py
new file mode 100644
index 0000000..9a091a3
--- /dev/null
+++ b/detector/YOLOv3/detect.py
@@ -0,0 +1,133 @@
+import os
+import sys
+import time
+import numpy as np
+from PIL import Image, ImageDraw
+#from models.tiny_yolo import TinyYoloNet
+from yolo_utils import *
+from darknet import Darknet
+
+import cv2
+
+namesfile = None
+def detect(cfgfile, weightfile, imgfolder):
+    m = Darknet(cfgfile)
+
+    #m.print_network()
+    m.load_weights(weightfile)
+    print('Loading weights from %s... Done!' % (weightfile))
+
+    # if m.num_classes == 20:
+    #     namesfile = 'data/voc.names'
+    # elif m.num_classes == 80:
+    #     namesfile = 'data/coco.names'
+    # else:
+    #     namesfile = 'data/names'
+
+    use_cuda = True
+    if use_cuda:
+        m.cuda()
+
+    imgfiles = [x for x in os.listdir(imgfolder) if x[-4:] == '.jpg']
+    imgfiles.sort()
+    for imgname in imgfiles:
+        imgfile = os.path.join(imgfolder, imgname)
+
+        img = Image.open(imgfile).convert('RGB')
+        sized = img.resize((m.width, m.height))
+
+        #for i in range(2):
+        start = time.time()
+        boxes = do_detect(m, sized, 0.5, 0.4, use_cuda)
+        finish = time.time()
+        #if i == 1:
+        print('%s: Predicted in %f seconds.'
% (imgfile, (finish-start))) + + class_names = load_class_names(namesfile) + img = plot_boxes(img, boxes, 'result/{}'.format(os.path.basename(imgfile)), class_names) + img = np.array(img) + cv2.imshow('{}'.format(os.path.basename(imgfolder)), img) + cv2.resizeWindow('{}'.format(os.path.basename(imgfolder)), 1000,800) + cv2.waitKey(1000) + +def detect_cv2(cfgfile, weightfile, imgfile): + import cv2 + m = Darknet(cfgfile) + + m.print_network() + m.load_weights(weightfile) + print('Loading weights from %s... Done!' % (weightfile)) + + if m.num_classes == 20: + namesfile = 'data/voc.names' + elif m.num_classes == 80: + namesfile = 'data/coco.names' + else: + namesfile = 'data/names' + + use_cuda = True + if use_cuda: + m.cuda() + + img = cv2.imread(imgfile) + sized = cv2.resize(img, (m.width, m.height)) + sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) + + for i in range(2): + start = time.time() + boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) + finish = time.time() + if i == 1: + print('%s: Predicted in %f seconds.' % (imgfile, (finish-start))) + + class_names = load_class_names(namesfile) + plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) + +def detect_skimage(cfgfile, weightfile, imgfile): + from skimage import io + from skimage.transform import resize + m = Darknet(cfgfile) + + m.print_network() + m.load_weights(weightfile) + print('Loading weights from %s... Done!' % (weightfile)) + + if m.num_classes == 20: + namesfile = 'data/voc.names' + elif m.num_classes == 80: + namesfile = 'data/coco.names' + else: + namesfile = 'data/names' + + use_cuda = True + if use_cuda: + m.cuda() + + img = io.imread(imgfile) + sized = resize(img, (m.width, m.height)) * 255 + + for i in range(2): + start = time.time() + boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) + finish = time.time() + if i == 1: + print('%s: Predicted in %f seconds.' % (imgfile, (finish-start))) + + class_names = load_class_names(namesfile) + plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) + +if __name__ == '__main__': + if len(sys.argv) == 5: + cfgfile = sys.argv[1] + weightfile = sys.argv[2] + imgfolder = sys.argv[3] + cv2.namedWindow('{}'.format(os.path.basename(imgfolder)), cv2.WINDOW_NORMAL ) + cv2.resizeWindow('{}'.format(os.path.basename(imgfolder)), 1000,800) + globals()["namesfile"] = sys.argv[4] + detect(cfgfile, weightfile, imgfolder) + #detect_cv2(cfgfile, weightfile, imgfile) + #detect_skimage(cfgfile, weightfile, imgfile) + else: + print('Usage: ') + print(' python detect.py cfgfile weightfile imgfolder names') + #detect('cfg/tiny-yolo-voc.cfg', 'tiny-yolo-voc.weights', 'data/person.jpg', version=1) diff --git a/detector/YOLOv3/detector.py b/detector/YOLOv3/detector.py new file mode 100644 index 0000000..1dcbb93 --- /dev/null +++ b/detector/YOLOv3/detector.py @@ -0,0 +1,107 @@ +import torch +import logging +import numpy as np +import cv2 + +from .darknet import Darknet +from .yolo_utils import get_all_boxes, nms, post_process, xywh_to_xyxy, xyxy_to_xywh +from .nms import boxes_nms +import time + +class YOLOv3(object): + def __init__(self, cfgfile, weightfile, namesfile, score_thresh=0.7, conf_thresh=0.01, nms_thresh=0.45, + is_xywh=False, use_cuda=True): + # net definition + self.net = Darknet(cfgfile) + self.net.load_weights(weightfile) + logger = logging.getLogger("root.detector") + logger.info('Loading weights from %s... Done!' 
% (weightfile))
+        self.device = "cuda" if use_cuda else "cpu"
+        self.net.eval()
+
+        self.net.to(self.device)
+
+        # constants
+        self.size = self.net.width, self.net.height
+        self.score_thresh = score_thresh
+        self.conf_thresh = conf_thresh
+        self.nms_thresh = nms_thresh
+        self.use_cuda = use_cuda
+        self.is_xywh = is_xywh
+        self.num_classes = self.net.num_classes
+        self.class_names = self.load_class_names(namesfile)
+
+    def __call__(self, ori_img):
+        # img to tensor
+        assert isinstance(ori_img, np.ndarray), "input must be a numpy array!"
+        img = ori_img.astype(float) / 255.
+
+        img = cv2.resize(img, self.size)
+
+        img = torch.from_numpy(img).float().permute(2, 0, 1).unsqueeze(0)  # the incoming channel order (BGR if fed straight from OpenCV) is kept as-is for inference
+
+        # forward
+        with torch.no_grad():
+            img = img.to(self.device)
+            # t5 = time.time()
+            out_boxes = self.net(img)
+            # t6 = time.time()
+            # print(' -------------infer----------------: %f' % (t5 - t6))
+            boxes = get_all_boxes(out_boxes, self.conf_thresh, self.num_classes,
+                                  use_cuda=self.use_cuda)  # batch size is 1
+            # boxes = nms(boxes, self.nms_thresh)
+
+            boxes = post_process(boxes, self.net.num_classes, self.conf_thresh, self.nms_thresh)[0].cpu()
+            boxes = boxes[boxes[:, -2] > self.score_thresh, :]  # bbox xmin ymin xmax ymax
+
+        if len(boxes) == 0:
+            bbox = torch.FloatTensor([]).reshape([0, 4])
+            cls_conf = torch.FloatTensor([])
+            cls_ids = torch.LongTensor([])
+        else:
+            height, width = ori_img.shape[:2]
+            bbox = boxes[:, :4]
+            if self.is_xywh:
+                # bbox x y w h
+                bbox = xyxy_to_xywh(bbox)
+
+            bbox *= torch.FloatTensor([[width, height, width, height]])
+            cls_conf = boxes[:, 5]
+            cls_ids = boxes[:, 6].long()
+        return bbox.numpy(), cls_conf.numpy(), cls_ids.numpy()
+
+    def load_class_names(self, namesfile):
+        with open(namesfile, 'r', encoding='utf8') as fp:
+            class_names = [line.strip() for line in fp.readlines()]
+        return class_names
+
+
+def demo():
+    import os
+    from vizer.draw import draw_boxes
+
+    yolo = YOLOv3("cfg/yolo_v3.cfg", "weight/yolov3.weights", "cfg/coco.names")
+    print("yolo.size =", yolo.size)
+    root = "./demo"
+    resdir = os.path.join(root, "results")
+    os.makedirs(resdir, exist_ok=True)
+    files = [os.path.join(root, file) for file in os.listdir(root) if file.endswith('.jpg')]
+    files.sort()
+    for filename in files:
+        img = cv2.imread(filename)
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        bbox, cls_conf, cls_ids = yolo(img)
+
+        if bbox is not None:
+            img = draw_boxes(img, bbox, cls_ids, cls_conf, class_name_map=yolo.class_names)
+        # save results
+        cv2.imwrite(os.path.join(resdir, os.path.basename(filename)), img[:, :, (2, 1, 0)])
+        # imshow
+        # cv2.namedWindow("yolo", cv2.WINDOW_NORMAL)
+        # cv2.resizeWindow("yolo", 600,600)
+        # cv2.imshow("yolo", res[:,:,(2,1,0)])
+        # cv2.waitKey(0)
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/detector/YOLOv3/nms/__init__.py b/detector/YOLOv3/nms/__init__.py
new file mode 100644
index 0000000..4da7007
--- /dev/null
+++ b/detector/YOLOv3/nms/__init__.py
@@ -0,0 +1 @@
+from .nms import boxes_nms
\ No newline at end of file
diff --git a/detector/YOLOv3/nms/build.sh b/detector/YOLOv3/nms/build.sh
new file mode 100644
index 0000000..44766a2
--- /dev/null
+++ b/detector/YOLOv3/nms/build.sh
@@ -0,0 +1,5 @@
+cd ext
+
+python build.py build_ext develop
+
+cd ..
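Once the optional C++ extension is compiled with build.sh above (or torchvision's NMS is picked up as the fallback), the YOLOv3 wrapper in detector/YOLOv3/detector.py can be driven directly. A minimal sketch, assuming the config/weight/name paths from configs/yolov3.yaml and the demo image shipped in the repo:

    import cv2
    from detector.YOLOv3.detector import YOLOv3

    detector = YOLOv3("./detector/YOLOv3/cfg/yolov4.cfg",
                      "./detector/YOLOv3/weight/yolov4.weights",
                      "./detector/YOLOv3/cfg/coco.names",
                      score_thresh=0.5, use_cuda=False)

    # demo() above feeds RGB, so convert the BGR frame OpenCV returns
    img = cv2.cvtColor(cv2.imread("./detector/YOLOv3/demo/004545.jpg"), cv2.COLOR_BGR2RGB)
    bbox, cls_conf, cls_ids = detector(img)  # xyxy boxes in absolute pixels
    print(bbox.shape, cls_conf, [detector.class_names[i] for i in cls_ids])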
diff --git a/detector/YOLOv3/nms/ext/__init__.py b/detector/YOLOv3/nms/ext/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/detector/YOLOv3/nms/ext/build.py b/detector/YOLOv3/nms/ext/build.py new file mode 100644 index 0000000..66973bc --- /dev/null +++ b/detector/YOLOv3/nms/ext/build.py @@ -0,0 +1,58 @@ +import glob +import os + +import torch +from setuptools import setup +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +requirements = ["torch"] + + +def get_extensions(): + extensions_dir = os.path.dirname(os.path.abspath(__file__)) + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + sources = [os.path.join(extensions_dir, s) for s in sources] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "torch_extension", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +setup( + name="torch_extension", + version="0.1", + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}) diff --git a/detector/YOLOv3/nms/ext/cpu/nms_cpu.cpp b/detector/YOLOv3/nms/ext/cpu/nms_cpu.cpp new file mode 100644 index 0000000..5b3f93c --- /dev/null +++ b/detector/YOLOv3/nms/ext/cpu/nms_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
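+//
+// CPU reference NMS: detections are visited in descending score order and each
+// survivor suppresses every later box whose IoU with it reaches the threshold,
+// so the kernel below is O(n^2) in the number of detections.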
+#include "cpu/vision.h" + + +template +at::Tensor nms_cpu_kernel(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); + AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + } + + auto x1_t = dets.select(1, 0).contiguous(); + auto y1_t = dets.select(1, 1).contiguous(); + auto x2_t = dets.select(1, 2).contiguous(); + auto y2_t = dets.select(1, 3).contiguous(); + + at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); + + auto suppressed = suppressed_t.data(); + auto order = order_t.data(); + auto x1 = x1_t.data(); + auto y1 = y1_t.data(); + auto x2 = x2_t.data(); + auto y2 = y2_t.data(); + auto areas = areas_t.data(); + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) + continue; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) + continue; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(static_cast(0), xx2 - xx1); + auto h = std::max(static_cast(0), yy2 - yy1); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr >= threshold) + suppressed[j] = 1; + } + } + return at::nonzero(suppressed_t == 0).squeeze(1); +} + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + at::Tensor result; + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { + result = nms_cpu_kernel(dets, scores, threshold); + }); + return result; +} \ No newline at end of file diff --git a/detector/YOLOv3/nms/ext/cpu/vision.h b/detector/YOLOv3/nms/ext/cpu/vision.h new file mode 100644 index 0000000..b3529ad --- /dev/null +++ b/detector/YOLOv3/nms/ext/cpu/vision.h @@ -0,0 +1,7 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold); diff --git a/detector/YOLOv3/nms/ext/cuda/nms.cu b/detector/YOLOv3/nms/ext/cuda/nms.cu new file mode 100644 index 0000000..2eb4525 --- /dev/null +++ b/detector/YOLOv3/nms/ext/cuda/nms.cu @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
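+//
+// GPU NMS via a 64-bit suppression mask: threadsPerBlock equals the bit width
+// of unsigned long long, the grid tiles the (score-sorted) boxes into 64x64
+// comparison blocks, and each thread sets a bit for every box in its column
+// tile that overlaps its row box above the threshold; the host then scans the
+// mask to collect the surviving indices.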
+#include +#include + +#include +#include + +#include +#include + +int const threadsPerBlock = sizeof(unsigned long long) * 8; + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left, 0.f), height = max(bottom - top, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0]) * (a[3] - a[1]); + float Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +// boxes is a N x 5 tensor +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { + using scalar_t = float; + AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); + auto scores = boxes.select(1, 4); + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + auto boxes_sorted = boxes.index_select(0, order_t); + + int boxes_num = boxes.size(0); + + const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); + + scalar_t* boxes_dev = boxes_sorted.data(); + + THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState + + unsigned long long* mask_dev = NULL; + //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, + // boxes_num * col_blocks * sizeof(unsigned long long))); + + mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); + + dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), + THCCeilDiv(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + + std::vector mask_host(boxes_num * col_blocks); + THCudaCheck(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep = at::empty({boxes_num}, 
boxes.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data(); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + THCudaFree(state, mask_dev); + // TODO improve this part + return std::get<0>(order_t.index({ + keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( + order_t.device(), keep.scalar_type()) + }).sort(0, false)); +} \ No newline at end of file diff --git a/detector/YOLOv3/nms/ext/cuda/vision.h b/detector/YOLOv3/nms/ext/cuda/vision.h new file mode 100644 index 0000000..b5bd907 --- /dev/null +++ b/detector/YOLOv3/nms/ext/cuda/vision.h @@ -0,0 +1,7 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); + + diff --git a/detector/YOLOv3/nms/ext/nms.h b/detector/YOLOv3/nms/ext/nms.h new file mode 100644 index 0000000..312fed4 --- /dev/null +++ b/detector/YOLOv3/nms/ext/nms.h @@ -0,0 +1,28 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +at::Tensor nms(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + + if (dets.type().is_cuda()) { +#ifdef WITH_CUDA + // TODO raise error if not compiled with CUDA + if (dets.numel() == 0) + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + auto b = at::cat({dets, scores.unsqueeze(1)}, 1); + return nms_cuda(b, threshold); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + at::Tensor result = nms_cpu(dets, scores, threshold); + return result; +} diff --git a/detector/YOLOv3/nms/ext/vision.cpp b/detector/YOLOv3/nms/ext/vision.cpp new file mode 100644 index 0000000..726b77b --- /dev/null +++ b/detector/YOLOv3/nms/ext/vision.cpp @@ -0,0 +1,7 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "nms.h" + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("nms", &nms, "non-maximum suppression"); +} diff --git a/detector/YOLOv3/nms/nms.py b/detector/YOLOv3/nms/nms.py new file mode 100644 index 0000000..1b4a2db --- /dev/null +++ b/detector/YOLOv3/nms/nms.py @@ -0,0 +1,34 @@ +import warnings +import torchvision + +try: + import torch + import torch_extension + + _nms = torch_extension.nms +except ImportError: + if torchvision.__version__ >= '0.3.0': + _nms = torchvision.ops.nms + else: + from .python_nms import python_nms + + _nms = python_nms + warnings.warn('You are using python version NMS, which is very very slow. Try compile c++ NMS ' + 'using `cd ext & python build.py build_ext develop`') + + +def boxes_nms(boxes, scores, nms_thresh, max_count=-1): + """ Performs non-maximum suppression, run on GPU or CPU according to + boxes's device. + Args: + boxes(Tensor): `xyxy` mode boxes, use absolute coordinates(or relative coordinates), shape is (n, 4) + scores(Tensor): scores, shape is (n, ) + nms_thresh(float): thresh + max_count (int): if > 0, then only the top max_proposals are kept after non-maximum suppression + Returns: + indices kept. 
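+    Example (illustrative values only):
+        >>> boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.]])
+        >>> scores = torch.tensor([0.9, 0.8])
+        >>> boxes_nms(boxes, scores, nms_thresh=0.5)
+        tensor([0])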
+ """ + keep = _nms(boxes, scores, nms_thresh) + if max_count > 0: + keep = keep[:max_count] + return keep diff --git a/detector/YOLOv3/nms/python_nms.py b/detector/YOLOv3/nms/python_nms.py new file mode 100644 index 0000000..bd8a4ba --- /dev/null +++ b/detector/YOLOv3/nms/python_nms.py @@ -0,0 +1,59 @@ +import torch +import numpy as np + + +def python_nms(boxes, scores, nms_thresh): + """ Performs non-maximum suppression using numpy + Args: + boxes(Tensor): `xyxy` mode boxes, use absolute coordinates(not support relative coordinates), + shape is (n, 4) + scores(Tensor): scores, shape is (n, ) + nms_thresh(float): thresh + Returns: + indices kept. + """ + if boxes.numel() == 0: + return torch.empty((0,), dtype=torch.long) + # Use numpy to run nms. Running nms in PyTorch code on CPU is really slow. + origin_device = boxes.device + cpu_device = torch.device('cpu') + boxes = boxes.to(cpu_device).numpy() + scores = scores.to(cpu_device).numpy() + + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + areas = (x2 - x1) * (y2 - y1) + order = np.argsort(scores)[::-1] + num_detections = boxes.shape[0] + suppressed = np.zeros((num_detections,), dtype=np.bool) + for _i in range(num_detections): + i = order[_i] + if suppressed[i]: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + + for _j in range(_i + 1, num_detections): + j = order[_j] + if suppressed[j]: + continue + + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0, xx2 - xx1) + h = max(0, yy2 - yy1) + + inter = w * h + ovr = inter / (iarea + areas[j] - inter) + if ovr >= nms_thresh: + suppressed[j] = True + keep = np.nonzero(suppressed == 0)[0] + keep = torch.from_numpy(keep).to(origin_device) + return keep diff --git a/detector/YOLOv3/region_layer.py b/detector/YOLOv3/region_layer.py new file mode 100644 index 0000000..c55ef37 --- /dev/null +++ b/detector/YOLOv3/region_layer.py @@ -0,0 +1,185 @@ +import math +import sys +import time +import torch +import torch.nn as nn +from .yolo_utils import bbox_iou, multi_bbox_ious, convert2cpu + + +class RegionLayer(nn.Module): + def __init__(self, num_classes=0, anchors=[], num_anchors=1, use_cuda=None): + super(RegionLayer, self).__init__() + use_cuda = torch.cuda.is_available() and (True if use_cuda is None else use_cuda) + self.device = torch.device("cuda" if use_cuda else "cpu") + self.num_classes = num_classes + self.num_anchors = num_anchors + self.anchor_step = len(anchors) // num_anchors + # self.anchors = torch.stack(torch.FloatTensor(anchors).split(self.anchor_step)).to(self.device) + self.anchors = torch.FloatTensor(anchors).view(self.num_anchors, self.anchor_step).to(self.device) + self.rescore = 1 + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + self.thresh = 0.6 + self.seen = 0 + + def build_targets(self, pred_boxes, target, nH, nW): + nB = target.size(0) + nA = self.num_anchors + conf_mask = torch.ones(nB, nA, nH, nW) * self.noobject_scale + coord_mask = torch.zeros(nB, nA, nH, nW) + cls_mask = torch.zeros(nB, nA, nH, nW) + tcoord = torch.zeros(4, nB, nA, nH, nW) + tconf = torch.zeros(nB, nA, nH, nW) + tcls = torch.zeros(nB, nA, nH, nW) + + nAnchors = nA * nH * nW + nPixels = nH * nW + nGT = 0 # number of ground truth + nRecall = 0 + # it works faster on CPU than on GPU. 
+ anchors = self.anchors.to("cpu") + + if self.seen < 12800: + tcoord[0].fill_(0.5) + tcoord[1].fill_(0.5) + coord_mask.fill_(1) + + for b in range(nB): + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + cur_ious = torch.zeros(nAnchors) + tbox = target[b].view(-1, 5).to("cpu") + for t in range(50): + if tbox[t][1] == 0: + break + gx, gw = [i * nW for i in (tbox[t][1], tbox[t][3])] + gy, gh = [i * nH for i in (tbox[t][2], tbox[t][4])] + cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() + cur_ious = torch.max(cur_ious, multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) + ignore_ix = cur_ious > self.thresh + conf_mask[b][ignore_ix.view(nA, nH, nW)] = 0 + + for t in range(50): + if tbox[t][1] == 0: + break + nGT += 1 + gx, gw = [i * nW for i in (tbox[t][1], tbox[t][3])] + gy, gh = [i * nH for i in (tbox[t][2], tbox[t][4])] + gw, gh = gw.float(), gh.float() + gi, gj = int(gx), int(gy) + + tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA, 1).t() + anchor_boxes = torch.cat((torch.zeros(nA, 2), anchors), 1).t() + tmp_ious = multi_bbox_ious(tmp_gt_boxes, anchor_boxes, x1y1x2y2=False) + best_iou, best_n = torch.max(tmp_ious, 0) + + if self.anchor_step == 4: # this part is not tested. + tmp_ious_mask = (tmp_ious == best_iou) + if tmp_ious_mask.sum() > 0: + gt_pos = torch.FloatTensor([gi, gj, gx, gy]).repeat(nA, 1).t() + an_pos = anchor_boxes[4:6] # anchor_boxes are consisted of [0 0 aw ah ax ay] + dist = pow(((gt_pos[0] + an_pos[0]) - gt_pos[2]), 2) + pow( + ((gt_pos[1] + an_pos[1]) - gt_pos[3]), 2) + dist[1 - tmp_ious_mask] = 10000 # set the large number for the small ious + _, best_n = torch.min(dist, 0) + + gt_box = torch.FloatTensor([gx, gy, gw, gh]) + pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) + + coord_mask[b][best_n][gj][gi] = 1 + cls_mask[b][best_n][gj][gi] = 1 + conf_mask[b][best_n][gj][gi] = self.object_scale + tcoord[0][b][best_n][gj][gi] = gx - gi + tcoord[1][b][best_n][gj][gi] = gy - gj + tcoord[2][b][best_n][gj][gi] = math.log(gw / anchors[best_n][0]) + tcoord[3][b][best_n][gj][gi] = math.log(gh / anchors[best_n][1]) + tcls[b][best_n][gj][gi] = tbox[t][0] + tconf[b][best_n][gj][gi] = iou if self.rescore else 1. 
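+                # confidence target: the live IoU when rescoring, otherwise a
+                # hard 1; matches with IoU above 0.5 feed the recall stat below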
+ if iou > 0.5: + nRecall += 1 + + return nGT, nRecall, coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls + + def get_mask_boxes(self, output): + if not isinstance(self.anchors, torch.Tensor): + self.anchors = torch.FloatTensor(self.anchors).view(self.num_anchors, self.anchor_step).to(self.device) + masked_anchors = self.anchors.view(-1) + num_anchors = torch.IntTensor([self.num_anchors]).to(self.device) + return {'x': output, 'a': masked_anchors, 'n': num_anchors} + + def forward(self, output, target): + # output : BxAs*(4+1+num_classes)*H*W + t0 = time.time() + nB = output.data.size(0) # batch size + nA = self.num_anchors + nC = self.num_classes + nH = output.data.size(2) + nW = output.data.size(3) + cls_anchor_dim = nB * nA * nH * nW + + if not isinstance(self.anchors, torch.Tensor): + self.anchors = torch.FloatTensor(self.anchors).view(self.num_anchors, self.anchor_step).to(self.device) + + output = output.view(nB, nA, (5 + nC), nH, nW) + cls_grid = torch.linspace(5, 5 + nC - 1, nC).long().to(self.device) + ix = torch.LongTensor(range(0, 5)).to(self.device) + pred_boxes = torch.FloatTensor(4, cls_anchor_dim).to(self.device) + + coord = output.index_select(2, ix[0:4]).view(nB * nA, -1, nH * nW).transpose(0, 1).contiguous().view(-1, + cls_anchor_dim) # x, y, w, h + coord[0:2] = coord[0:2].sigmoid() # x, y + conf = output.index_select(2, ix[4]).view(nB, nA, nH, nW).sigmoid() + cls = output.index_select(2, cls_grid) + cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(cls_anchor_dim, nC) + + t1 = time.time() + grid_x = torch.linspace(0, nW - 1, nW).repeat(nB * nA, nH, 1).view(cls_anchor_dim).to(self.device) + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(cls_anchor_dim).to( + self.device) + anchor_w = self.anchors.index_select(1, ix[0]).repeat(1, nB * nH * nW).view(cls_anchor_dim) + anchor_h = self.anchors.index_select(1, ix[1]).repeat(1, nB * nH * nW).view(cls_anchor_dim) + + pred_boxes[0] = coord[0] + grid_x + pred_boxes[1] = coord[1] + grid_y + pred_boxes[2] = coord[2].exp() * anchor_w + pred_boxes[3] = coord[3].exp() * anchor_h + # for build_targets. it works faster on CPU than on GPU + pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)).detach() + + t2 = time.time() + nGT, nRecall, coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls = \ + self.build_targets(pred_boxes, target.detach(), nH, nW) + + cls_mask = (cls_mask == 1) + tcls = tcls[cls_mask].long().view(-1) + cls_mask = cls_mask.view(-1, 1).repeat(1, nC).to(self.device) + cls = cls[cls_mask].view(-1, nC) + + nProposals = int((conf > 0.25).sum()) + + tcoord = tcoord.view(4, cls_anchor_dim).to(self.device) + tconf, tcls = tconf.to(self.device), tcls.to(self.device) + coord_mask, conf_mask = coord_mask.view(cls_anchor_dim).to(self.device), conf_mask.sqrt().to(self.device) + + t3 = time.time() + loss_coord = self.coord_scale * nn.MSELoss(size_average=False)(coord * coord_mask, tcoord * coord_mask) / 2 + # sqrt(object_scale)/2 is almost equal to 1. 
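+        # (conf_mask was square-rooted above, so object_scale ends up weighting
+        # the squared confidence error linearly once MSELoss squares it again)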
+ loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2 + loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls) if cls.size(0) > 0 else 0 + loss = loss_coord + loss_conf + loss_cls + t4 = time.time() + if False: + print('-' * 30) + print(' activation : %f' % (t1 - t0)) + print(' create pred_boxes : %f' % (t2 - t1)) + print(' build targets : %f' % (t3 - t2)) + print(' create loss : %f' % (t4 - t3)) + print(' total : %f' % (t4 - t0)) + print('%d: nGT %3d, nRC %3d, nPP %3d, loss: box %6.3f, conf %6.3f, class %6.3f, total %7.3f' + % (self.seen, nGT, nRecall, nProposals, loss_coord, loss_conf, loss_cls, loss)) + if math.isnan(loss.item()): + print(conf, tconf) + sys.exit(0) + return loss diff --git a/detector/YOLOv3/weight/.gitkeep b/detector/YOLOv3/weight/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/detector/YOLOv3/yolo_layer.py b/detector/YOLOv3/yolo_layer.py new file mode 100644 index 0000000..578969f --- /dev/null +++ b/detector/YOLOv3/yolo_layer.py @@ -0,0 +1,181 @@ +import math +import sys +import time +import torch +import torch.nn as nn +from .yolo_utils import bbox_iou, multi_bbox_ious, convert2cpu + + +class YoloLayer(nn.Module): + def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, use_cuda=None): + super(YoloLayer, self).__init__() + use_cuda = torch.cuda.is_available() and (True if use_cuda is None else use_cuda) + self.device = torch.device("cuda" if use_cuda else "cpu") + + self.anchor_mask = anchor_mask + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) // num_anchors + self.rescore = 0 + self.ignore_thresh = 0.5 + self.truth_thresh = 1. + self.stride = 32 + self.nth_layer = 0 + self.seen = 0 + self.net_width = 0 + self.net_height = 0 + + def get_mask_boxes(self, output): + masked_anchors = [] + for m in self.anchor_mask: + masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] + masked_anchors = [anchor / self.stride for anchor in masked_anchors] + + masked_anchors = torch.FloatTensor(masked_anchors).to(self.device) + num_anchors = torch.IntTensor([len(self.anchor_mask)]).to(self.device) + return {'x': output, 'a': masked_anchors, 'n': num_anchors} + + def build_targets(self, pred_boxes, target, anchors, nA, nH, nW): + nB = target.size(0) + anchor_step = anchors.size(1) # anchors[nA][anchor_step] + conf_mask = torch.ones(nB, nA, nH, nW) + coord_mask = torch.zeros(nB, nA, nH, nW) + cls_mask = torch.zeros(nB, nA, nH, nW) + tcoord = torch.zeros(4, nB, nA, nH, nW) + tconf = torch.zeros(nB, nA, nH, nW) + tcls = torch.zeros(nB, nA, nH, nW) + twidth, theight = self.net_width / self.stride, self.net_height / self.stride + + nAnchors = nA * nH * nW + nPixels = nH * nW + nGT = 0 + nRecall = 0 + nRecall75 = 0 + + # it works faster on CPU than on GPU. 
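+        # (these anchors were already divided by the stride in get_mask_boxes,
+        # so they are in feature-grid units, directly comparable to gw/gh below)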
+ anchors = anchors.to("cpu") + + for b in range(nB): + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + cur_ious = torch.zeros(nAnchors) + tbox = target[b].view(-1, 5).to("cpu") + for t in range(50): + if tbox[t][1] == 0: + break + gx, gy = tbox[t][1] * nW, tbox[t][2] * nH + gw, gh = tbox[t][3] * twidth, tbox[t][4] * theight + cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() + cur_ious = torch.max(cur_ious, multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) + ignore_ix = cur_ious > self.ignore_thresh + conf_mask[b][ignore_ix.view(nA, nH, nW)] = 0 + + for t in range(50): + if tbox[t][1] == 0: + break + nGT += 1 + gx, gy = tbox[t][1] * nW, tbox[t][2] * nH + gw, gh = tbox[t][3] * twidth, tbox[t][4] * theight + gw, gh = gw.float(), gh.float() + gi, gj = int(gx), int(gy) + + tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA, 1).t() + anchor_boxes = torch.cat((torch.zeros(nA, anchor_step), anchors), 1).t() + _, best_n = torch.max(multi_bbox_ious(tmp_gt_boxes, anchor_boxes, x1y1x2y2=False), 0) + + gt_box = torch.FloatTensor([gx, gy, gw, gh]) + pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) + + coord_mask[b][best_n][gj][gi] = 1 + cls_mask[b][best_n][gj][gi] = 1 + conf_mask[b][best_n][gj][gi] = 1 + tcoord[0][b][best_n][gj][gi] = gx - gi + tcoord[1][b][best_n][gj][gi] = gy - gj + tcoord[2][b][best_n][gj][gi] = math.log(gw / anchors[best_n][0]) + tcoord[3][b][best_n][gj][gi] = math.log(gh / anchors[best_n][1]) + tcls[b][best_n][gj][gi] = tbox[t][0] + tconf[b][best_n][gj][gi] = iou if self.rescore else 1. + + if iou > 0.5: + nRecall += 1 + if iou > 0.75: + nRecall75 += 1 + + return nGT, nRecall, nRecall75, coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls + + def forward(self, output, target): + # output : BxAs*(4+1+num_classes)*H*W + mask_tuple = self.get_mask_boxes(output) + t0 = time.time() + nB = output.data.size(0) # batch size + nA = mask_tuple['n'].item() # num_anchors + nC = self.num_classes + nH = output.data.size(2) + nW = output.data.size(3) + anchor_step = mask_tuple['a'].size(0) // nA + anchors = mask_tuple['a'].view(nA, anchor_step).to(self.device) + cls_anchor_dim = nB * nA * nH * nW + + output = output.view(nB, nA, (5 + nC), nH, nW) + cls_grid = torch.linspace(5, 5 + nC - 1, nC).long().to(self.device) + ix = torch.LongTensor(range(0, 5)).to(self.device) + pred_boxes = torch.FloatTensor(4, cls_anchor_dim).to(self.device) + + coord = output.index_select(2, ix[0:4]).view(nB * nA, -1, nH * nW).transpose(0, 1).contiguous().view(-1, + cls_anchor_dim) # x, y, w, h + coord[0:2] = coord[0:2].sigmoid() # x, y + conf = output.index_select(2, ix[4]).view(nB, nA, nH, nW).sigmoid() + cls = output.index_select(2, cls_grid) + cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(cls_anchor_dim, nC) + + t1 = time.time() + grid_x = torch.linspace(0, nW - 1, nW).repeat(nB * nA, nH, 1).view(cls_anchor_dim).to(self.device) + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(cls_anchor_dim).to( + self.device) + anchor_w = anchors.index_select(1, ix[0]).repeat(1, nB * nH * nW).view(cls_anchor_dim) + anchor_h = anchors.index_select(1, ix[1]).repeat(1, nB * nH * nW).view(cls_anchor_dim) + + pred_boxes[0] = coord[0] + grid_x + pred_boxes[1] = coord[1] + grid_y + pred_boxes[2] = coord[2].exp() * anchor_w + pred_boxes[3] = coord[3].exp() * anchor_h + # for build_targets. 
it works faster on CPU than on GPU + pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)).detach() + + t2 = time.time() + nGT, nRecall, nRecall75, coord_mask, conf_mask, cls_mask, tcoord, tconf, tcls = \ + self.build_targets(pred_boxes, target.detach(), anchors.detach(), nA, nH, nW) + + cls_mask = (cls_mask == 1) + tcls = tcls[cls_mask].long().view(-1) + cls_mask = cls_mask.view(-1, 1).repeat(1, nC).to(self.device) + cls = cls[cls_mask].view(-1, nC) + + nProposals = int((conf > 0.25).sum()) + + tcoord = tcoord.view(4, cls_anchor_dim).to(self.device) + tconf, tcls = tconf.to(self.device), tcls.to(self.device) + coord_mask, conf_mask = coord_mask.view(cls_anchor_dim).to(self.device), conf_mask.to(self.device) + + t3 = time.time() + loss_coord = nn.MSELoss(size_average=False)(coord * coord_mask, tcoord * coord_mask) / 2 + loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) + loss_cls = nn.CrossEntropyLoss(size_average=False)(cls, tcls) if cls.size(0) > 0 else 0 + loss = loss_coord + loss_conf + loss_cls + + t4 = time.time() + if False: + print('-' * 30) + print(' activation : %f' % (t1 - t0)) + print(' create pred_boxes : %f' % (t2 - t1)) + print(' build targets : %f' % (t3 - t2)) + print(' create loss : %f' % (t4 - t3)) + print(' total : %f' % (t4 - t0)) + print( + '%d: Layer(%03d) nGT %3d, nRC %3d, nRC75 %3d, nPP %3d, loss: box %6.3f, conf %6.3f, class %6.3f, total %7.3f' + % (self.seen, self.nth_layer, nGT, nRecall, nRecall75, nProposals, loss_coord, loss_conf, loss_cls, loss)) + if math.isnan(loss.item()): + print(conf, tconf) + sys.exit(0) + return loss diff --git a/detector/YOLOv3/yolo_utils.py b/detector/YOLOv3/yolo_utils.py new file mode 100644 index 0000000..b546eef --- /dev/null +++ b/detector/YOLOv3/yolo_utils.py @@ -0,0 +1,589 @@ +import os +import time +import math +import torch +import numpy as np +from PIL import Image, ImageDraw +import struct # get_image_size +import imghdr # get_image_size + + +def sigmoid(x): + return 1.0 / (math.exp(-x) + 1.) 
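+
+# Quick sanity check for the two activations here (illustrative values only):
+#   sigmoid(0.0)              -> 0.5
+#   softmax(torch.zeros(3))   -> tensor([0.3333, 0.3333, 0.3333])
+# softmax subtracts the max before exponentiating, the standard trick to avoid
+# overflow on large logits.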
+
+
+def softmax(x):
+    x = torch.exp(x - torch.max(x))
+    x /= x.sum()
+    return x
+
+
+def bbox_iou(box1, box2, x1y1x2y2=True):
+    if x1y1x2y2:
+        x1_min = min(box1[0], box2[0])
+        x2_max = max(box1[2], box2[2])
+        y1_min = min(box1[1], box2[1])
+        y2_max = max(box1[3], box2[3])
+        w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
+        w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
+    else:
+        w1, h1 = box1[2], box1[3]
+        w2, h2 = box2[2], box2[3]
+        x1_min = min(box1[0] - w1 / 2.0, box2[0] - w2 / 2.0)
+        x2_max = max(box1[0] + w1 / 2.0, box2[0] + w2 / 2.0)
+        y1_min = min(box1[1] - h1 / 2.0, box2[1] - h2 / 2.0)
+        y2_max = max(box1[1] + h1 / 2.0, box2[1] + h2 / 2.0)
+
+    w_union = x2_max - x1_min
+    h_union = y2_max - y1_min
+    w_cross = w1 + w2 - w_union
+    h_cross = h1 + h2 - h_union
+    carea = 0
+    if w_cross <= 0 or h_cross <= 0:
+        return 0.0
+
+    area1 = w1 * h1
+    area2 = w2 * h2
+    carea = w_cross * h_cross
+    uarea = area1 + area2 - carea
+    return float(carea / uarea)
+
+
+def multi_bbox_ious(boxes1, boxes2, x1y1x2y2=True):
+    if x1y1x2y2:
+        x1_min = torch.min(boxes1[0], boxes2[0])
+        x2_max = torch.max(boxes1[2], boxes2[2])
+        y1_min = torch.min(boxes1[1], boxes2[1])
+        y2_max = torch.max(boxes1[3], boxes2[3])
+        w1, h1 = boxes1[2] - boxes1[0], boxes1[3] - boxes1[1]
+        w2, h2 = boxes2[2] - boxes2[0], boxes2[3] - boxes2[1]
+    else:
+        w1, h1 = boxes1[2], boxes1[3]
+        w2, h2 = boxes2[2], boxes2[3]
+        x1_min = torch.min(boxes1[0] - w1 / 2.0, boxes2[0] - w2 / 2.0)
+        x2_max = torch.max(boxes1[0] + w1 / 2.0, boxes2[0] + w2 / 2.0)
+        y1_min = torch.min(boxes1[1] - h1 / 2.0, boxes2[1] - h2 / 2.0)
+        y2_max = torch.max(boxes1[1] + h1 / 2.0, boxes2[1] + h2 / 2.0)
+
+    w_union = x2_max - x1_min
+    h_union = y2_max - y1_min
+    w_cross = w1 + w2 - w_union
+    h_cross = h1 + h2 - h_union
+    mask = (((w_cross <= 0) + (h_cross <= 0)) > 0)
+    area1 = w1 * h1
+    area2 = w2 * h2
+    carea = w_cross * h_cross
+    carea[mask] = 0
+    uarea = area1 + area2 - carea
+    return carea / uarea
+
+
+from .nms import boxes_nms
+
+
+def post_process(boxes, num_classes, conf_thresh=0.01, nms_thresh=0.45, obj_thresh=0.3):
+    batch_size = boxes.size(0)
+
+    # nms
+    results_boxes = []
+    for batch_id in range(batch_size):
+        processed_boxes = []
+        for cls_id in range(num_classes):
+            mask = (boxes[batch_id, :, -1] == cls_id) * (boxes[batch_id, :, 4] > obj_thresh)
+            masked_boxes = boxes[batch_id, mask]
+
+            keep = boxes_nms(masked_boxes[:, :4], masked_boxes[:, 5], nms_thresh)
+
+            nmsed_boxes = masked_boxes[keep, :]
+
+            processed_boxes.append(nmsed_boxes)
+        processed_boxes = torch.cat(processed_boxes, dim=0)
+
+        results_boxes.append(processed_boxes)
+
+    return results_boxes
+
+
+def xywh_to_xyxy(boxes_xywh):
+    boxes_xyxy = boxes_xywh.copy()
+    boxes_xyxy[:, 0] = boxes_xywh[:, 0] - boxes_xywh[:, 2] / 2.
+    boxes_xyxy[:, 1] = boxes_xywh[:, 1] - boxes_xywh[:, 3] / 2.
+    boxes_xyxy[:, 2] = boxes_xywh[:, 0] + boxes_xywh[:, 2] / 2.
+    boxes_xyxy[:, 3] = boxes_xywh[:, 1] + boxes_xywh[:, 3] / 2.
+
+    return boxes_xyxy
+
+
+def xyxy_to_xywh(boxes_xyxy):
+    if isinstance(boxes_xyxy, torch.Tensor):
+        boxes_xywh = boxes_xyxy.clone()
+    elif isinstance(boxes_xyxy, np.ndarray):
+        boxes_xywh = boxes_xyxy.copy()
+
+    boxes_xywh[:, 0] = (boxes_xyxy[:, 0] + boxes_xyxy[:, 2]) / 2.
+    boxes_xywh[:, 1] = (boxes_xyxy[:, 1] + boxes_xyxy[:, 3]) / 2.
+ boxes_xywh[:, 2] = boxes_xyxy[:, 2] - boxes_xyxy[:, 0] + boxes_xywh[:, 3] = boxes_xyxy[:, 3] - boxes_xyxy[:, 1] + + return boxes_xywh + + +def nms(boxes, nms_thresh): + if len(boxes) == 0: + return boxes + + det_confs = torch.zeros(len(boxes)) + for i in range(len(boxes)): + det_confs[i] = boxes[i][4] + + _, sortIds = torch.sort(det_confs, descending=True) + out_boxes = [] + for i in range(len(boxes)): + box_i = boxes[sortIds[i]] + if box_i[4] > 0: + out_boxes.append(box_i) + for j in range(i + 1, len(boxes)): + box_j = boxes[sortIds[j]] + if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: + # print(box_i, box_j, bbox_iou(box_i, box_j, x1y1x2y2=False)) + box_j[4] = 0 + return out_boxes + + +def convert2cpu(gpu_matrix): + return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) + + +def convert2cpu_long(gpu_matrix): + return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) + + +def get_all_boxes(output, conf_thresh, num_classes, only_objectness=1, validation=False, use_cuda=True): + # total number of inputs (batch size) + # first element (x) for first tuple (x, anchor_mask, num_anchor) + batchsize = output[0]['x'].data.size(0) + + all_boxes = [] + for i in range(len(output)): + pred, anchors, num_anchors = output[i]['x'].data, output[i]['a'], output[i]['n'].item() + boxes = get_region_boxes(pred, conf_thresh, num_classes, anchors, num_anchors, \ + only_objectness=only_objectness, validation=validation, use_cuda=use_cuda) + + all_boxes.append(boxes) + return torch.cat(all_boxes, dim=1) + + +def get_region_boxes(output, obj_thresh, num_classes, anchors, num_anchors, only_objectness=1, validation=False, + use_cuda=True): + device = torch.device("cuda" if use_cuda else "cpu") + anchors = anchors.to(device) + anchor_step = anchors.size(0) // num_anchors + if output.dim() == 3: + output = output.unsqueeze(0) + batch = output.size(0) + assert (output.size(1) == (5 + num_classes) * num_anchors) + h = output.size(2) + w = output.size(3) + cls_anchor_dim = batch * num_anchors * h * w + + # all_boxes = [] + output = output.view(batch * num_anchors, 5 + num_classes, h * w).transpose(0, 1).contiguous().view(5 + num_classes, + cls_anchor_dim) + + grid_x = torch.linspace(0, w - 1, w).repeat(batch * num_anchors, h, 1).view(cls_anchor_dim).to(device) + grid_y = torch.linspace(0, h - 1, h).repeat(w, 1).t().repeat(batch * num_anchors, 1, 1).view(cls_anchor_dim).to( + device) + ix = torch.LongTensor(range(0, 2)).to(device) + anchor_w = anchors.view(num_anchors, anchor_step).index_select(1, ix[0]).repeat(1, batch, h * w).view( + cls_anchor_dim) + anchor_h = anchors.view(num_anchors, anchor_step).index_select(1, ix[1]).repeat(1, batch, h * w).view( + cls_anchor_dim) + + xs, ys = torch.sigmoid(output[0]) + grid_x, torch.sigmoid(output[1]) + grid_y + ws, hs = torch.exp(output[2]) * anchor_w.detach(), torch.exp(output[3]) * anchor_h.detach() + det_confs = torch.sigmoid(output[4]) + + # by ysyun, dim=1 means input is 2D or even dimension else dim=0 + cls_confs = torch.nn.Softmax(dim=1)(output[5:5 + num_classes].transpose(0, 1)).detach() + cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) + cls_max_confs = cls_max_confs.view(-1) + cls_max_ids = cls_max_ids.view(-1).float() + + # sz_hw = h*w + # sz_hwa = sz_hw*num_anchors + # det_confs = convert2cpu(det_confs) + # cls_max_confs = convert2cpu(cls_max_confs) + # cls_max_ids = convert2cpu_long(cls_max_ids) + # xs, ys = convert2cpu(xs), convert2cpu(ys) + # ws, hs = convert2cpu(ws), convert2cpu(hs) + + cls_confs = det_confs * cls_max_confs + + # boxes = 
[xs/w, ys/h, ws/w, hs/h, det_confs, cls_confs, cls_max_ids] + xs, ys, ws, hs = xs / w, ys / h, ws / w, hs / h + x1, y1, x2, y2 = torch.clamp_min(xs - ws / 2., 0.), torch.clamp_min(ys - hs / 2., 0.), torch.clamp_max(xs + ws / 2., + 1.), torch.clamp_max( + ys + hs / 2., 1.) + boxes = [x1, y1, x2, y2, det_confs, cls_confs, cls_max_ids] + boxes = list(map(lambda x: x.view(batch, -1), boxes)) + boxes = torch.stack(boxes, dim=2) + + # for b in range(batch): + # boxes = [] + # for cy in range(h): + # for cx in range(w): + # for i in range(num_anchors): + # ind = b*sz_hwa + i*sz_hw + cy*w + cx + # det_conf = det_confs[ind] + # if only_objectness: + # conf = det_confs[ind] + # else: + # conf = det_confs[ind] * cls_max_confs[ind] + + # if conf > conf_thresh: + # bcx = xs[ind] + # bcy = ys[ind] + # bw = ws[ind] + # bh = hs[ind] + # cls_max_conf = cls_max_confs[ind] + # cls_max_id = cls_max_ids[ind] + # box = [bcx/w, bcy/h, bw/w, bh/h, det_conf, cls_max_conf, cls_max_id] + + # boxes.append(box) + # all_boxes.append(boxes) + return boxes + + +# def get_all_boxes(output, conf_thresh, num_classes, only_objectness=1, validation=False, use_cuda=True): +# # total number of inputs (batch size) +# # first element (x) for first tuple (x, anchor_mask, num_anchor) +# tot = output[0]['x'].data.size(0) +# all_boxes = [[] for i in range(tot)] +# for i in range(len(output)): +# pred, anchors, num_anchors = output[i]['x'].data, output[i]['a'], output[i]['n'].item() +# b = get_region_boxes(pred, conf_thresh, num_classes, anchors, num_anchors, \ +# only_objectness=only_objectness, validation=validation, use_cuda=use_cuda) +# for t in range(tot): +# all_boxes[t] += b[t] +# return all_boxes + +# def get_region_boxes(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1, validation=False, use_cuda=True): +# device = torch.device("cuda" if use_cuda else "cpu") +# anchors = anchors.to(device) +# anchor_step = anchors.size(0)//num_anchors +# if output.dim() == 3: +# output = output.unsqueeze(0) +# batch = output.size(0) +# assert(output.size(1) == (5+num_classes)*num_anchors) +# h = output.size(2) +# w = output.size(3) +# cls_anchor_dim = batch*num_anchors*h*w + +# t0 = time.time() +# all_boxes = [] +# output = output.view(batch*num_anchors, 5+num_classes, h*w).transpose(0,1).contiguous().view(5+num_classes, cls_anchor_dim) + +# grid_x = torch.linspace(0, w-1, w).repeat(batch*num_anchors, h, 1).view(cls_anchor_dim).to(device) +# grid_y = torch.linspace(0, h-1, h).repeat(w,1).t().repeat(batch*num_anchors, 1, 1).view(cls_anchor_dim).to(device) +# ix = torch.LongTensor(range(0,2)).to(device) +# anchor_w = anchors.view(num_anchors, anchor_step).index_select(1, ix[0]).repeat(1, batch, h*w).view(cls_anchor_dim) +# anchor_h = anchors.view(num_anchors, anchor_step).index_select(1, ix[1]).repeat(1, batch, h*w).view(cls_anchor_dim) + +# xs, ys = torch.sigmoid(output[0]) + grid_x, torch.sigmoid(output[1]) + grid_y +# ws, hs = torch.exp(output[2]) * anchor_w.detach(), torch.exp(output[3]) * anchor_h.detach() +# det_confs = torch.sigmoid(output[4]) + +# # by ysyun, dim=1 means input is 2D or even dimension else dim=0 +# cls_confs = torch.nn.Softmax(dim=1)(output[5:5+num_classes].transpose(0,1)).detach() +# cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) +# cls_max_confs = cls_max_confs.view(-1) +# cls_max_ids = cls_max_ids.view(-1) +# t1 = time.time() + +# sz_hw = h*w +# sz_hwa = sz_hw*num_anchors +# det_confs = convert2cpu(det_confs) +# cls_max_confs = convert2cpu(cls_max_confs) +# cls_max_ids = 
convert2cpu_long(cls_max_ids) +# xs, ys = convert2cpu(xs), convert2cpu(ys) +# ws, hs = convert2cpu(ws), convert2cpu(hs) +# if validation: +# cls_confs = convert2cpu(cls_confs.view(-1, num_classes)) + +# t2 = time.time() +# for b in range(batch): +# boxes = [] +# for cy in range(h): +# for cx in range(w): +# for i in range(num_anchors): +# ind = b*sz_hwa + i*sz_hw + cy*w + cx +# det_conf = det_confs[ind] +# if only_objectness: +# conf = det_confs[ind] +# else: +# conf = det_confs[ind] * cls_max_confs[ind] + +# if conf > conf_thresh: +# bcx = xs[ind] +# bcy = ys[ind] +# bw = ws[ind] +# bh = hs[ind] +# cls_max_conf = cls_max_confs[ind] +# cls_max_id = cls_max_ids[ind] +# box = [bcx/w, bcy/h, bw/w, bh/h, det_conf, cls_max_conf, cls_max_id] +# if (not only_objectness) and validation: +# for c in range(num_classes): +# tmp_conf = cls_confs[ind][c] +# if c != cls_max_id and det_confs[ind]*tmp_conf > conf_thresh: +# box.append(tmp_conf) +# box.append(c) +# boxes.append(box) +# all_boxes.append(boxes) +# t3 = time.time() +# if False: +# print('---------------------------------') +# print('matrix computation : %f' % (t1-t0)) +# print(' gpu to cpu : %f' % (t2-t1)) +# print(' boxes filter : %f' % (t3-t2)) +# print('---------------------------------') +# return all_boxes + +def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): + import cv2 + colors = torch.FloatTensor([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]) + + def get_color(c, x, max_val): + ratio = float(x) / max_val * 5 + i = int(math.floor(ratio)) + j = int(math.ceil(ratio)) + ratio -= i + r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] + return int(r * 255) + + width = img.shape[1] + height = img.shape[0] + for i in range(len(boxes)): + box = boxes[i] + x1 = int(round((box[0] - box[2] / 2.0) * width)) + y1 = int(round((box[1] - box[3] / 2.0) * height)) + x2 = int(round((box[0] + box[2] / 2.0) * width)) + y2 = int(round((box[1] + box[3] / 2.0) * height)) + + if color: + rgb = color + else: + rgb = (255, 0, 0) + if len(box) >= 7 and class_names: + cls_conf = box[5] + cls_id = box[6] + # print('%s: %f' % (class_names[cls_id], cls_conf)) + classes = len(class_names) + offset = cls_id * 123457 % classes + red = get_color(2, offset, classes) + green = get_color(1, offset, classes) + blue = get_color(0, offset, classes) + if color is None: + rgb = (red, green, blue) + img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) + img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) + if savename: + print("save plot results to %s" % savename) + cv2.imwrite(savename, img) + return img + + +def plot_boxes(img, boxes, savename=None, class_names=None): + colors = torch.FloatTensor([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]) + + def get_color(c, x, max_val): + ratio = float(x) / max_val * 5 + i = int(math.floor(ratio)) + j = int(math.ceil(ratio)) + ratio -= i + r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] + return int(r * 255) + + width = img.width + height = img.height + draw = ImageDraw.Draw(img) + print("%d box(es) is(are) found" % len(boxes)) + for i in range(len(boxes)): + box = boxes[i] + x1 = (box[0] - box[2] / 2.0) * width + y1 = (box[1] - box[3] / 2.0) * height + x2 = (box[0] + box[2] / 2.0) * width + y2 = (box[1] + box[3] / 2.0) * height + + rgb = (255, 0, 0) + if len(box) >= 7 and class_names: + cls_conf = box[5] + cls_id = box[6] + print('%s: %f' % (class_names[cls_id], cls_conf)) + classes = len(class_names) + offset = cls_id * 
123457 % classes + red = get_color(2, offset, classes) + green = get_color(1, offset, classes) + blue = get_color(0, offset, classes) + rgb = (red, green, blue) + draw.text((x1, y1), class_names[cls_id], fill=rgb) + draw.rectangle([x1, y1, x2, y2], outline=rgb) + if savename: + print("save plot results to %s" % savename) + img.save(savename) + return img + + +def read_truths(lab_path): + if not os.path.exists(lab_path): + return np.array([]) + if os.path.getsize(lab_path): + truths = np.loadtxt(lab_path) + truths = truths.reshape(truths.size // 5, 5) # to avoid single truth problem + return truths + else: + return np.array([]) + + +def read_truths_args(lab_path, min_box_scale): + truths = read_truths(lab_path) + new_truths = [] + for i in range(truths.shape[0]): + if truths[i][3] < min_box_scale: + continue + new_truths.append([truths[i][0], truths[i][1], truths[i][2], truths[i][3], truths[i][4]]) + return np.array(new_truths) + + +def load_class_names(namesfile): + class_names = [] + with open(namesfile, 'r', encoding='utf8') as fp: + lines = fp.readlines() + for line in lines: + class_names.append(line.strip()) + return class_names + + +def image2torch(img): + if isinstance(img, Image.Image): + width = img.width + height = img.height + img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) + img = img.view(height, width, 3).transpose(0, 1).transpose(0, 2).contiguous() + img = img.view(1, 3, height, width) + img = img.float().div(255.0) + elif type(img) == np.ndarray: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + else: + print("unknown image type") + exit(-1) + return img + + +def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=True): + model.eval() + t0 = time.time() + img = image2torch(img) + t1 = time.time() + + img = img.to(torch.device("cuda" if use_cuda else "cpu")) + t2 = time.time() + + out_boxes = model(img) + boxes = get_all_boxes(out_boxes, conf_thresh, model.num_classes, use_cuda=use_cuda)[0] + + t3 = time.time() + boxes = nms(boxes, nms_thresh) + t4 = time.time() + + if False: + print('-----------------------------------') + print(' image to tensor : %f' % (t1 - t0)) + print(' tensor to cuda : %f' % (t2 - t1)) + print(' predict : %f' % (t3 - t2)) + print(' nms : %f' % (t4 - t3)) + print(' total : %f' % (t4 - t0)) + print('-----------------------------------') + return boxes + + +def read_data_cfg(datacfg): + options = dict() + options['gpus'] = '0,1,2,3' + options['num_workers'] = '10' + with open(datacfg) as fp: + lines = fp.readlines() + + for line in lines: + line = line.strip() + if line == '': + continue + key, value = line.split('=') + key = key.strip() + value = value.strip() + options[key] = value + return options + + +def scale_bboxes(bboxes, width, height): + import copy + dets = copy.deepcopy(bboxes) + for i in range(len(dets)): + dets[i][0] = dets[i][0] * width + dets[i][1] = dets[i][1] * height + dets[i][2] = dets[i][2] * width + dets[i][3] = dets[i][3] * height + return dets + + +def file_lines(thefilepath): + count = 0 + thefile = open(thefilepath, 'rb') + while True: + buffer = thefile.read(8192 * 1024) + if not buffer: + break + count += buffer.count(b'\n') + thefile.close() + return count + + +def get_image_size(fname): + """ + Determine the image type of fhandle and return its size. 
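+    Handles png, gif and jpeg headers (sniffed with imghdr).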
+    from draco
+    """
+    with open(fname, 'rb') as fhandle:
+        head = fhandle.read(24)
+        if len(head) != 24:
+            return
+        if imghdr.what(fname) == 'png':
+            check = struct.unpack('>i', head[4:8])[0]
+            if check != 0x0d0a1a0a:
+                return
+            width, height = struct.unpack('>ii', head[16:24])
+        elif imghdr.what(fname) == 'gif':
+            width, height = struct.unpack('<HH', head[6:10])
+        elif imghdr.what(fname) == 'jpeg':
+            try:
+                fhandle.seek(0)  # Read 0xff next
+                size = 2
+                ftype = 0
+                while not 0xc0 <= ftype <= 0xcf:
+                    fhandle.seek(size, 1)
+                    byte = fhandle.read(1)
+                    while ord(byte) == 0xff:
+                        byte = fhandle.read(1)
+                    ftype = ord(byte)
+                    size = struct.unpack('>H', fhandle.read(2))[0] - 2
+                # We are at a SOFn block
+                fhandle.seek(1, 1)  # Skip `precision' byte.
+                height, width = struct.unpack('>HH', fhandle.read(4))
+            except Exception:  # IGNORE:W0703
+                return
+        else:
+            return
+    return width, height
+
+
+def logging(message):
+    print('%s %s' % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), message))
diff --git a/detector/__init__.py b/detector/__init__.py
new file mode 100644
index 0000000..0e57c92
--- /dev/null
+++ b/detector/__init__.py
@@ -0,0 +1,132 @@
+from .YOLOv3 import YOLOv3
+import onnxruntime
+import numpy as np
+import time
+import cv2
+import torch
+from detector.YOLOv3.nms import boxes_nms
+
+__all__ = ['build_detector', 'build_onnx']
+
+
+def xyxy_to_xywh(boxes_xyxy):
+    # convert corner boxes (x1, y1, x2, y2) to centre format (cx, cy, w, h)
+    if isinstance(boxes_xyxy, torch.Tensor):
+        boxes_xywh = boxes_xyxy.clone()
+    elif isinstance(boxes_xyxy, np.ndarray):
+        boxes_xywh = boxes_xyxy.copy()
+
+    boxes_xywh[:, 0] = (boxes_xyxy[:, 0] + boxes_xyxy[:, 2]) / 2.
+    boxes_xywh[:, 1] = (boxes_xyxy[:, 1] + boxes_xyxy[:, 3]) / 2.
+    boxes_xywh[:, 2] = boxes_xyxy[:, 2] - boxes_xyxy[:, 0]
+    boxes_xywh[:, 3] = boxes_xyxy[:, 3] - boxes_xyxy[:, 1]
+
+    return boxes_xywh
+
+
+def build_detector(cfg, use_cuda):
+    return YOLOv3(cfg.YOLOV3.CFG, cfg.YOLOV3.WEIGHT, cfg.YOLOV3.CLASS_NAMES,
+                  score_thresh=cfg.YOLOV3.SCORE_THRESH, nms_thresh=cfg.YOLOV3.NMS_THRESH,
+                  is_xywh=True, use_cuda=use_cuda)
+
+
+class build_onnx():
+    def __init__(self, cfg):
+        self.session = onnxruntime.InferenceSession(cfg.YOLOV4.WEIGHT)
+        print("The model expects input shape: ", self.session.get_inputs()[0].shape)
+        self.class_names = self.load_class_names(cfg.YOLOV4.CLASS_NAMES)
+
+    def forward(self, img, video_width, video_height):
+        IN_IMAGE_H = self.session.get_inputs()[0].shape[2]
+        IN_IMAGE_W = self.session.get_inputs()[0].shape[3]
+
+        # Input: resize, BGR->RGB, NCHW float32 in [0, 1]
+        resized = cv2.resize(img, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)
+        img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
+        img_in = np.expand_dims(img_in, axis=0)
+        img_in /= 255.0
+        print("Shape of the network input: ", img_in.shape)
+
+        # Compute
+        input_name = self.session.get_inputs()[0].name
+        t5 = time.time()
+        outputs = self.session.run(None, {input_name: img_in})
+        t6 = time.time()
+        print(' -------------infer----------------: %f' % (t6 - t5))
+
+        self.boxes = np.array(self.post_processing(img_in, 0.4, 0.6, outputs))[0]
+        self.box = xyxy_to_xywh(self.boxes[:, 0:4])
+        # the network outputs normalized coordinates; scale back to the frame size
+        self.box = self.box * np.array([video_width, video_height, video_width, video_height])
+
+        self.cls = self.boxes[:, 5]
+        self.id = self.boxes[:, 6]
+        return self.box, self.cls, self.id
+
+    def load_class_names(self, namesfile):
+        with open(namesfile, 'r', encoding='utf8') as fp:
+            class_names = [line.strip() for line in fp.readlines()]
+        return class_names
+
+    def post_processing(self, img, conf_thresh, nms_thresh, output):
+        # [batch, num, 1, 4]
+        box_array = output[0]
+        # [batch, num, num_classes]
+        confs = output[1]
+
+        t1 = time.time()
+
+        if type(box_array).__name__ != 'ndarray':
+            box_array = box_array.cpu().detach().numpy()
+            confs = confs.cpu().detach().numpy()
+
+        num_classes
= confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + t2 = time.time() + + bboxes_batch = [] + for i in range(box_array.shape[0]): + + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = np.array(boxes_nms(torch.tensor(ll_box_array), torch.tensor(ll_max_conf), nms_thresh)) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]]) + + bboxes_batch.append(bboxes) + + t3 = time.time() + + print('-----------------------------------') + print(' max and argmax : %f' % (t2 - t1)) + print(' nms : %f' % (t3 - t2)) + print('Post processing total : %f' % (t3 - t1)) + print('-----------------------------------') + + return bboxes_batch diff --git a/detector/trt.py b/detector/trt.py new file mode 100644 index 0000000..746ca8f --- /dev/null +++ b/detector/trt.py @@ -0,0 +1,212 @@ +import sys +import os +import time +import argparse +import numpy as np +import cv2 +# from PIL import Image +import tensorrt as trt +import pycuda.driver as cuda +import pycuda.autoinit +from detector.YOLOv3.nms import boxes_nms +import torch + +try: + # Sometimes python2 does not understand FileNotFoundError + FileNotFoundError +except NameError: + FileNotFoundError = IOError + +# __all__ = ['trt'] + +class tensorrt(): + def __init__(self , cfg, img_size = [416,416]): + self.cfg = cfg + self.engine = self.get_engine(cfg.YOLOV4.WEIGHT) + self.context = self.engine.create_execution_context() + self.buffers = self.allocate_buffers(self.engine, 1) + IN_IMAGE_H, IN_IMAGE_W = img_size + self.context.set_binding_shape(0, (1, 3, IN_IMAGE_H, IN_IMAGE_W)) + self.num_classes = 80 + self.image_size = img_size + # return context , buffers + + def get_engine(self,engine_path): + # If a serialized engine exists, use it instead of building an engine. 
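+        # A serialized .engine file is specific to the GPU and TensorRT version it
+        # was built with. An assumed way to produce it from the static ONNX export
+        # referenced in configs/yolov4_onnx.yaml (not shown in this diff) is
+        # TensorRT's trtexec tool:
+        #   trtexec --onnx=yolov4_1_3_416_416_static.onnx --saveEngine=yolov4.engine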
+ print("Reading engine from file {}".format(engine_path)) + TRT_LOGGER = trt.Logger() + with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def detect(self,context , buffers , image_src,video_width=416,video_height=416): + IN_IMAGE_H, IN_IMAGE_W = self.image_size + ta = time.time() + # Input + # image_src = cv2.imread(image_src) + resized = cv2.resize(image_src, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR) + img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) + img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) + img_in = np.expand_dims(img_in, axis=0) + img_in /= 255.0 + img_in = np.ascontiguousarray(img_in) + print("Shape of the network input: ", img_in.shape) + # print(img_in) + + inputs, outputs, bindings, stream = buffers + # print('Length of inputs: ', len(inputs)) + inputs[0].host = img_in + + trt_outputs = self.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) + + # print('Len of outputs: ', len(trt_outputs)) + + trt_outputs[0] = trt_outputs[0].reshape(1, -1, 1, 4) + trt_outputs[1] = trt_outputs[1].reshape(1, -1, self.num_classes) + + tb = time.time() + + # print('-----------------------------------') + # print(' TRT inference time: %f' % (tb - ta)) + # print('-----------------------------------') + + # boxes = post_processing(img_in, 0.4, 0.6, trt_outputs) + self.boxes = np.array(self.post_processing(img_in, self.cfg.YOLOV4.SCORE_THRESH, self.cfg.YOLOV4.NMS_THRESH, trt_outputs))[0] + # assert self.boxes[:,0:4] + self.box =self.xyxy_to_xywh(self.boxes[:,0:4]) + self.box = self.box * np.array([video_width, video_height, video_width, video_height]) + + self.cls = self.boxes[:,5] + self.id = self.boxes[:,6] + return self.box , self.cls , self.id + + def post_processing(self,img, conf_thresh, nms_thresh, output): + + box_array = output[0] + # [batch, num, num_classes] + confs = output[1] + + t1 = time.time() + + if type(box_array).__name__ != 'ndarray': + box_array = box_array.cpu().detach().numpy() + confs = confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + t2 = time.time() + + bboxes_batch = [] + for i in range(box_array.shape[0]): + + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = np.array(boxes_nms(torch.tensor(ll_box_array), torch.tensor(ll_max_conf), nms_thresh)) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]]) + + bboxes_batch.append(bboxes) + + t3 = time.time() + + # print('-----------------------------------') + # print(' max and argmax : %f' % (t2 - t1)) + # print(' nms : %f' % (t3 - t2)) + # print('Post processing total : %f' % (t3 - t1)) + # print('-----------------------------------') + + return bboxes_batch + + # Allocates all buffers required for an engine, 
i.e. host/device inputs/outputs. + # Simple helper data class that's a little nicer to use than a 2-tuple. + class HostDeviceMem(object): + def __init__(self, host_mem, device_mem): + self.host = host_mem + self.device = device_mem + + def __str__(self): + return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + + def __repr__(self): + return self.__str__() + + def allocate_buffers(self,engine, batch_size): + inputs = [] + outputs = [] + bindings = [] + stream = cuda.Stream() + for binding in engine: + + size = trt.volume(engine.get_binding_shape(binding)) * batch_size + dims = engine.get_binding_shape(binding) + + # in case batch dimension is -1 (dynamic) + if dims[0] < 0: + size *= -1 + + dtype = trt.nptype(engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(device_mem)) + # Append to the appropriate list. + if engine.binding_is_input(binding): + inputs.append(self.HostDeviceMem(host_mem, device_mem)) + else: + outputs.append(self.HostDeviceMem(host_mem, device_mem)) + return inputs, outputs, bindings, stream + +# This function is generalized for multiple inputs/outputs. +# inputs and outputs are expected to be lists of HostDeviceMem objects. + def do_inference(self,context, bindings, inputs, outputs, stream): + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async(bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. + return [out.host for out in outputs] + def GiB(self,val): + return val * 1 << 30 + + def xyxy_to_xywh(self,boxes_xyxy): + if isinstance(boxes_xyxy, torch.Tensor): + boxes_xywh = boxes_xyxy.clone() + elif isinstance(boxes_xyxy, np.ndarray): + boxes_xywh = boxes_xyxy.copy() + + boxes_xywh[:, 0] = (boxes_xyxy[:, 0] + boxes_xyxy[:, 2]) / 2. + boxes_xywh[:, 1] = (boxes_xyxy[:, 1] + boxes_xyxy[:, 3]) / 2. 
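+        # the two assignments below complete the centre format (cx, cy, w, h)
+        # that deepsort_person.update() consumes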
+        boxes_xywh[:, 2] = boxes_xyxy[:, 2] - boxes_xyxy[:, 0]
+        boxes_xywh[:, 3] = boxes_xyxy[:, 3] - boxes_xyxy[:, 1]
+
+        return boxes_xywh
+
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/asserts.py b/utils/asserts.py
new file mode 100644
index 0000000..59a73cc
--- /dev/null
+++ b/utils/asserts.py
@@ -0,0 +1,13 @@
+from os import environ
+
+
+def assert_in(file, files_to_check):
+    if file not in files_to_check:
+        raise AssertionError("{} does not exist in the list".format(str(file)))
+    return True
+
+
+def assert_in_env(check_list: list):
+    for item in check_list:
+        assert_in(item, environ.keys())
+    return True
diff --git a/utils/draw.py b/utils/draw.py
new file mode 100644
index 0000000..3366048
--- /dev/null
+++ b/utils/draw.py
@@ -0,0 +1,56 @@
+import numpy as np
+import cv2
+
+palette = (2 ** 11 - 1, 2 ** 15 - 1, 2 ** 20 - 1)
+
+
+def compute_color_for_labels(label):
+    """
+    Simple function that assigns a fixed color depending on the class
+    """
+    color = [int((p * (label ** 2 - label + 1)) % 255) for p in palette]
+    return tuple(color)
+
+
+def draw_boxes(img, output=None, count=[], detection_id=0, Type='car', offset=(0, 0)):
+    track_num = len(set(count))
+    if len(output) != 0:
+        bbox = output[:, :4]
+        identities = output[:, -1]
+        detection_id = len(identities)
+        for i, box in enumerate(bbox):
+            x1, y1, x2, y2 = [int(i) for i in box]
+            x1 += offset[0]
+            x2 += offset[0]
+            y1 += offset[1]
+            y2 += offset[1]
+            # box text and bar
+            id = int(identities[i]) if identities is not None else 0
+            color = compute_color_for_labels(id)
+            label = '{}{:d}'.format("", id)
+            t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 2, 2)[0]
+            cv2.rectangle(img, (x1, y1), (x2, y2), color, 3)
+            cv2.rectangle(img, (x1, y1), (x1 + t_size[0] + 3, y1 + t_size[1] + 4), color, -1)  # filled label background
+            cv2.putText(img, label, (x1, y1 + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 2, [255, 255, 255], 2)
+        # put the cumulative track count on the frame below
+    else:
+        detection_id = 0
+    puttxt_height = img.shape[0]
+    puttxt_width = img.shape[1]
+    if Type == 'car':
+        cv2.putText(img, "Total Car: " + str(track_num), (int(20), int(120)), 0, 5e-3 * 200, (0, 255, 0), 2)
+        cv2.putText(img, "Current Car Counter: " + str(detection_id), (int(20), int(80)), 0, 5e-3 * 200, (0, 255, 0), 2)
+    else:
+        cv2.putText(img, "Total Person: " + str(track_num), (int(4), int(25)), 0, 1, (255, 0, 255), 2)
+        cv2.putText(img, "Current Person Counter: " + str(detection_id), (int(4), int(50)), 0, 1, (255, 0, 255), 2)
+    # cv2.putText(img, "FPS: %.2f" % (fps), (int(20), int(40)), 0, 5e-3 * 200, (0, 255, 0), 3)
+    return img, track_num, detection_id
+
+
+if __name__ == '__main__':
+    for i in range(82):
+        print(compute_color_for_labels(i))
diff --git a/utils/evaluation.py b/utils/evaluation.py
new file mode 100644
index 0000000..1001794
--- /dev/null
+++ b/utils/evaluation.py
@@ -0,0 +1,103 @@
+import os
+import numpy as np
+import copy
+import motmetrics as mm
+mm.lap.default_solver = 'lap'
+from utils.io import read_results, unzip_objs
+
+
+class Evaluator(object):
+
+    def __init__(self, data_root, seq_name, data_type):
+        self.data_root = data_root
+        self.seq_name = seq_name
+        self.data_type = data_type
+
+        self.load_annotations()
+        self.reset_accumulator()
+
+    def load_annotations(self):
+        assert self.data_type == 'mot'
+
+        gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt')
+        self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True)
+        self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True)
+
+    def 
reset_accumulator(self): + self.acc = mm.MOTAccumulator(auto_id=True) + + def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): + # results + trk_tlwhs = np.copy(trk_tlwhs) + trk_ids = np.copy(trk_ids) + + # gts + gt_objs = self.gt_frame_dict.get(frame_id, []) + gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] + + # ignore boxes + ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) + ignore_tlwhs = unzip_objs(ignore_objs)[0] + + + # remove ignored results + keep = np.ones(len(trk_tlwhs), dtype=bool) + iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) + if len(iou_distance) > 0: + match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + match_ious = iou_distance[match_is, match_js] + + match_js = np.asarray(match_js, dtype=int) + match_js = match_js[np.logical_not(np.isnan(match_ious))] + keep[match_js] = False + trk_tlwhs = trk_tlwhs[keep] + trk_ids = trk_ids[keep] + + # get distance matrix + iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) + + # acc + self.acc.update(gt_ids, trk_ids, iou_distance) + + if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): + events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics + else: + events = None + return events + + def eval_file(self, filename): + self.reset_accumulator() + + result_frame_dict = read_results(filename, self.data_type, is_gt=False) + frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) + for frame_id in frames: + trk_objs = result_frame_dict.get(frame_id, []) + trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] + self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) + + return self.acc + + @staticmethod + def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): + names = copy.deepcopy(names) + if metrics is None: + metrics = mm.metrics.motchallenge_metrics + metrics = copy.deepcopy(metrics) + + mh = mm.metrics.create() + summary = mh.compute_many( + accs, + metrics=metrics, + names=names, + generate_overall=True + ) + + return summary + + @staticmethod + def save_summary(summary, filename): + import pandas as pd + writer = pd.ExcelWriter(filename) + summary.to_excel(writer) + writer.save() diff --git a/utils/io.py b/utils/io.py new file mode 100644 index 0000000..2dc9afd --- /dev/null +++ b/utils/io.py @@ -0,0 +1,133 @@ +import os +from typing import Dict +import numpy as np + +# from utils.log import get_logger + + +def write_results(filename, results, data_type): + if data_type == 'mot': + save_format = '{frame},{id},{x1},{y1},{w},{h},-1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, tlwhs, track_ids in results: + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in zip(tlwhs, track_ids): + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h) + f.write(line) + + +# def write_results(filename, results_dict: Dict, data_type: str): +# if not filename: +# return +# path = os.path.dirname(filename) +# if not os.path.exists(path): +# os.makedirs(path) + +# if data_type in ('mot', 'mcmot', 'lab'): +# save_format = 
'{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' +# elif data_type == 'kitti': +# save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' +# else: +# raise ValueError(data_type) + +# with open(filename, 'w') as f: +# for frame_id, frame_data in results_dict.items(): +# if data_type == 'kitti': +# frame_id -= 1 +# for tlwh, track_id in frame_data: +# if track_id < 0: +# continue +# x1, y1, w, h = tlwh +# x2, y2 = x1 + w, y1 + h +# line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) +# f.write(line) +# logger.info('Save results to {}'.format(filename)) + + +def read_results(filename, data_type: str, is_gt=False, is_ignore=False): + if data_type in ('mot', 'lab'): + read_fun = read_mot_results + else: + raise ValueError('Unknown data type: {}'.format(data_type)) + + return read_fun(filename, is_gt, is_ignore) + + +""" +labels={'ped', ... % 1 +'person_on_vhcl', ... % 2 +'car', ... % 3 +'bicycle', ... % 4 +'mbike', ... % 5 +'non_mot_vhcl', ... % 6 +'static_person', ... % 7 +'distractor', ... % 8 +'occluder', ... % 9 +'occluder_on_grnd', ... %10 +'occluder_full', ... % 11 +'reflection', ... % 12 +'crowd' ... % 13 +}; +""" + + +def read_mot_results(filename, is_gt, is_ignore): + valid_labels = {1} + ignore_labels = {2, 7, 8, 12} + results_dict = dict() + if os.path.isfile(filename): + with open(filename, 'r') as f: + for line in f.readlines(): + linelist = line.split(',') + if len(linelist) < 7: + continue + fid = int(linelist[0]) + if fid < 1: + continue + results_dict.setdefault(fid, list()) + + if is_gt: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + mark = int(float(linelist[6])) + if mark == 0 or label not in valid_labels: + continue + score = 1 + elif is_ignore: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + vis_ratio = float(linelist[8]) + if label not in ignore_labels and vis_ratio >= 0: + continue + else: + continue + score = 1 + else: + score = float(linelist[6]) + + tlwh = tuple(map(float, linelist[2:6])) + target_id = int(linelist[1]) + + results_dict[fid].append((tlwh, target_id, score)) + + return results_dict + + +def unzip_objs(objs): + if len(objs) > 0: + tlwhs, ids, scores = zip(*objs) + else: + tlwhs, ids, scores = [], [], [] + tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) + + return tlwhs, ids, scores \ No newline at end of file diff --git a/utils/json_logger.py b/utils/json_logger.py new file mode 100644 index 0000000..0afd0b4 --- /dev/null +++ b/utils/json_logger.py @@ -0,0 +1,383 @@ +""" +References: + https://medium.com/analytics-vidhya/creating-a-custom-logging-mechanism-for-real-time-object-detection-using-tdd-4ca2cfcd0a2f +""" +import json +from os import makedirs +from os.path import exists, join +from datetime import datetime + + +class JsonMeta(object): + HOURS = 3 + MINUTES = 59 + SECONDS = 59 + PATH_TO_SAVE = 'LOGS' + DEFAULT_FILE_NAME = 'remaining' + + +class BaseJsonLogger(object): + """ + This is the base class that returns __dict__ of its own + it also returns the dicts of objects in the attributes that are list instances + + """ + + def dic(self): + # returns dicts of objects + out = {} + for k, v in self.__dict__.items(): + if hasattr(v, 'dic'): + out[k] = v.dic() + elif isinstance(v, list): + out[k] = self.list(v) + else: + out[k] = v + return out + + @staticmethod + def list(values): + # applies the dic method on items in the list + return [v.dic() if hasattr(v, 'dic') 
else v for v in values]
+
+
+class Label(BaseJsonLogger):
+    """
+    For each bounding box there are various categories with confidences. Label class keeps track of that information.
+    """
+
+    def __init__(self, category: str, confidence: float):
+        self.category = category
+        self.confidence = confidence
+
+
+class Bbox(BaseJsonLogger):
+    """
+    Stores a single bounding box of a frame together with its candidate labels; used by JsonParser.
+    Attributes:
+        labels (list): List of Label objects.
+        top (int):
+        left (int):
+        width (int):
+        height (int):
+
+    Args:
+        bbox_id (int):
+        top (int):
+        left (int):
+        width (int):
+        height (int):
+
+    References:
+        Check Label module for better understanding.
+
+    """
+
+    def __init__(self, bbox_id, top, left, width, height):
+        self.labels = []
+        self.bbox_id = bbox_id
+        self.top = top
+        self.left = left
+        self.width = width
+        self.height = height
+
+    def add_label(self, category, confidence):
+        # appends a label; the top_k check is done by the caller via labels_full()
+        self.labels.append(Label(category, confidence))
+
+    def labels_full(self, value):
+        return len(self.labels) == value
+
+
+class Frame(BaseJsonLogger):
+    """
+    Stores the information of a single frame; used by JsonParser.
+    Attributes:
+        timestamp (float): The elapsed time of the captured frame
+        frame_id (int): The frame number of the captured video
+        bboxes (list of Bbox objects): Stores the list of bbox objects.
+
+    References:
+        Check Bbox class for better information
+
+    Args:
+        timestamp (float):
+        frame_id (int):
+
+    """
+
+    def __init__(self, frame_id: int, timestamp: float = None):
+        self.frame_id = frame_id
+        self.timestamp = timestamp
+        self.bboxes = []
+
+    def add_bbox(self, bbox_id: int, top: int, left: int, width: int, height: int):
+        bboxes_ids = [bbox.bbox_id for bbox in self.bboxes]
+        if bbox_id not in bboxes_ids:
+            self.bboxes.append(Bbox(bbox_id, top, left, width, height))
+        else:
+            raise ValueError("Frame with id: {} already has a Bbox with id: {}".format(self.frame_id, bbox_id))
+
+    def add_label_to_bbox(self, bbox_id: int, category: str, confidence: float):
+        bboxes = {bbox.bbox_id: bbox for bbox in self.bboxes}
+        if bbox_id in bboxes.keys():
+            res = bboxes.get(bbox_id)
+            res.add_label(category, confidence)
+        else:
+            raise ValueError('the bbox with id: {} does not exist!'.format(bbox_id))
+
+
+class BboxToJsonLogger(BaseJsonLogger):
+    """
+    This module is designed to automate the task of logging jsons. An example json is used
+    to briefly show the contents of the json file.
+    Example:
+        {
+            "video_details": {
+                "frame_width": 1920,
+                "frame_height": 1080,
+                "frame_rate": 20,
+                "video_name": "/home/gpu/codes/MSD/pedestrian_2/project/public/camera1.avi"
+            },
+            "frames": [
+                {
+                    "frame_id": 329,
+                    "timestamp": 3365.1254,
+                    "bboxes": [
+                        {
+                            "labels": [
+                                {
+                                    "category": "pedestrian",
+                                    "confidence": 0.9
+                                }
+                            ],
+                            "bbox_id": 0,
+                            "top": 1257,
+                            "left": 138,
+                            "width": 68,
+                            "height": 109
+                        }
+                    ]
+                }],
+
+    Attributes:
+        frames (dict): It's a dictionary that maps each frame_id to json attributes.
+        video_details (dict): information about the video file.
+        top_k_labels (int): shows the allowed number of labels
+        start_time (datetime object): we use it to automate the json output by time.
+ + Args: + top_k_labels (int): shows the allowed number of labels + + """ + + def __init__(self, top_k_labels: int = 1): + self.frames = {} + self.video_details = self.video_details = dict(frame_width=None, frame_height=None, frame_rate=None, + video_name=None) + self.top_k_labels = top_k_labels + self.start_time = datetime.now() + + def set_top_k(self, value): + self.top_k_labels = value + + def frame_exists(self, frame_id: int) -> bool: + """ + Args: + frame_id (int): + + Returns: + bool: true if frame_id is recognized + """ + return frame_id in self.frames.keys() + + def add_frame(self, frame_id: int, timestamp: float = None) -> None: + """ + Args: + frame_id (int): + timestamp (float): opencv captured frame time property + + Raises: + ValueError: if frame_id would not exist in class frames attribute + + Returns: + None + + """ + if not self.frame_exists(frame_id): + self.frames[frame_id] = Frame(frame_id, timestamp) + else: + raise ValueError("Frame id: {} already exists".format(frame_id)) + + def bbox_exists(self, frame_id: int, bbox_id: int) -> bool: + """ + Args: + frame_id: + bbox_id: + + Returns: + bool: if bbox exists in frame bboxes list + """ + bboxes = [] + if self.frame_exists(frame_id=frame_id): + bboxes = [bbox.bbox_id for bbox in self.frames[frame_id].bboxes] + return bbox_id in bboxes + + def find_bbox(self, frame_id: int, bbox_id: int): + """ + + Args: + frame_id: + bbox_id: + + Returns: + bbox_id (int): + + Raises: + ValueError: if bbox_id does not exist in the bbox list of specific frame. + """ + if not self.bbox_exists(frame_id, bbox_id): + raise ValueError("frame with id: {} does not contain bbox with id: {}".format(frame_id, bbox_id)) + bboxes = {bbox.bbox_id: bbox for bbox in self.frames[frame_id].bboxes} + return bboxes.get(bbox_id) + + def add_bbox_to_frame(self, frame_id: int, bbox_id: int, top: int, left: int, width: int, height: int) -> None: + """ + + Args: + frame_id (int): + bbox_id (int): + top (int): + left (int): + width (int): + height (int): + + Returns: + None + + Raises: + ValueError: if bbox_id already exist in frame information with frame_id + ValueError: if frame_id does not exist in frames attribute + """ + if self.frame_exists(frame_id): + frame = self.frames[frame_id] + if not self.bbox_exists(frame_id, bbox_id): + frame.add_bbox(bbox_id, top, left, width, height) + else: + raise ValueError( + "frame with frame_id: {} already contains the bbox with id: {} ".format(frame_id, bbox_id)) + else: + raise ValueError("frame with frame_id: {} does not exist".format(frame_id)) + + def add_label_to_bbox(self, frame_id: int, bbox_id: int, category: str, confidence: float): + """ + Args: + frame_id: + bbox_id: + category: + confidence: the confidence value returned from yolo detection + + Returns: + None + + Raises: + ValueError: if labels quota (top_k_labels) exceeds. 
+ """ + bbox = self.find_bbox(frame_id, bbox_id) + if not bbox.labels_full(self.top_k_labels): + bbox.add_label(category, confidence) + else: + raise ValueError("labels in frame_id: {}, bbox_id: {} is fulled".format(frame_id, bbox_id)) + + def add_video_details(self, frame_width: int = None, frame_height: int = None, frame_rate: int = None, + video_name: str = None): + self.video_details['frame_width'] = frame_width + self.video_details['frame_height'] = frame_height + self.video_details['frame_rate'] = frame_rate + self.video_details['video_name'] = video_name + + def output(self): + output = {'video_details': self.video_details} + result = list(self.frames.values()) + output['frames'] = [item.dic() for item in result] + return output + + def json_output(self, output_name): + """ + Args: + output_name: + + Returns: + None + + Notes: + It creates the json output with `output_name` name. + """ + if not output_name.endswith('.json'): + output_name += '.json' + with open(output_name, 'w') as file: + json.dump(self.output(), file) + file.close() + + def set_start(self): + self.start_time = datetime.now() + + def schedule_output_by_time(self, output_dir=JsonMeta.PATH_TO_SAVE, hours: int = 0, minutes: int = 0, + seconds: int = 60) -> None: + """ + Notes: + Creates folder and then periodically stores the jsons on that address. + + Args: + output_dir (str): the directory where output files will be stored + hours (int): + minutes (int): + seconds (int): + + Returns: + None + + """ + end = datetime.now() + interval = 0 + interval += abs(min([hours, JsonMeta.HOURS]) * 3600) + interval += abs(min([minutes, JsonMeta.MINUTES]) * 60) + interval += abs(min([seconds, JsonMeta.SECONDS])) + diff = (end - self.start_time).seconds + + if diff > interval: + output_name = self.start_time.strftime('%Y-%m-%d %H-%M-%S') + '.json' + if not exists(output_dir): + makedirs(output_dir) + output = join(output_dir, output_name) + self.json_output(output_name=output) + self.frames = {} + self.start_time = datetime.now() + + def schedule_output_by_frames(self, frames_quota, frame_counter, output_dir=JsonMeta.PATH_TO_SAVE): + """ + saves as the number of frames quota increases higher. + :param frames_quota: + :param frame_counter: + :param output_dir: + :return: + """ + pass + + def flush(self, output_dir): + """ + Notes: + We use this function to output jsons whenever possible. + like the time that we exit the while loop of opencv. + + Args: + output_dir: + + Returns: + None + + """ + filename = self.start_time.strftime('%Y-%m-%d %H-%M-%S') + '-remaining.json' + output = join(output_dir, filename) + self.json_output(output_name=output) diff --git a/utils/log.py b/utils/log.py new file mode 100644 index 0000000..5b8c940 --- /dev/null +++ b/utils/log.py @@ -0,0 +1,17 @@ +import logging + + +def get_logger(name='root'): + formatter = logging.Formatter( + # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') + fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + logger.addHandler(handler) + return logger + + diff --git a/utils/parser.py b/utils/parser.py new file mode 100644 index 0000000..27fcf50 --- /dev/null +++ b/utils/parser.py @@ -0,0 +1,38 @@ +import os +import yaml +from easydict import EasyDict as edict + +class YamlParser(edict): + """ + This is yaml parser based on EasyDict. 
+ """ + def __init__(self, cfg_dict=None, config_file=None): + if cfg_dict is None: + cfg_dict = {} + + if config_file is not None: + assert(os.path.isfile(config_file)) + with open(config_file, 'r') as fo: + cfg_dict.update(yaml.load(fo.read())) + + super(YamlParser, self).__init__(cfg_dict) + + + def merge_from_file(self, config_file): + with open(config_file, 'r') as fo: + self.update(yaml.load(fo.read())) + + + def merge_from_dict(self, config_dict): + self.update(config_dict) + + +def get_config(config_file=None): + return YamlParser(config_file=config_file) + + +if __name__ == "__main__": + cfg = YamlParser(config_file="../configs/yolov3.yaml") + cfg.merge_from_file("../configs/deep_sort.yaml") + + import ipdb; ipdb.set_trace() \ No newline at end of file diff --git a/utils/tools.py b/utils/tools.py new file mode 100644 index 0000000..965fb69 --- /dev/null +++ b/utils/tools.py @@ -0,0 +1,39 @@ +from functools import wraps +from time import time + + +def is_video(ext: str): + """ + Returns true if ext exists in + allowed_exts for video files. + + Args: + ext: + + Returns: + + """ + + allowed_exts = ('.mp4', '.webm', '.ogg', '.avi', '.wmv', '.mkv', '.3gp') + return any((ext.endswith(x) for x in allowed_exts)) + + +def tik_tok(func): + """ + keep track of time for each process. + Args: + func: + + Returns: + + """ + @wraps(func) + def _time_it(*args, **kwargs): + start = time() + try: + return func(*args, **kwargs) + finally: + end_ = time() + print("time: {:.03f}s, fps: {:.03f}".format(end_ - start, 1 / (end_ - start))) + + return _time_it diff --git a/yolov4_deepsort.py b/yolov4_deepsort.py new file mode 100644 index 0000000..20cadf1 --- /dev/null +++ b/yolov4_deepsort.py @@ -0,0 +1,141 @@ +import os +import cv2 +import time +import argparse +import torch +import warnings +import numpy as np + +from detector import build_detector,build_onnx +from deep_sort import build_tracker,build_tracker_car +from utils.draw import draw_boxes +from utils.parser import get_config +from utils.log import get_logger +from utils.io import write_results +# from threading import Thread +from dataset import LoadStreams +from detector.trt import tensorrt +import shutil + +class VideoTracker(object): + def __init__(self, cfg, args, video_path): + self.cfg = cfg + self.args = args + self.video_path = video_path + self.logger = get_logger("root") + self.cuda_ctx = None + + use_cuda = args.use_cuda and torch.cuda.is_available() + if not use_cuda: + warnings.warn("Running in cpu mode which maybe very slow!", UserWarning) + + if args.display: + cv2.namedWindow("test", cv2.WINDOW_NORMAL) + cv2.resizeWindow("test", args.display_width, args.display_height) + + if args.cam != -1: + print("Using webcam " + str(args.cam)) + self.datasets = LoadStreams(args.cam) + self.cap = cv2.VideoCapture(args.cam) + else: + self.datasets = LoadStreams(args.VIDEO_PATH) + self.cap = cv2.VideoCapture() + self.deepsort_person= build_tracker(cfg, use_cuda=use_cuda) + + def __enter__(self): #__enter__(self):当with开始运行的时候触发此方法的运行 + if isinstance(self.args.cam , int): + if self.args.cam != -1: + ret, frame = self.cap.read() + assert ret, "Error: Camera error" + self.im_width = frame.shape[0] + self.im_height = frame.shape[1] + else: + assert os.path.isfile(self.video_path), "Path error" + self.cap.open(self.video_path) + self.im_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + self.im_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + assert self.cap.isOpened() + elif isinstance(self.args.cam , str): + self.cap.open(self.args.cam) 
+            self.im_width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            self.im_height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            assert self.cap.isOpened()
+
+        if self.args.save_path:
+            os.makedirs(self.args.save_path, exist_ok=True)
+            # path of saved video and results
+            self.save_video_path = os.path.join(self.args.save_path, "results.avi")
+            self.save_results_path = os.path.join(self.args.save_path, "results.txt")
+            # create video writer
+            fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+            self.writer = cv2.VideoWriter(self.save_video_path, fourcc, 20, (self.im_width, self.im_height))
+            # logging
+            self.logger.info("Save results to {}".format(self.args.save_path))
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        if exc_type:
+            print(exc_type, exc_value, exc_traceback)
+
+    def run(self):
+        count_P = []
+        trt_person = tensorrt(self.cfg, [416, 416])
+
+        while self.cap.grab():
+            for _, im0s, _ in self.datasets:  # iterating the dataset calls its __next__ method
+                start = time.time()
+                # _, ori_im = self.cap.retrieve()
+                ori_im = im0s[0]
+                im = cv2.cvtColor(ori_im, cv2.COLOR_BGR2RGB)
+                # bbox_xywh, cls_conf, cls_ids = self.session.forward(ori_im, self.im_width, self.im_height)
+                # Important: the TensorRT context and buffers must all be created
+                # and used inside the same thread.
+                bbox_xywh, cls_conf, cls_ids = trt_person.detect(trt_person.context, trt_person.buffers, ori_im, self.im_width, self.im_height)
+                # select person class TODO
+                class_det_P = [0]
+                save_id = []
+                for i, id in enumerate(cls_ids):
+                    if id not in class_det_P:
+                        save_id.append(i)
+                # delete the non-person rows from the numpy arrays
+                bbox_xywh_P = np.delete(bbox_xywh, [save_id], axis=0)
+                cls_conf_P = np.delete(cls_conf, [save_id])
+                outputs_P, count_num_P, detection_id_P = self.deepsort_person.update(bbox_xywh_P, cls_conf_P, im, count_P)
+
+                # if len(outputs_P) > 0:
+                ori_im, track_num, detection_id = draw_boxes(ori_im, outputs_P, count_num_P, detection_id_P, Type='person')
+
+                end = time.time()
+                fps = 1 / (end - start)
+                cv2.putText(ori_im, "FPS: %.2f" % (fps), (int(1050), int(200)), 0, 10e-3 * 200, (0, 255, 0), 2)
+
+                if self.args.display:
+                    cv2.imshow("test", ori_im)
+                    write_img = cv2.resize(ori_im, (800, 600))
+                    cv2.waitKey(10)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--VIDEO_PATH", type=str, default='MOT16-03.mp4')
+    parser.add_argument("--config_detection", type=str, default="./configs/yolov4_trt.yaml")
+    parser.add_argument("--config_deepsort", type=str, default="./configs/deep_sort.yaml")
+    parser.add_argument("--ignore_display", dest="display", action="store_false", default=True)
+    parser.add_argument("--display", action="store_true", default=True)
+    parser.add_argument("--frame_interval", type=int, default=2)
+    parser.add_argument("--display_width", type=int, default=800)
+    parser.add_argument("--display_height", type=int, default=600)
+    parser.add_argument("--save_path", type=str, default="./output/")
+    parser.add_argument("--cpu", dest="use_cuda", action="store_false", default=True)
+    parser.add_argument("--camera", action="store", dest="cam", type=int, default=-1)
+    # parser.add_argument("--camera", action="store", dest="cam", type=str, default="rtsp://admin:abc12345@192.168.1.64/ch2/main/av_stream")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    cfg = get_config()
+    cfg.merge_from_file(args.config_detection)
+    cfg.merge_from_file(args.config_deepsort)
+
+    with VideoTracker(cfg, args, video_path=args.VIDEO_PATH) as vdo_trk:
+        vdo_trk.run()
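+
+# Example invocation (a sketch; the video file and the yolov4.engine referenced by
+# configs/yolov4_trt.yaml must exist, and the engine must match the local GPU and
+# TensorRT version):
+#   python yolov4_deepsort.py --VIDEO_PATH MOT16-03.mp4 --config_detection ./configs/yolov4_trt.yaml --save_path ./output/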