# object_detector.py
import random
import time
from math import tan, pi

import cv2
import numpy as np
import torch
from numpy.linalg import norm


###########################
# VISUALIZATION FUNCTIONS #
###########################
def draw_line(frame, p1, p2, color, thickness=3):
    r"""Draws a line on the image."""
    frame = cv2.line(frame,
                     tuple(p1[:2].astype(int)),
                     tuple(p2[:2].astype(int)),
                     color,
                     thickness)
    return frame
def draw_point(blob_frame, p, color=(0, 255, 0)):
    r"""Draws a point on the image."""
    cv2.circle(blob_frame, tuple(p[:2].astype(int)),
               3, color, -1)
def draw(frame, imgpts):
    """Draws a cube on the image.

    param np.array imgpts: 8 image points representing the cube
    """
    imgpts = np.int32(imgpts).reshape(-1, 2)
    # draw the ground floor in black (negative thickness = filled)
    frame = cv2.drawContours(frame, [imgpts[:4]], -1, (0, 0, 0), -3)
    # draw the pillars in blue
    for i, j in zip(range(4), range(4, 8)):
        frame = cv2.line(frame, tuple(imgpts[i]),
                         tuple(imgpts[j]), (255, 0, 0), 1)
    # draw the top layer in red
    frame = cv2.drawContours(frame, [imgpts[4:]], -1, (0, 0, 255), 1)
    return frame
class ObjectDetector:

    def __init__(self, hfov=70.42, vfov=43.3, pixel_size=None, focal_mm=3.67):
        r"""Initializes the ObjectDetector with sensor information.

        Either hfov and vfov must be given, or pixel_size together with
        the focal length, so that the camera matrix can be initialized
        in a later step.

        Parameters
        ----------
        hfov : float
            horizontal field of view in degrees
        vfov : float
            vertical field of view in degrees
        pixel_size : float
            the pixel size in mm
        focal_mm : float
            focal length in mm
        """
        self.hfov = hfov
        self.vfov = vfov
        self.pixel_size = pixel_size
        self.focal_mm = focal_mm
        # Model: the AutoShape wrapper accepts PIL/cv2/np inputs and runs NMS
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).autoshape()
        self.model.to(self.device)
        self.names = self.model.module.names if hasattr(self.model, 'module') else self.model.names
        self.colors = [[random.randint(0, 255) for _ in range(3)] for _ in self.names]
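        # NOTE: newer ultralytics/yolov5 hub revisions already return an
        # AutoShape-wrapped model, in which case the explicit .autoshape()
        # call above may fail or be redundant (an assumption, not verified
        # against the hub version this was written for):
        #   self.model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)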
    ############################
    # INITIALIZATION FUNCTIONS #
    ############################

    def get_blob_board(self):
        r"""Initializes the blob board parameters.

        Returns the board's 3D object points as a
        (column_count * row_count, 3) array; all points lie in the
        board's Z=0 plane.
        """
        self.column_count = 6
        self.row_count = 4
        spacing = 40
        # circle_diameter = 30
        # board_width = (column_count - 1) * spacing
        # board_height = (row_count - 1) * spacing
        blob_board = np.zeros((self.column_count * self.row_count, 3))
        idx = 0
        for column in range(self.column_count):
            for row in range(self.row_count):
                x = column * spacing
                y = row * spacing
                blob_board[idx] = (x, y, 0)  # TODO check
                idx = idx + 1
        return blob_board
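    # Layout sketch: with 6 columns x 4 rows at a spacing of 40 (board units,
    # presumably mm), index 0 maps to (0, 0, 0) and index 23 to (200, 120, 0).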
    def get_blob_detector(self):
        r"""Initializes a cv2.SimpleBlobDetector for the board dots."""
        # Set up SimpleBlobDetector parameters.
        blobParams = cv2.SimpleBlobDetector_Params()
        # Change thresholds
        blobParams.minThreshold = 8
        blobParams.maxThreshold = 255
        # Filter by area
        blobParams.filterByArea = True
        blobParams.minArea = 10
        blobParams.maxArea = 100
        # Filter by circularity
        blobParams.filterByCircularity = True
        blobParams.minCircularity = 0.8
        # Filter by convexity
        blobParams.filterByConvexity = True
        blobParams.minConvexity = 0.9
        # Filter by inertia
        blobParams.filterByInertia = True
        blobParams.minInertiaRatio = 0.4
        # Create a detector with the parameters
        blobDetector = cv2.SimpleBlobDetector_create(blobParams)
        return blobDetector
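    # The area limits above are in squared pixels and implicitly tuned to the
    # calibration image resolution; at other resolutions the board dots may
    # fall outside [10, 100] px^2 (an assumption about the intended setup).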
    def init_camera_matrix(self, shape):
        r"""Calculates the camera matrix.

        Based on the frame shape and the sensor parameters, the camera
        matrix is calculated as a numpy array:

            | f_x, 0,   c_x |
            | 0,   f_y, c_y |
            | 0,   0,   1   |

        Parameters
        ----------
        shape : tuple
            the frame shape as (height, width)

        Returns
        -------
        camera_matrix
            the calculated camera matrix
        """
        height = shape[0]
        width = shape[1]
        c_x = width / 2
        c_y = height / 2
        if self.hfov and self.vfov:
            # From the field of view
            f_x = c_x / tan(self.hfov * 0.5 * pi / 180)
            f_y = c_y / tan(self.vfov * 0.5 * pi / 180)
        elif self.pixel_size:
            # From the focal length and pixel size: the focal length in
            # pixels is the focal length in mm divided by the pixel pitch
            f_x = self.focal_mm / self.pixel_size
            f_y = self.focal_mm / self.pixel_size
        else:
            raise ValueError('either hfov/vfov or pixel_size must be set')
        self.camera_matrix = np.array([[f_x, 0, c_x],
                                       [0, f_y, c_y],
                                       [0, 0, 1]])
        return self.camera_matrix
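    # Sanity check for the field-of-view branch (a sketch, assuming a
    # 1920x1080 frame and the default hfov of 70.42 degrees):
    #   f_x = 960 / tan(70.42 / 2 * pi / 180) ~= 1360 px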
    def init_camera_parameters(self, frame, viz=True):
        r"""Detects the blob board in a frame and solves for the camera pose."""
        blob_board = self.get_blob_board()
        blob_detector = self.get_blob_detector()
        blob_frame = None
        self.init_camera_matrix(frame.shape)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # newer OpenCV versions return a tuple, but filter_keypoints mutates
        # the sequence in place, so convert to a list first
        keypoints = list(blob_detector.detect(gray))
        if viz:
            blob_frame = cv2.drawKeypoints(
                frame, keypoints,
                np.array([]), (255, 0, 0),
                cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
        # find the board with a custom algorithm
        self.filter_keypoints(keypoints)
        # visualize the board
        if viz:
            blob_frame = cv2.drawKeypoints(
                blob_frame, keypoints,
                np.array([]), (0, 0, 255),
                cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
            cv2.imwrite('res/blobs.png', blob_frame)
        # extract image points from the keypoints
        image_points = np.array([
            [keypoints[idx].pt[0], keypoints[idx].pt[1]]
            for idx in range(self.column_count * self.row_count)])
        # sort image points by y, then x
        sorted_indexes = np.lexsort((image_points[:, 0], image_points[:, 1]))
        image_points = image_points[sorted_indexes]
        # find matching indices for blob and board corners
        # TODO: check out flags = cv2.SOLVEPNP_IPPE_SQUARE | cv2.SOLVEPNP_IPPE
        # TODO: optimize corner detection
        board_corners = np.array(
            [blob_board[i] for i in [0, 20, 23, 3]])
        image_point_corners = np.array(
            [image_points[i] for i in [6, 0, 17, 23]])
        # pass the PnP method as the flags keyword; positionally it would be
        # interpreted as the rvec output argument
        retval, rvec, tvec = cv2.solvePnP(
            board_corners, image_point_corners,
            self.camera_matrix, None,
            flags=cv2.SOLVEPNP_IPPE)
        if not retval:
            raise RuntimeError('could not solve the 2D-3D correspondence')
        if viz:
            axis = np.float32(
                [[0, 0, 0],                   # origin
                 [40 * 5, 0, 0],              # x axis
                 [40 * 5, 40 * 3, 0],
                 [0, 40 * 3, 0],              # y axis
                 [0, 0, -40 * 3],
                 [40 * 5, 0, -40 * 3],
                 [40 * 5, 40 * 3, -40 * 3],
                 [0, 40 * 3, -40 * 3]]).reshape(-1, 3)
            projected_points, _ = cv2.projectPoints(
                axis, rvec, tvec, self.camera_matrix, None)
            blob_frame = draw(blob_frame, projected_points)
            draw_point(blob_frame, image_point_corners[0], (255, 0, 255))
            draw_point(blob_frame, image_point_corners[1], (255, 255, 255))
            draw_point(blob_frame, image_point_corners[2], (0, 255, 255))
            draw_point(blob_frame, image_point_corners[3], (0, 0, 0))
            draw_point(blob_frame, projected_points[0][0], (255, 0, 255))
            draw_point(blob_frame, projected_points[1][0], (255, 255, 255))
            draw_point(blob_frame, projected_points[2][0], (0, 255, 255))
            draw_point(blob_frame, projected_points[3][0], (0, 0, 0))
            draw_point(blob_frame,
                       np.mean(image_point_corners, axis=0), (0, 0, 255))
            font = cv2.FONT_HERSHEY_SIMPLEX
            for idx, point in enumerate(image_points):
                cv2.putText(blob_frame, f'{idx}',
                            tuple(point.astype(int)),
                            font, 0.3, (255, 255, 255), 1, cv2.LINE_AA)
        self.rvec = rvec
        self.tvec = tvec
        self.rot_cam_chessboard = cv2.Rodrigues(rvec)[0]
        self.rot_chessboard_cam = np.transpose(self.rot_cam_chessboard)
        self.t_cam_chessboard = tvec
        self.pos_cam_chessboard = np.matmul(
            -self.rot_chessboard_cam, self.t_cam_chessboard)
        return blob_frame
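    # For reference: with rvec/tvec mapping board coordinates into the camera
    # frame, the camera center expressed in board coordinates is -R^T @ t,
    # which is what pos_cam_chessboard stores.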
    ###################
    # BOARD DETECTION #
    ###################

    def filter_keypoints(self, keypoints):
        r"""Discards keypoints until only board keypoints are left.

        Iteratively computes the centroid of all remaining keypoints and
        removes the keypoint farthest from it, until exactly
        row_count * column_count keypoints remain.
        """
        x_coords = [p.pt[0] for p in keypoints]
        y_coords = [p.pt[1] for p in keypoints]
        while len(keypoints) > self.row_count * self.column_count:
            _len = len(keypoints)
            centroid_x = sum(x_coords) / _len
            centroid_y = sum(y_coords) / _len
            index = np.argmax([(x_coords[i] - centroid_x) ** 2 +
                               (y_coords[i] - centroid_y) ** 2
                               for i in range(_len)])
            keypoints.pop(index)
            x_coords.pop(index)
            y_coords.pop(index)
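    # Note: recomputing the centroid and a full distance scan per removal is
    # O(n^2) in the number of detected keypoints, which is cheap for the few
    # dozen blobs expected here.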
    ########################
    # FIND OBJECT POSITION #
    ########################

    def get_intersection(self, ray_origin, ray_dir, plane, plane_d):
        r"""Returns the 3D intersection point of a ray and a plane.

        The plane is given by its normal vector `plane` and offset
        `plane_d`, i.e. all points p with dot(plane, p) + plane_d = 0.
        Returns None if the ray is parallel to the plane.
        """
        plane_dot_ray = plane[0] * ray_dir[0] + \
            plane[1] * ray_dir[1] + plane[2] * ray_dir[2]
        if abs(plane_dot_ray) > 0:
            plane_dot_ray_origin = ray_origin[0] * plane[0] + \
                ray_origin[1] * plane[1] + ray_origin[2] * plane[2] + plane_d
            return ray_origin - ray_dir * (plane_dot_ray_origin / plane_dot_ray)
        return None
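    # Parametric form for reference, using the convention above:
    #   P(t) = origin + t * dir,  with  t = -(dot(n, origin) + d) / dot(n, dir)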
    def to_coordinate_plane(self, image_point):
        r"""Maps an image point to a coordinate on the board plane."""
        # calculate the 3d direction of the ray in the camera coordinate frame
        image_point_norm = cv2.undistortPoints(
            image_point, self.camera_matrix, None)[0][0]
        ray_dir_cam = np.array([image_point_norm[0], image_point_norm[1], 1])
        # map the ray direction vector from camera coordinates
        # to chessboard coordinates
        ray_dir_chessboard = np.matmul(self.rot_chessboard_cam, ray_dir_cam)
        # Find the desired 3d point by intersecting the ray with the
        # chessboard plane Z=0. Expressed in the chessboard frame, the ray
        # originates at the camera center 'pos_cam_chessboard' with
        # direction 'ray_dir_chessboard'; any point on it can be written
        # parametrically via its depth 'd':
        #   P(d) = pos_cam_chessboard + d * ray_dir_chessboard
        # The intersection is the depth 'd' at which the Z coordinate of
        # P(d) equals zero.
        d_intersection = -self.pos_cam_chessboard[2] / ray_dir_chessboard[2]
        intersection_point = self.pos_cam_chessboard.T[0] + d_intersection[0] * ray_dir_chessboard
        return intersection_point
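    # Example (a sketch): the principal point (c_x, c_y) normalizes to the
    # camera ray (0, 0, 1), so to_coordinate_plane applied to it returns the
    # point where the optical axis pierces the board's Z=0 plane.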
    def estimate_pose(self, frame, viz=True):
        r"""Detects persons in a frame and estimates their board-plane
        position, height and width."""
        if viz:
            tl = 2  # line thickness
            tf = 1  # font thickness
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.model(frame_rgb, 320 + 32 * 4)  # inference at 448 px, includes NMS
        person_coordinates = []
        labels = []
        points = results.xyxy[0].cpu().numpy()
        for x in points:
            label = self.names[int(x[5])]
            if label == 'person':
                # bounding box corners: a=top-left, b=top-right,
                # c=bottom-right, d=bottom-left
                a = np.array([x[0], x[1]])
                b = np.array([x[2], x[1]])
                c = np.array([x[2], x[3]])
                d = np.array([x[0], x[3]])
                labels.append(label)
                person_coordinates.append([a, b, c, d])
                if viz:
                    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
                    color = self.colors[int(x[5])]
                    draw_point(frame, x[2:], color)
                    cv2.rectangle(frame, c1, c2, color, thickness=tl,
                                  lineType=cv2.LINE_AA)
                    t_size = cv2.getTextSize(label, 0, fontScale=tl / 3,
                                             thickness=tf)[0]
                    c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
                    cv2.rectangle(frame, c1, c2, (0, 0, 0), -1,
                                  cv2.LINE_AA)  # filled
                    cv2.putText(frame, label, (c1[0], c1[1] - 2), 0, tl / 3,
                                [225, 255, 255], thickness=tf,
                                lineType=cv2.LINE_AA)
        person_coordinates = np.array(person_coordinates)
        positions = []
        heights = []
        widths = []
        for person_coordinate in person_coordinates:
            # project the two bottom corners onto the board plane
            point0 = self.to_coordinate_plane(person_coordinate[3])
            point1 = self.to_coordinate_plane(person_coordinate[2])
            pos = (point0 + point1) / 2
            if viz:
                points, _ = cv2.projectPoints(point0,
                                              self.rvec, self.tvec,
                                              self.camera_matrix, None)
                draw_point(frame, points[0][0], (255, 255, 255))
                points, _ = cv2.projectPoints(point1,
                                              self.rvec, self.tvec,
                                              self.camera_matrix, None)
                draw_point(frame, points[0][0], (255, 255, 255))
            bounding_span = point1 - point0
            up_vector = np.array([0, 0, -1])
            # the width is the distance between the two bottom points
            width = norm(bounding_span)
            bounding_span = bounding_span / width
            # normal vector of the person's upright plane, as a unit vector
            plane_normal = np.cross(bounding_span, up_vector)
            plane_normal = plane_normal / norm(plane_normal)
            plane_d = -np.dot(plane_normal, point1)
            # cast a ray through the top-center of the bounding box and
            # intersect it with the person's plane to find the top point
            plane_point = (person_coordinate[0] + person_coordinate[1]) / 2
            plane_norm_dir = cv2.undistortPoints(
                plane_point, self.camera_matrix, None)[0][0]
            ray_dir_cam = np.array([plane_norm_dir[0], plane_norm_dir[1], 1])
            ray_dir_cam = ray_dir_cam / norm(ray_dir_cam)
            ray_dir_chessboard = np.matmul(
                self.rot_chessboard_cam, ray_dir_cam)
            ray_origin = self.pos_cam_chessboard.T[0]
            point1 = self.get_intersection(ray_origin, ray_dir_chessboard,
                                           plane_normal, plane_d)
            height = point0[2] - point1[2]
            if viz:
                points, _ = cv2.projectPoints(point1, self.rvec, self.tvec,
                                              self.camera_matrix, None)
                draw_point(frame, points[0][0], (0, 255, 255))
            print(f'height={height}, width={width}, pos={pos}')
            positions.append(pos)
            heights.append(height)
            widths.append(width)
        return frame, labels, positions, heights, widths
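    # The returned heights/widths are expressed in the blob board's units
    # (40-unit dot spacing, presumably mm); height is measured along the
    # board's -Z axis, the up_vector used above.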
def main():
    object_detector = ObjectDetector()
    # calibrate on a full-resolution still image
    frame = cv2.imread('data/calib-and-test/frame_1920x1080.jpg')
    blob_frame = object_detector.init_camera_parameters(frame, viz=True)
    cv2.imwrite('res/blobs.jpg', blob_frame)
    width = 640
    height = 360
    # re-initialize the camera matrix for the new video resolution
    object_detector.init_camera_matrix((height, width))
    cap = cv2.VideoCapture('data/calib-and-test/vid_640x360.mp4')
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter('res/data.mp4', fourcc, fps, (width, height))
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        then = time.time()
        frame, labels, pos, heights, widths = object_detector.estimate_pose(frame, viz=True)
        print(labels, pos, heights)
        print('estimation took %.2f ms' % ((time.time() - then) * 1000))
        out.write(frame)
        cv2.imshow('frame', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    # release everything when the job is finished
    cap.release()
    out.release()


if __name__ == '__main__':
    main()