# calculate_optical_flow.py -- forked from jmccormac/pySceneNetRGBD
from PIL import Image
import math
import matplotlib.colors
import numpy as np
import os
import random
import scenenet_pb2 as sn
import sys
def normalize(v):
    return v / np.linalg.norm(v)


def load_depth_map_in_m(file_name):
    # Depth is stored in millimetres; convert to metres
    image = Image.open(file_name)
    pixel = np.array(image)
    return pixel * 0.001
def pixel_to_ray(pixel, vfov=45, hfov=60, pixel_width=320, pixel_height=240):
    x, y = pixel
    x_vect = math.tan(math.radians(hfov / 2.0)) * ((2.0 * ((x + 0.5) / pixel_width)) - 1.0)
    y_vect = math.tan(math.radians(vfov / 2.0)) * ((2.0 * ((y + 0.5) / pixel_height)) - 1.0)
    return (x_vect, y_vect, 1.0)
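
# A quick sanity check of the pinhole model above: pixel centres (x + 0.5)
# map linearly onto [-tan(hfov/2), tan(hfov/2)] on the z = 1 plane, so the
# image centre maps to the optical axis. With the default 320x240 settings:
#   pixel_to_ray((159.5, 119.5))  ->  (0.0, 0.0, 1.0)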
def normalised_pixel_to_ray_array(width=320, height=240):
    pixel_to_ray_array = np.zeros((height, width, 3))
    for y in range(height):
        for x in range(width):
            pixel_to_ray_array[y, x] = normalize(np.array(pixel_to_ray((x, y), pixel_height=height, pixel_width=width)))
    return pixel_to_ray_array
def points_in_camera_coords(depth_map, pixel_to_ray_array):
    assert depth_map.shape[0] == pixel_to_ray_array.shape[0]
    assert depth_map.shape[1] == pixel_to_ray_array.shape[1]
    assert len(depth_map.shape) == 2
    assert pixel_to_ray_array.shape[2] == 3
    camera_relative_xyz = np.ones((depth_map.shape[0], depth_map.shape[1], 4))
    for i in range(3):
        camera_relative_xyz[:, :, i] = depth_map * pixel_to_ray_array[:, :, i]
    return camera_relative_xyz
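
# The fourth (homogeneous) component is initialised to one and left untouched,
# so the resulting HxWx4 grid can be multiplied directly by the 4x4
# world/camera transform matrices below.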
def flatten_points(points):
    return points.reshape(-1, 4)


def reshape_points(height, width, points):
    other_dim = points.shape[1]
    return points.reshape(height, width, other_dim)


def transform_points(transform, points):
    assert points.shape[2] == 4
    height = points.shape[0]
    width = points.shape[1]
    points = flatten_points(points)
    return reshape_points(height, width, (transform.dot(points.T)).T)
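
# Note that world_to_camera_with_pose (below) and its inverse both have bottom
# row [0, 0, 0, 1], so the homogeneous w component stays 1 through
# transform_points and no perspective divide is needed afterwards.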
def world_to_camera_with_pose(view_pose):
    lookat_pose = position_to_np_array(view_pose.lookat)
    camera_pose = position_to_np_array(view_pose.camera)
    up = np.array([0, 1, 0])
    R = np.diag(np.ones(4))
    R[2, :3] = normalize(lookat_pose - camera_pose)
    R[0, :3] = normalize(np.cross(R[2, :3], up))
    R[1, :3] = -normalize(np.cross(R[0, :3], R[2, :3]))
    T = np.diag(np.ones(4))
    T[:3, 3] = -camera_pose
    return R.dot(T)


def camera_to_world_with_pose(view_pose):
    return np.linalg.inv(world_to_camera_with_pose(view_pose))
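
# The rows of R are the camera axes expressed in world coordinates: row 2 is
# the viewing direction (lookat - camera), row 0 the horizontal image axis
# (forward x up), and row 1 the vertical image axis, negated so that v grows
# downwards from the top-left pixel origin. T then translates the camera to
# the origin, so R.dot(T) maps world points into camera coordinates.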
def camera_point_to_uv_pixel_location(point, vfov=45, hfov=60, pixel_width=320, pixel_height=240):
    point = point / point[2]
    u = (pixel_width / 2.0) * ((point[0] / math.tan(math.radians(hfov / 2.0))) + 1)
    v = (pixel_height / 2.0) * ((point[1] / math.tan(math.radians(vfov / 2.0))) + 1)
    return (u, v)
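
# This is the inverse of pixel_to_ray: dividing by point[2] projects onto the
# z = 1 plane, and u = (W/2) * (x / tan(hfov/2) + 1) undoes the linear map
# from pixel coordinates to that plane (likewise for v).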
def position_to_np_array(position):
    return np.array([position.x, position.y, position.z])


def interpolate_poses(start_pose, end_pose, alpha):
    assert alpha >= 0.0
    assert alpha <= 1.0
    camera_pose = alpha * position_to_np_array(end_pose.camera)
    camera_pose += (1.0 - alpha) * position_to_np_array(start_pose.camera)
    lookat_pose = alpha * position_to_np_array(end_pose.lookat)
    lookat_pose += (1.0 - alpha) * position_to_np_array(start_pose.lookat)
    timestamp = alpha * end_pose.timestamp + (1.0 - alpha) * start_pose.timestamp
    pose = sn.Pose()
    pose.camera.x = camera_pose[0]
    pose.camera.y = camera_pose[1]
    pose.camera.z = camera_pose[2]
    pose.lookat.x = lookat_pose[0]
    pose.lookat.y = lookat_pose[1]
    pose.lookat.z = lookat_pose[2]
    pose.timestamp = timestamp
    return pose
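
# Linear interpolation of the camera and lookat positions (and timestamp):
# alpha = 0 gives the shutter-open pose, alpha = 1 the shutter-close pose.
# The ground-truth renders correspond to the midpoint, e.g.:
#   pose = interpolate_poses(view.shutter_open, view.shutter_close, 0.5)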
def world_point_to_uv_pixel_location_with_interpolated_camera(point, shutter_open, shutter_close, alpha):
    view_pose = interpolate_poses(shutter_open, shutter_close, alpha)
    wTc = world_to_camera_with_pose(view_pose)
    point_in_camera_coords = wTc.dot(np.array(point))
    uv = camera_point_to_uv_pixel_location(point_in_camera_coords)
    return uv
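
# The helper above also gives an independent check on the analytic derivatives
# in optical_flow below: a central finite difference of the projected pixel
# location over alpha should agree with the returned pixels-per-second values.
# A minimal sketch (finite_difference_flow is our name, not part of the
# original script), for a single homogeneous world point [x, y, z, 1]:
def finite_difference_flow(point, shutter_open, shutter_close,
                           alpha=0.5, eps=1e-4, shutter_time=(1.0 / 60)):
    u0, v0 = world_point_to_uv_pixel_location_with_interpolated_camera(
        point, shutter_open, shutter_close, alpha - eps)
    u1, v1 = world_point_to_uv_pixel_location_with_interpolated_camera(
        point, shutter_open, shutter_close, alpha + eps)
    # Central difference d(u, v)/d(alpha), rescaled to pixels per second
    return ((u1 - u0) / (2.0 * eps) / shutter_time,
            (v1 - v0) / (2.0 * eps) / shutter_time)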
# Expects:
# an Nx4 array of points of the form [[x,y,z,1],[x,y,z,1],...] in world coordinates
# the shutter_open and shutter_close poses, each holding length-three [x,y,z]
# camera and lookat positions in world coordinates
# Returns:
# an Nx2 array of the horizontal and vertical pixel location time derivatives
# (i.e. pixels per second in the horizontal and vertical directions)
# NOTE: pixel coordinates are defined with (0,0) in the top-left corner and
# (320,240) in the bottom-right corner
def optical_flow(points, shutter_open, shutter_close, alpha=0.5, shutter_time=(1.0/60),
                 hfov=60, pixel_width=320, vfov=45, pixel_height=240):
    # Alpha is the linear interpolation coefficient; 0.5 takes the derivative at the
    # midpoint, which is where the ground-truth renders are taken. The photo render
    # integrates by sampling over the whole shutter open-close trajectory.
    view_pose = interpolate_poses(shutter_open, shutter_close, alpha)
    wTc = world_to_camera_with_pose(view_pose)
    camera_pose = position_to_np_array(view_pose.camera)
    lookat_pose = position_to_np_array(view_pose.lookat)
    # Get camera pixel scale constants
    uk = (pixel_width / 2.0) * (1.0 / math.tan(math.radians(hfov / 2.0)))
    vk = (pixel_height / 2.0) * (1.0 / math.tan(math.radians(vfov / 2.0)))
    # Get basis vectors
    ub1 = lookat_pose - camera_pose
    b1 = normalize(ub1)
    ub2 = np.cross(b1, np.array([0, 1, 0]))
    b2 = normalize(ub2)
    ub3 = np.cross(b2, b1)
    b3 = -normalize(ub3)
    # Get camera pose alpha derivative
    camera_end = position_to_np_array(shutter_close.camera)
    camera_start = position_to_np_array(shutter_open.camera)
    lookat_end = position_to_np_array(shutter_close.lookat)
    lookat_start = position_to_np_array(shutter_open.lookat)
    dc_dalpha = camera_end - camera_start
    # Get basis vector derivatives
    # dub1 means d unnormalised b1; the normalisation derivative is
    # d(normalize(u))/du = (I - b b^T) / |u| evaluated at b = normalize(u)
    db1_dub1 = (np.eye(3) - np.outer(b1, b1)) / np.linalg.norm(ub1)
    dub1_dalpha = lookat_end - lookat_start - camera_end + camera_start
    db1_dalpha = db1_dub1.dot(dub1_dalpha)
    db2_dub2 = (np.eye(3) - np.outer(b2, b2)) / np.linalg.norm(ub2)
    # d(b1 x [0,1,0])/dalpha, using the fact that ub2 = (-b1_z, 0, b1_x)
    dub2_dalpha = np.array([-db1_dalpha[2], 0, db1_dalpha[0]])
    db2_dalpha = db2_dub2.dot(dub2_dalpha)
    db3_dub3 = (np.eye(3) - np.outer(b3, b3)) / np.linalg.norm(ub3)
    # Product rule for the cross product ub3 = b2 x b1, expanded componentwise
    dub3_dalpha = np.array([
        -(db2_dalpha[2] * b1[1] + db1_dalpha[1] * b2[2]),
        -(db2_dalpha[0] * b1[2] + db1_dalpha[2] * b2[0]) + (db2_dalpha[2] * b1[0] + db1_dalpha[0] * b2[2]),
        (db1_dalpha[1] * b2[0] + db2_dalpha[0] * b1[1])
    ])
    db3_dalpha = -db3_dub3.dot(dub3_dalpha)
    # Derivative of the rotated translation offset
    dt3_dalpha = np.array([
        -db2_dalpha.dot(camera_pose) - dc_dalpha.dot(b2),
        -db3_dalpha.dot(camera_pose) - dc_dalpha.dot(b3),
        -db1_dalpha.dot(camera_pose) - dc_dalpha.dot(b1),
    ])
    # Camera transform derivative (zeros, not empty: the bottom row is the
    # derivative of the constant [0, 0, 0, 1] row and must be zero)
    dT_dalpha = np.zeros((4, 4))
    dT_dalpha[0, :3] = db2_dalpha
    dT_dalpha[1, :3] = db3_dalpha
    dT_dalpha[2, :3] = db1_dalpha
    dT_dalpha[:3, 3] = dt3_dalpha
    # Calculate the alpha derivative of each 3D point in camera coordinates
    dpoint_dalpha = dT_dalpha.dot(points.T)
    point_in_camera_coords = wTc.dot(points.T)
    # Calculate pixel location alpha derivative via the quotient rule:
    # u = uk * x/z + W/2, so du/dalpha = uk * (x'z - z'x) / z^2 (likewise for v)
    du_dalpha = uk * (dpoint_dalpha[0] * point_in_camera_coords[2] - dpoint_dalpha[2] * point_in_camera_coords[0])
    dv_dalpha = vk * (dpoint_dalpha[1] * point_in_camera_coords[2] - dpoint_dalpha[2] * point_in_camera_coords[1])
    du_dalpha = du_dalpha / (point_in_camera_coords[2] * point_in_camera_coords[2])
    dv_dalpha = dv_dalpha / (point_in_camera_coords[2] * point_in_camera_coords[2])
    # Calculate pixel location time derivative
    du_dt = du_dalpha / shutter_time
    dv_dt = dv_dalpha / shutter_time
    return np.vstack((du_dt, dv_dt)).T
def flow_to_hsv_image(flow, magnitude_scale=1.0/100.0):
    height, width = flow.shape[0], flow.shape[1]
    hsv = np.empty((height, width, 3))
    for row in range(height):
        for col in range(width):
            v = flow[row, col, :]
            magnitude = np.linalg.norm(v)
            if magnitude < 1e-8:
                hsv[row, col, 0] = 0.0
                hsv[row, col, 1] = 0.0
                hsv[row, col, 2] = 0.0
            else:
                direction = v / magnitude
                theta = math.atan2(direction[1], direction[0])
                if theta <= 0:
                    theta += 2 * math.pi
                assert theta >= 0.0 and theta <= 2 * math.pi
                hsv[row, col, 0] = theta / (2 * math.pi)
                hsv[row, col, 1] = 1.0
                hsv[row, col, 2] = min(magnitude * magnitude_scale, 1.0)
    return hsv
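
# Flow direction is encoded as hue (angle around the colour wheel) and flow
# magnitude as value, saturating at 1/magnitude_scale pixels per second;
# zero-flow pixels are left black.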
data_root_path = 'data/val'
protobuf_path = 'data/scenenet_rgbd_val.pb'


def depth_path_from_view(render_path, view):
    depth_dir = os.path.join(render_path, 'depth')
    depth_path = os.path.join(depth_dir, '{0}.png'.format(view.frame_num))
    return os.path.join(data_root_path, depth_path)
if __name__ == '__main__':
    trajectories = sn.Trajectories()
    try:
        with open(protobuf_path, 'rb') as f:
            trajectories.ParseFromString(f.read())
    except IOError:
        print('Scenenet protobuf data not found at location:{0}'.format(protobuf_path))
        print('Please ensure you have copied the pb file to the data directory')
        sys.exit(1)
    traj = random.choice(trajectories.trajectories)
    # This stores, for each image pixel, the camera's normalised 3D ray vector
    cached_pixel_to_ray_array = normalised_pixel_to_ray_array()
    for idx, view in enumerate(traj.views):
        depth_path = depth_path_from_view(traj.render_path, view)
        optical_flow_path = 'optical_flow_{0}.png'.format(idx)
        print('Converting depth image:{0} and camera pose to optical flow image:{1}'.format(depth_path, optical_flow_path))
        depth_map = load_depth_map_in_m(str(depth_path))
        # When no depth information is available (because a ray went to
        # infinity outside of a window) depth is set to zero. For the purpose
        # of optical flow, here we simply set it to be very far away (1km)
        depth_map[depth_map == 0.0] = 1000.0
        # This is a 240x320x4 array, with each 'pixel' containing the
        # homogeneous 3D point in camera coordinates
        points_in_camera = points_in_camera_coords(depth_map, cached_pixel_to_ray_array)
        # Transform points from camera coordinates into world coordinates
        ground_truth_pose = interpolate_poses(view.shutter_open, view.shutter_close, 0.5)
        camera_to_world_matrix = camera_to_world_with_pose(ground_truth_pose)
        points_in_world = transform_points(camera_to_world_matrix, points_in_camera)
        # Calculate optical flow
        points_in_world = flatten_points(points_in_world)
        optical_flow_derivatives = optical_flow(points_in_world, view.shutter_open, view.shutter_close)
        optical_flow_derivatives = reshape_points(240, 320, optical_flow_derivatives)
        # Write out the HSV optical flow image using the matplotlib HSV colour wheel.
        hsv = flow_to_hsv_image(optical_flow_derivatives)
        rgb = matplotlib.colors.hsv_to_rgb(hsv)
        # scipy.misc.imsave was removed from SciPy, so save via PIL instead
        Image.fromarray((rgb * 255.0).astype(np.uint8)).save(optical_flow_path)