old_inference.py
from momaapi import MOMAAPI
import torch
import sys
from mmpt.models import MMPTModel, MMPTClassifier
import skvideo.io
import numpy as np
import yaml
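

# Read a clip and shape it the way VideoCLIP expects: skvideo decodes video
# to an (L, H, W, C) uint8 array, while the model takes a (B, T, FPS, H, W, C)
# float tensor in [0, 1] (VideoCLIP is trained on 30 fps input to S3D). The
# helper name and keyword defaults are local choices; the 30-fps and
# 240-frame values match the preprocessing used throughout this script.
def load_video_frames(path, fps=30, max_frames=240):
    videodata = skvideo.io.vread(path)
    L, H, W, C = videodata.shape
    # Drop leading frames so the clip length is a whole number of seconds
    if L % fps != 0:
        videodata = videodata[L % fps:]
    L = len(videodata)
    # Keep at most the middle max_frames frames (8 seconds at 30 fps)
    if L > max_frames:
        videodata = videodata[L // 2 - max_frames // 2:L // 2 + max_frames // 2]
    # (B=1, T=seconds, FPS, H, W, C), scaled to [0, 1]
    videodata = np.reshape(videodata, (1, -1, fps, H, W, C))
    return torch.from_numpy(videodata / 255.0).cuda().float()
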
def predict(moma, moma_acts, act_type):
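    """Zero-shot MOMA activity classification with VideoCLIP.

    Embeds each activity class name with the text encoder, scores example
    clips against every class name, and prints per-class accuracy.
    """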
    # Load the pretrained VideoCLIP model, tokenizer, and aligner from the
    # MMPT retrieval config
model, tokenizer, aligner = MMPTModel.from_pretrained("projects/retri/videoclip/how2.yaml")
model.eval().to('cuda')
#classifier = MMPTClassifier(model, tokenizer, aligner)
#classifier.set_class_names(moma_acts)
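    # Preliminary pass over the first two classes: embed each class name and
    # one clip, and run the model once per pair (outputs unused by the loop below)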
act = moma_acts[0]
act2 = moma_acts[1]
    # Build token ids and an attention mask for the first class name
caps1, cmasks1 = aligner._build_text_seq(
tokenizer(act, add_special_tokens=False)["input_ids"])
caps1, cmasks1 = caps1[None, :].cuda(), cmasks1[None, :].cuda() # bsz=1
    # Same for the second class name
caps2, cmasks2 = aligner._build_text_seq(
tokenizer(act2, add_special_tokens=False)["input_ids"])
caps2, cmasks2 = caps2[None, :].cuda(), cmasks2[None, :].cuda() # bsz=1
# Create video input
    num_examples = 2  # Need two example clips below (paths[0] and paths[1])
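    # Look up MOMA activity-instance ids for this class name, then resolve
    # the corresponding video file paths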
    act_ids = moma.get_ids_act(cnames_act=[act])
paths = moma.get_paths(ids_act=act_ids[:num_examples])
path = paths[0]
    video_frames = load_video_frames(path)
path2 = paths[1]
    video_frames2 = load_video_frames(path2)
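    # With return_score=False the model returns pooled video and text
    # embeddings rather than a similarity score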
with torch.no_grad():
output1 = model(video_frames, caps1, cmasks1, return_score=False)
        output2 = model(video_frames2, caps2, cmasks2, return_score=False)
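    # Zero-shot evaluation: for each class, score example clips against every
    # class name and count a hit when the true class scores highest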
for correct_idx, activity in enumerate(moma_acts):
print("Category:", activity)
        act_ids = moma.get_ids_act(cnames_act=[activity])
        print("Number in category:", len(act_ids))
num_examples = 10
num_correct = 0
paths = moma.get_paths(ids_act=act_ids[:num_examples])
print("PATH LENGTH:", len(paths))
model.eval().to('cuda')
for path in paths:
            video_frames = load_video_frames(path)
            # Score the clip against every candidate class name
            text_to_try = moma_acts
scores = []
for text in text_to_try:
caps, cmasks = aligner._build_text_seq(
tokenizer(text, add_special_tokens=False)["input_ids"]
)
caps, cmasks = caps[None, :].cuda(), cmasks[None, :].cuda() # bsz=1
with torch.no_grad():
                    # return_score=True yields the video-text similarity score
output = model(video_frames, caps, cmasks, return_score=True)
#print("Text:", "'" + text + "'", "score:", output["score"].item()) # dot-product
scores.append(output["score"].item())
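            # Predict the class whose text best matches the clip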
pred = np.argmax(scores)
if pred == correct_idx:
num_correct += 1
print("Predicted class", moma_acts[pred])
print("Accuracy for class", activity, num_correct / num_examples)
def main():
if len(sys.argv) == 1:
print("ERROR: inference.py has required argument [config file]")
return
with open(sys.argv[1]) as file:
data = yaml.full_load(file)
print(data)
try:
act_type = data['activity_type']
moma_acts = data['class_names']
    except KeyError:
        print("YAML config file requires fields [activity_type (str)] and [class_names (list:str)]")
        return
dir_moma = '../../data/moma'
moma = MOMAAPI(dir_moma)
predict(moma, moma_acts, act_type)
# Takes argument [YAML config file] (see configs/activity.yaml)
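# A minimal example config; the field names come from main() above and the
# class names are placeholders:
#
#   activity_type: "act"
#   class_names:
#     - "basketball game"
#     - "dining service"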
if __name__ == '__main__':
main()