-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmodel.py
85 lines (62 loc) · 3.54 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Copyright 2020 UMONS-Numediart-USHERBROOKE-Necotis.
#
# MAFNet of University of Mons and University of Sherbrooke – Mathilde Brousmiche is free software : you can redistribute it
# and/or modify it under the terms of the Lesser GNU General Public License as published by the Free Software Foundation,
# either version 3 of the License, or any later version. This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the Lesser GNU General Public License for more details.
# You should have received a copy of the Lesser GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
# Each use of this software must be attributed to University of MONS – Numédiart Institute and to University of SHERBROOKE - Necotis Lab (Mathilde Brousmiche).
import tensorflow as tf
import layers
class MAFnet:
    """Multimodal Attention Fusion network (TensorFlow 1.x graph mode).

    Fuses a visual stream and an audio stream for classification:
      1. A FiLM generator conditioned on the image features modulates the
         sound feature map (feature-wise affine transform).
      2. Each modality is projected to ``n_hidden`` units per time step.
      3. A modality & temporal attention module merges the two streams.
      4. A final dense layer produces class logits.

    NOTE(review): this relies on ``tf.placeholder`` and must run under
    TF1 (or ``tf.compat.v1`` with eager execution disabled).

    Attributes set by the constructor:
        logits  -- pre-softmax class scores
        pred    -- softmax probabilities
        loss    -- mean cross-entropy against ``target``
        acc     -- mean classification accuracy
        alpha   -- attention weights (set inside ``network``)
    """

    # Create model
    def __init__(self, image, sound, target, x_image_shape, x_sound_shape, n_hidden, n_classes):
        """Build the full graph: network, loss and accuracy ops.

        Args:
            image: image feature tensor (flattened to ``x_image_shape`` per step).
            sound: sound feature tensor (reshaped to ``(-1, 12, 8, x_sound_shape)``).
            target: one-hot class labels.
            x_image_shape: per-step image feature dimensionality.
            x_sound_shape: sound feature-map channel count.
            n_hidden: hidden size used throughout the network.
            n_classes: number of output classes.
        """
        self.x_image_shape = x_image_shape
        self.x_sound_shape = x_sound_shape
        self.n_hidden = n_hidden
        self.n_classes = n_classes
        self.xImage = image
        self.xSound = sound
        self.target = target
        # Scalar bool placeholder toggling batch-norm train/inference behavior.
        self.isTraining = tf.placeholder(tf.bool, shape=[], name="isTraining")
        # network
        self.logits, self.pred = self.network()
        # loss
        self.loss = self.loss_crossentropy()
        # accuracy
        self.acc = self.accuracy()

    def network(self):
        """Build the fusion network; returns ``(logits, softmax_pred)``."""
        # FiLM layer: image features generate gamma/beta that modulate the
        # sound feature map (12x8 spatial grid -- presumably a spectrogram
        # patch layout; TODO confirm against the data pipeline).
        x_image = tf.reshape(self.xImage, (-1, self.x_image_shape))
        x_sound = tf.reshape(self.xSound, (-1, 12, 8, self.x_sound_shape))
        gamma_from_image, beta_from_image = layers.FILM_generator(x_image, self.n_hidden, name="FILM_for_sound")
        x_sound = layers.FILMed_block(x_sound, self.n_hidden, gamma_from_image, beta_from_image,
                                      self.isTraining, name='FILMed_block_sound')
        # Global average pool over the 12x8 spatial grid.
        x_sound = tf.reduce_mean(x_sound, [1, 2])
        # Regroup into sequences: 10 time steps of 1920-dim image features
        # and 512-dim pooled sound features (hard-coded; assumes
        # x_image_shape == 1920 and n_hidden == 512 -- TODO confirm).
        x_image = tf.reshape(x_image, (-1, 10, 1920))
        x_sound = tf.reshape(x_sound, (-1, 10, 512))
        # Dense image: per-time-step projection + BN + ReLU
        x_image = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(self.n_hidden, activation='linear', name='image_fc_1'))(x_image)
        x_image = layers.BN_layer(x_image, self.isTraining, name="image_BN_1")
        x_image = tf.nn.relu(x_image)
        # Dense sound: per-time-step projection + BN + ReLU
        x_sound = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(self.n_hidden, activation='linear', name='sound_fc_1'))(x_sound)
        x_sound = layers.BN_layer(x_sound, self.isTraining, name="sound_BN_1")
        x_sound = tf.nn.relu(x_sound)
        # Modality & temporal Attention Module (2 modalities)
        x, self.alpha = layers.modality_temporal_attention(x_image, x_sound, self.n_hidden, 2, self.n_hidden)
        # Dense multimodal classifier head.
        # FIX: use_bias was the *string* 'True' (truthy by accident, and a
        # trap: the string 'False' would also enable the bias). Use the bool.
        x = tf.keras.layers.Dense(self.n_classes, activation='linear', use_bias=True, name='multimodal_fc_1')(x)
        x = layers.BN_layer(x, self.isTraining, name="multimodal_BN_1")
        pred = tf.nn.softmax(x)
        return x, pred

    def loss_crossentropy(self):
        """Mean softmax cross-entropy between logits and one-hot targets."""
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target))
        return loss

    def accuracy(self):
        """Fraction of samples whose argmax prediction matches the target."""
        correct_prediction = tf.equal(tf.argmax(self.pred, -1), tf.argmax(self.target, -1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        return accuracy