# model.py (forked from openvinotoolkit/openvino_notebooks)
import torch
import torch.nn as nn
from diffusers import AutoencoderKL, StableDiffusionPipeline, UNet2DConditionModel
from transformers import CLIPTextModel
import torch.nn.functional as F
from typing import Optional, Tuple, Union
from transformers.modeling_outputs import BaseModelOutputWithPooling
from transformers.models.clip.modeling_clip import (
    CLIPPreTrainedModel,
    CLIPModel,
)
import types
import torchvision.transforms as T
import gc


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len
    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask
    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for uni-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)
    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


class MLP(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_dim, use_residual=True):
        super().__init__()
        if use_residual:
            assert in_dim == out_dim
        self.layernorm = nn.LayerNorm(in_dim)
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.use_residual = use_residual
        self.act_fn = nn.GELU()

    def forward(self, x):
        residual = x
        x = self.layernorm(x)
        x = self.fc1(x)
        x = self.act_fn(x)
        x = self.fc2(x)
        if self.use_residual:
            x = x + residual
        return x


class FastComposerCLIPImageEncoder(CLIPPreTrainedModel):
    @staticmethod
    def from_pretrained(
        global_model_name_or_path,
    ):
        model = CLIPModel.from_pretrained(global_model_name_or_path)
        vision_model = model.vision_model
        visual_projection = model.visual_projection
        vision_processor = T.Normalize(
            (0.48145466, 0.4578275, 0.40821073),
            (0.26862954, 0.26130258, 0.27577711),
        )
        return FastComposerCLIPImageEncoder(
            vision_model,
            visual_projection,
            vision_processor,
        )

    def __init__(
        self,
        vision_model,
        visual_projection,
        vision_processor,
    ):
        super().__init__(vision_model.config)
        self.vision_model = vision_model
        self.visual_projection = visual_projection
        self.vision_processor = vision_processor
        self.image_size = vision_model.config.image_size

    def forward(self, object_pixel_values):
        b, num_objects, c, h, w = object_pixel_values.shape
        object_pixel_values = object_pixel_values.view(b * num_objects, c, h, w)
        if h != self.image_size or w != self.image_size:
            h, w = self.image_size, self.image_size
            object_pixel_values = F.interpolate(
                object_pixel_values, (h, w), mode="bilinear", antialias=True
            )
        object_pixel_values = self.vision_processor(object_pixel_values)
        object_embeds = self.vision_model(object_pixel_values)[1]
        object_embeds = self.visual_projection(object_embeds)
        object_embeds = object_embeds.view(b, num_objects, 1, -1)
        return object_embeds
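

# A minimal, commented-out sanity check for the image encoder (not part of the original
# module). The checkpoint name and the 768-dim output below assume CLIP ViT-L/14
# ("openai/clip-vit-large-patch14"), the kind of encoder FastComposer pairs with
# Stable Diffusion v1.5; adjust both if you load different weights.
#
#     encoder = FastComposerCLIPImageEncoder.from_pretrained("openai/clip-vit-large-patch14")
#     dummy = torch.zeros(1, 2, 3, 224, 224)  # (batch, max_num_objects, C, H, W)
#     print(encoder(dummy).shape)             # expected: torch.Size([1, 2, 1, 768])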


def scatter_object_embeddings(
    inputs_embeds,
    image_token_mask,
    object_embeds,
    num_objects,
    image_embedding_transform=None,
):
    object_embeds = object_embeds.to(inputs_embeds.dtype)
    batch_size, max_num_objects = object_embeds.shape[:2]
    seq_length = inputs_embeds.shape[1]
    flat_object_embeds = object_embeds.view(
        -1, object_embeds.shape[-2], object_embeds.shape[-1]
    )
    valid_object_mask = (
        torch.arange(max_num_objects, device=flat_object_embeds.device)[None, :]
        < num_objects[:, None]
    )
    valid_object_embeds = flat_object_embeds[valid_object_mask.flatten()]
    if image_embedding_transform is not None:
        valid_object_embeds = image_embedding_transform(valid_object_embeds)
    inputs_embeds = inputs_embeds.view(-1, inputs_embeds.shape[-1])
    image_token_mask = image_token_mask.view(-1)
    valid_object_embeds = valid_object_embeds.view(-1, valid_object_embeds.shape[-1])
    inputs_embeds.masked_scatter_(image_token_mask[:, None], valid_object_embeds)
    inputs_embeds = inputs_embeds.view(batch_size, seq_length, -1)
    return inputs_embeds


def fuse_object_embeddings(
    inputs_embeds,
    image_token_mask,
    object_embeds,
    num_objects,
    fuse_fn=torch.add,
):
    object_embeds = object_embeds.to(inputs_embeds.dtype)
    batch_size, max_num_objects = object_embeds.shape[:2]
    seq_length = inputs_embeds.shape[1]
    flat_object_embeds = object_embeds.view(
        -1, object_embeds.shape[-2], object_embeds.shape[-1]
    )
    valid_object_mask = (
        torch.arange(max_num_objects, device=flat_object_embeds.device)[None, :]
        < num_objects[:, None]
    )
    valid_object_embeds = flat_object_embeds[valid_object_mask.flatten()]
    inputs_embeds = inputs_embeds.view(-1, inputs_embeds.shape[-1])
    image_token_mask = image_token_mask.view(-1)
    valid_object_embeds = valid_object_embeds.view(-1, valid_object_embeds.shape[-1])
    # slice out the image token embeddings
    image_token_embeds = inputs_embeds[image_token_mask]
    valid_object_embeds = fuse_fn(image_token_embeds, valid_object_embeds)
    inputs_embeds.masked_scatter_(image_token_mask[:, None], valid_object_embeds)
    inputs_embeds = inputs_embeds.view(batch_size, seq_length, -1)
    return inputs_embeds
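

# Shape sketch for fuse_object_embeddings (illustrative, not part of the original module):
#
#     inputs_embeds    : (bsz, seq_len, dim)            token embeddings from the text encoder
#     image_token_mask : (bsz, seq_len), bool           True at positions of the special image token
#     object_embeds    : (bsz, max_num_objects, 1, dim) per-object embeddings from the image encoder
#     num_objects      : (bsz,)                         how many object slots are actually valid
#
# After flattening, the i-th True position in image_token_mask (batch-major order) is fused
# with the i-th valid object embedding via fuse_fn, so each sample must contain exactly
# num_objects[i] image tokens for the masked_scatter_ to line up.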


class FastComposerPostfuseModule(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.mlp1 = MLP(embed_dim * 2, embed_dim, embed_dim, use_residual=False)
        self.mlp2 = MLP(embed_dim, embed_dim, embed_dim, use_residual=True)
        self.layer_norm = nn.LayerNorm(embed_dim)

    def fuse_fn(self, text_embeds, object_embeds):
        text_object_embeds = torch.cat([text_embeds, object_embeds], dim=-1)
        text_object_embeds = self.mlp1(text_object_embeds) + text_embeds
        text_object_embeds = self.mlp2(text_object_embeds)
        text_object_embeds = self.layer_norm(text_object_embeds)
        return text_object_embeds

    def forward(
        self,
        text_embeds,
        object_embeds,
        image_token_mask,
        num_objects,
    ) -> torch.Tensor:
        text_object_embeds = fuse_object_embeddings(
            text_embeds, image_token_mask, object_embeds, num_objects, self.fuse_fn
        )
        return text_object_embeds


class FastComposerTextEncoder(CLIPPreTrainedModel):
    @staticmethod
    def from_pretrained(model_name_or_path, **kwargs):
        model = CLIPTextModel.from_pretrained(model_name_or_path, **kwargs)
        text_model = model.text_model
        return FastComposerTextEncoder(text_model)

    def __init__(self, text_model):
        super().__init__(text_model.config)
        self.config = text_model.config
        self.final_layer_norm = text_model.final_layer_norm
        self.embeddings = text_model.embeddings
        self.encoder = text_model.encoder

    def forward(
        self,
        input_ids,
        image_token_mask=None,
        object_embeds=None,
        num_objects=None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = self.embeddings(input_ids)
        causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)
        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(
                dim=-1
            ),
        ]
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


def unet_store_cross_attention_scores(unet, attention_scores, layers=5):
    from diffusers.models.attention_processor import (
        Attention,
        AttnProcessor,
        AttnProcessor2_0,
    )

    UNET_LAYER_NAMES = [
        "down_blocks.0",
        "down_blocks.1",
        "down_blocks.2",
        "mid_block",
        "up_blocks.1",
        "up_blocks.2",
        "up_blocks.3",
    ]
    start_layer = (len(UNET_LAYER_NAMES) - layers) // 2
    end_layer = start_layer + layers
    applicable_layers = UNET_LAYER_NAMES[start_layer:end_layer]

    def make_new_get_attention_scores_fn(name):
        def new_get_attention_scores(module, query, key, attention_mask=None):
            attention_probs = module.old_get_attention_scores(
                query, key, attention_mask
            )
            attention_scores[name] = attention_probs
            return attention_probs

        return new_get_attention_scores

    for name, module in unet.named_modules():
        if isinstance(module, Attention) and "attn2" in name:
            if not any(layer in name for layer in applicable_layers):
                continue
            if isinstance(module.processor, AttnProcessor2_0):
                module.set_processor(AttnProcessor())
            module.old_get_attention_scores = module.get_attention_scores
            module.get_attention_scores = types.MethodType(
                make_new_get_attention_scores_fn(name), module
            )
    return unet


class BalancedL1Loss(nn.Module):
    def __init__(self, threshold=1.0, normalize=False):
        super().__init__()
        self.threshold = threshold
        self.normalize = normalize

    def forward(self, object_token_attn_prob, object_segmaps):
        if self.normalize:
            object_token_attn_prob = object_token_attn_prob / (
                object_token_attn_prob.max(dim=2, keepdim=True)[0] + 1e-5
            )
        background_segmaps = 1 - object_segmaps
        background_segmaps_sum = background_segmaps.sum(dim=2) + 1e-5
        object_segmaps_sum = object_segmaps.sum(dim=2) + 1e-5
        background_loss = (object_token_attn_prob * background_segmaps).sum(
            dim=2
        ) / background_segmaps_sum
        object_loss = (object_token_attn_prob * object_segmaps).sum(
            dim=2
        ) / object_segmaps_sum
        return background_loss - object_loss


def get_object_localization_loss_for_one_layer(
    cross_attention_scores,
    object_segmaps,
    object_token_idx,
    object_token_idx_mask,
    loss_fn,
):
    bxh, num_noise_latents, num_text_tokens = cross_attention_scores.shape
    b, max_num_objects, _, _ = object_segmaps.shape
    size = int(num_noise_latents ** 0.5)

    # Resize the object segmentation maps to the size of the cross attention scores
    object_segmaps = F.interpolate(
        object_segmaps, size=(size, size), mode="bilinear", antialias=True
    )  # (b, max_num_objects, size, size)

    object_segmaps = object_segmaps.view(
        b, max_num_objects, -1
    )  # (b, max_num_objects, num_noise_latents)

    num_heads = bxh // b
    cross_attention_scores = cross_attention_scores.view(
        b, num_heads, num_noise_latents, num_text_tokens
    )

    # Gather object_token_attn_prob
    object_token_attn_prob = torch.gather(
        cross_attention_scores,
        dim=3,
        index=object_token_idx.view(b, 1, 1, max_num_objects).expand(
            b, num_heads, num_noise_latents, max_num_objects
        ),
    )  # (b, num_heads, num_noise_latents, max_num_objects)
    object_segmaps = (
        object_segmaps.permute(0, 2, 1)
        .unsqueeze(1)
        .expand(b, num_heads, num_noise_latents, max_num_objects)
    )
    loss = loss_fn(object_token_attn_prob, object_segmaps)
    loss = loss * object_token_idx_mask.view(b, 1, max_num_objects)
    object_token_cnt = object_token_idx_mask.sum(dim=1).view(b, 1) + 1e-5
    loss = (loss.sum(dim=2) / object_token_cnt).mean()
    return loss


def get_object_localization_loss(
    cross_attention_scores,
    object_segmaps,
    image_token_idx,
    image_token_idx_mask,
    loss_fn,
):
    num_layers = len(cross_attention_scores)
    loss = 0
    for k, v in cross_attention_scores.items():
        layer_loss = get_object_localization_loss_for_one_layer(
            v, object_segmaps, image_token_idx, image_token_idx_mask, loss_fn
        )
        loss += layer_loss
    return loss / num_layers


class FastComposerModel(nn.Module):
    def __init__(self, text_encoder, image_encoder, vae, unet, args):
        super().__init__()
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.vae = vae
        self.unet = unet
        self.use_ema = False
        self.ema_param = None
        self.pretrained_model_name_or_path = args.pretrained_model_name_or_path
        self.revision = args.revision
        self.non_ema_revision = args.non_ema_revision
        self.object_localization = args.object_localization
        self.object_localization_weight = args.object_localization_weight
        self.localization_layers = args.localization_layers
        self.mask_loss = args.mask_loss
        self.mask_loss_prob = args.mask_loss_prob

        embed_dim = text_encoder.config.hidden_size
        self.postfuse_module = FastComposerPostfuseModule(embed_dim)

        if self.object_localization:
            self.cross_attention_scores = {}
            self.unet = unet_store_cross_attention_scores(
                self.unet, self.cross_attention_scores, self.localization_layers
            )
            self.object_localization_loss_fn = BalancedL1Loss(
                args.object_localization_threshold,
                args.object_localization_normalize,
            )

    def _clear_cross_attention_scores(self):
        if hasattr(self, "cross_attention_scores"):
            keys = list(self.cross_attention_scores.keys())
            for k in keys:
                del self.cross_attention_scores[k]
        gc.collect()

    @staticmethod
    def from_pretrained(args):
        text_encoder = FastComposerTextEncoder.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="text_encoder",
            revision=args.revision,
        )
        vae = AutoencoderKL.from_pretrained(
            args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
        )
        unet = UNet2DConditionModel.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="unet",
            revision=args.non_ema_revision,
        )
        image_encoder = FastComposerCLIPImageEncoder.from_pretrained(
            args.image_encoder_name_or_path,
        )
        return FastComposerModel(text_encoder, image_encoder, vae, unet, args)

    def to_pipeline(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            self.pretrained_model_name_or_path,
            revision=self.revision,
            non_ema_revision=self.non_ema_revision,
            text_encoder=self.text_encoder,
            vae=self.vae,
            unet=self.unet,
        )
        pipe.safety_checker = None
        pipe.image_encoder = self.image_encoder
        pipe.postfuse_module = self.postfuse_module
        return pipe

    def forward(self, batch, noise_scheduler):
        pixel_values = batch["pixel_values"]
        input_ids = batch["input_ids"]
        image_token_mask = batch["image_token_mask"]
        object_pixel_values = batch["object_pixel_values"]
        num_objects = batch["num_objects"]

        vae_dtype = next(self.vae.parameters()).dtype
        vae_input = pixel_values.to(vae_dtype)

        latents = self.vae.encode(vae_input).latent_dist.sample()
        latents = latents * self.vae.config.scaling_factor

        # Sample noise that we'll add to the latents
        noise = torch.randn_like(latents)
        bsz = latents.shape[0]
        # Sample a random timestep for each image
        timesteps = torch.randint(
            0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device
        )
        timesteps = timesteps.long()

        # Add noise to the latents according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # (bsz, max_num_objects, num_image_tokens, dim)
        object_embeds = self.image_encoder(object_pixel_values)

        encoder_hidden_states = self.text_encoder(
            input_ids, image_token_mask, object_embeds, num_objects
        )[0]  # (bsz, seq_len, dim)

        encoder_hidden_states = self.postfuse_module(
            encoder_hidden_states,
            object_embeds,
            image_token_mask,
            num_objects,
        )

        # Get the target for loss depending on the prediction type
        if noise_scheduler.config.prediction_type == "epsilon":
            target = noise
        elif noise_scheduler.config.prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)
        else:
            raise ValueError(
                f"Unknown prediction type {noise_scheduler.config.prediction_type}"
            )

        pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample

        if self.mask_loss and torch.rand(1) < self.mask_loss_prob:
            object_segmaps = batch["object_segmaps"]
            mask = (object_segmaps.sum(dim=1) > 0).float()
            mask = F.interpolate(
                mask.unsqueeze(1),
                size=(pred.shape[-2], pred.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )
            pred = pred * mask
            target = target * mask

        denoise_loss = F.mse_loss(pred.float(), target.float(), reduction="mean")

        return_dict = {"denoise_loss": denoise_loss}

        if self.object_localization:
            object_segmaps = batch["object_segmaps"]
            image_token_idx = batch["image_token_idx"]
            image_token_idx_mask = batch["image_token_idx_mask"]
            localization_loss = get_object_localization_loss(
                self.cross_attention_scores,
                object_segmaps,
                image_token_idx,
                image_token_idx_mask,
                self.object_localization_loss_fn,
            )
            return_dict["localization_loss"] = localization_loss
            loss = self.object_localization_weight * localization_loss + denoise_loss
            self._clear_cross_attention_scores()
        else:
            loss = denoise_loss

        return_dict["loss"] = loss
        return return_dict
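

if __name__ == "__main__":
    # Minimal wiring sketch, not part of the original module. The checkpoint names below are
    # assumptions (any Stable Diffusion v1.5-style repo plus a CLIP image encoder with a
    # matching projection size should work); swap in the weights you actually use.
    args = types.SimpleNamespace(
        pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5",  # assumed SD base model
        image_encoder_name_or_path="openai/clip-vit-large-patch14",      # assumed CLIP encoder
        revision=None,
        non_ema_revision=None,
        object_localization=False,  # set True to hook cross-attention maps for the localization loss
        object_localization_weight=0.01,
        object_localization_threshold=1.0,
        object_localization_normalize=False,
        localization_layers=5,
        mask_loss=False,
        mask_loss_prob=0.5,
    )
    model = FastComposerModel.from_pretrained(args)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"FastComposerModel loaded with {n_params / 1e6:.1f}M parameters")
    # model.to_pipeline() would wrap the same components in a StableDiffusionPipeline.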