From 1b0461ce21d8b76bb56140d31eda3e0a792b9c37 Mon Sep 17 00:00:00 2001
From: kaixuanliu
Date: Wed, 5 Feb 2025 23:46:43 +0800
Subject: [PATCH] Add video-llava model support (#1522)

Signed-off-by: kaixuanliu
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
---
 Makefile                                      |   5 +
 README.md                                     |   2 +
 docs/source/index.mdx                         |   1 +
 examples/video-comprehension/README.md        |  33 ++
 examples/video-comprehension/requirements.txt |   2 +
 examples/video-comprehension/run_example.py   | 235 ++++++++++
 .../habana/transformers/generation/utils.py   |   1 +
 optimum/habana/transformers/modeling_utils.py |   6 +
 .../habana/transformers/models/__init__.py    |   1 +
 .../models/llama/modeling_llama.py            |   2 +
 .../models/video_llava/__init__.py            |   1 +
 .../video_llava/modeling_video_llava.py       | 411 ++++++++++++++++++
 tests/test_video_llava.py                     |  77 ++++
 13 files changed, 777 insertions(+)
 create mode 100644 examples/video-comprehension/README.md
 create mode 100644 examples/video-comprehension/requirements.txt
 create mode 100644 examples/video-comprehension/run_example.py
 create mode 100644 optimum/habana/transformers/models/video_llava/__init__.py
 create mode 100644 optimum/habana/transformers/models/video_llava/modeling_video_llava.py
 create mode 100644 tests/test_video_llava.py

diff --git a/Makefile b/Makefile
index 80fb7b8c62..34fd13bd07 100644
--- a/Makefile
+++ b/Makefile
@@ -121,6 +121,11 @@ slow_tests_openclip_vqa_example: test_installs
 	python -m pip install -r examples/visual-question-answering/openclip_requirements.txt
 	python -m pytest tests/test_openclip_vqa.py
 
+# Run video comprehension tests
+slow_tests_video_llava_example: test_installs
+	python -m pip install -r examples/video-comprehension/requirements.txt
+	python -m pytest tests/test_video_llava.py
+
 slow_tests_fsdp: test_installs
 	python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN)
 
diff --git a/README.md b/README.md
index b25d781120..e363edb26e 100644
--- a/README.md
+++ b/README.md
@@ -282,6 +282,8 @@ The following model architectures, tasks and device distributions have been vali
 | DeepSeek-V2 | :heavy_check_mark: | :heavy_check_mark: | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | ChatGLM | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | Qwen2-VL | | <li>Single card</li> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+| VideoLLaVA | | <li>Single card</li> | <li>[Video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |
+
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index f71f69d3a6..2b8cdf06ef 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -105,6 +105,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | TableTransformer | | <li>Single card</li> | <li>[table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)</li> |
 | DETR | | <li>Single card</li> | <li>[object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)</li> |
 | Mllama | <li>LoRA</li> | ✅ | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+| Video-LLaVA | | <li>Single card</li> | <li>[video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)</li> |
 | MiniCPM3 | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | Baichuan2 | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
 | DeepSeek-V2 | ✅ | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
diff --git a/examples/video-comprehension/README.md b/examples/video-comprehension/README.md
new file mode 100644
index 0000000000..da54f26740
--- /dev/null
+++ b/examples/video-comprehension/README.md
@@ -0,0 +1,33 @@
+
+
+# Examples
+
+This directory contains example scripts that demonstrate how to perform video comprehension on Gaudi in graph mode.
+
+## Single-HPU inference
+
+### Video-LLaVA Model
+
+```bash
+python3 run_example.py \
+    --model_name_or_path "LanguageBind/Video-LLaVA-7B-hf" \
+    --warmup 3 \
+    --n_iterations 5 \
+    --batch_size 1 \
+    --use_hpu_graphs \
+    --bf16 \
+    --output_dir ./
+```
+Models that have been validated:
+  - [LanguageBind/Video-LLaVA-7B-hf](https://huggingface.co/LanguageBind/Video-LLaVA-7B-hf)
diff --git a/examples/video-comprehension/requirements.txt b/examples/video-comprehension/requirements.txt
new file mode 100644
index 0000000000..7ed65352d9
--- /dev/null
+++ b/examples/video-comprehension/requirements.txt
@@ -0,0 +1,2 @@
+av == 12.1.0
+sentencepiece == 0.2.0
diff --git a/examples/video-comprehension/run_example.py b/examples/video-comprehension/run_example.py
new file mode 100644
index 0000000000..5868bea3e8
--- /dev/null
+++ b/examples/video-comprehension/run_example.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import logging
+import os
+import time
+from pathlib import Path
+
+import av
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import VideoLlavaProcessor
+
+from optimum.habana.transformers.modeling_utils import (
+    GaudiVideoLlavaForConditionalGeneration,
+    adapt_transformers_to_gaudi,
+)
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def read_video_pyav(container, indices):
+    # Decode the frames at the given indices from a PyAV container and
+    # return them as a (num_frames, height, width, 3) RGB array.
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        help="Path to pre-trained model",
+    )
+    parser.add_argument(
+        "--video_path",
+        default=None,
+        type=str,
+        nargs="*",
+        help='Path to video as input. Can be a single string (e.g. --video_path "URL1") or a list of space-separated strings (e.g. --video_path "URL1" "URL2").',
+    )
+    parser.add_argument(
+        "--prompt",
+        default=None,
+        type=str,
+        help='Optional argument to give a prompt of your choice as input. Must be a single string (e.g. --prompt "Hello world").',
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.")
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to perform generation in bf16 precision.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        help="Output directory to store results in.",
+    )
+    parser.add_argument(
+        "--token",
+        default=None,
+        type=str,
+        help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+        "generated when running `huggingface-cli login` (stored in `~/.huggingface`).",
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.")
+    parser.add_argument(
+        "--ignore_eos",
+        action="store_true",
+        help="Whether to disable stopping with eos token when calling `generate`.",
+    )
+    parser.add_argument(
+        "--use_flash_attention",
+        action="store_true",
+        help="Whether to enable Habana Flash Attention, provided that the model supports it.",
+    )
+    parser.add_argument(
+        "--flash_attention_recompute",
+        action="store_true",
+        help="Whether to enable Habana Flash Attention in recompute mode on first-token generation. This allows the graph to be split internally, which helps reduce memory consumption.",
+    )
+
+    args = parser.parse_args()
+
+    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
+
+    if args.video_path is None:
+        args.video_path = [
+            hf_hub_download(
+                repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
+            )
+        ]
+
+    if args.prompt is None:
+        args.prompt = ["USER: 