
Added a Gradio UI for multi-modal inferencing using Llama 3.2 Vision #718

Merged · 17 commits · Oct 28, 2024

Commits
0c985d8
added a file to start with Inferencing on llama3.2 vision using gradi…
himanshushukla12 Oct 8, 2024
4053712
Added basic LlamaInference class structure, model loading functionali…
himanshushukla12 Oct 8, 2024
22be586
Implemented memory management to release GPU resources after inference
himanshushukla12 Oct 8, 2024
b2f9655
Modified requirements.txt by adding the gradio dependency
himanshushukla12 Oct 8, 2024
19938dd
Added instructions in README.md for using the Gradio UI
himanshushukla12 Oct 8, 2024
b94a340
Merge branch 'meta-llama:main' into main
himanshushukla12 Oct 8, 2024
c609a44
Change Gradio -> gradio
himanshushukla12 Oct 8, 2024
750b499
Added passing of Hugging-face token from the arguments
himanshushukla12 Oct 8, 2024
3170c27
Changed readme for usage of multimodal inferencing of gradio UI by pa…
himanshushukla12 Oct 8, 2024
c0405b6
Changes the UI from textbox to chatbox with max_tokens, rop_k, temper…
himanshushukla12 Oct 8, 2024
6f7c028
added the passing of hugging-face token from the argument
himanshushukla12 Oct 8, 2024
a261aea
Update README.md
himanshushukla12 Oct 8, 2024
597e44e
Update README.md
himanshushukla12 Oct 8, 2024
77f929f
Merge branch 'meta-llama:main' into main
himanshushukla12 Oct 27, 2024
37033ef
Renamed the file and made that G small of gradio
himanshushukla12 Oct 27, 2024
6453745
removed old files after renaming and added markup safe in readme for …
himanshushukla12 Oct 27, 2024
e0e8825
Fixed readme with renaming file name
himanshushukla12 Oct 27, 2024
12 changes: 12 additions & 0 deletions recipes/quickstart/inference/local_inference/README.md
@@ -10,6 +10,18 @@ The way to run this would be:
```
python multi_modal_infer.py --image_path PATH_TO_IMAGE --prompt_text "Describe this image" --temperature 0.5 --top_p 0.8 --model_name "meta-llama/Llama-3.2-11B-Vision-Instruct"
```
---
## Multi-modal Inference Using the Gradio UI
For multi-modal inference through a Gradio UI, we have added [multi_modal_infer_gradio_UI.py](multi_modal_infer_gradio_UI.py), which uses the gradio and transformers libraries.

### Steps to Run

The way to run this would be:
- Ensure you have access to the Llama 3.2 Vision models, then run the command below

```
python multi_modal_infer_gradio_UI.py --hf_token <your hf_token here>
```
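
By default the script serves the UI locally via `demo.launch()`. The script also contains a commented-out variant of the launch call showing how to bind to a specific host and port, which you can enable if you need to reach the UI from another machine:

```
demo.launch(server_name="0.0.0.0", server_port=12003)
```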

## Text-only Inference
For local text-only inference we have provided an [inference script](inference.py). Depending on the type of finetuning performed during training, the script takes different arguments.
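For example, an invocation for a PEFT-finetuned checkpoint might look like the following (the flag names here are assumptions based on the script's usual interface; run `python inference.py --help` to confirm):

```
python inference.py --model_name MODEL_PATH --peft_model PEFT_CHECKPOINT_PATH --prompt_file PROMPT_FILE
```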
157 changes: 157 additions & 0 deletions recipes/quickstart/inference/local_inference/multi_modal_infer_gradio_UI.py
@@ -0,0 +1,157 @@
import argparse
import os

import gradio as gr
import torch
from PIL import Image
from accelerate import Accelerator
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Parse the command-line arguments
parser = argparse.ArgumentParser(description="Run Gradio app with a Hugging Face model")
parser.add_argument("--hf_token", type=str, required=True, help="Hugging Face authentication token")
args = parser.parse_args()

# Hugging Face token
hf_token = args.hf_token

# Initialize Accelerator to pick the best available device
accelerator = Accelerator()
device = accelerator.device

# Set memory management for PyTorch
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'  # adjust size as needed

# Model ID
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model with the Hugging Face token
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map=device,
    token=hf_token  # pass the Hugging Face token here
)

# Load the processor
processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
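
# Note (assumption): instead of passing --hf_token on the command line, you can
# usually authenticate once with `huggingface-cli login` or set the HF_TOKEN
# environment variable, in which case the token arguments above can be omitted.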

# Visual theme
visual_theme = gr.themes.Default() # Default, Soft or Monochrome

# Constants
MAX_OUTPUT_TOKENS = 2048
MAX_IMAGE_SIZE = (1120, 1120)
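# Note (assumption): 1120 x 1120 is used here as it matches the vision encoder's
# maximum input of four 560-px tiles; larger images are resized in describe_image below.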

# Function to process the image and generate a description
def describe_image(image, user_prompt, temperature, top_k, top_p, max_tokens, history):
    if image is not None:
        # Resize the image to the maximum supported size
        image = image.resize(MAX_IMAGE_SIZE)
        prompt = f"<|image|><|begin_of_text|>{user_prompt} Answer:"
        # Preprocess the image and prompt together
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    else:
        # Text-only input if no image is provided; pass text= explicitly so the
        # prompt is not treated as an image input by the processor
        prompt = f"<|begin_of_text|>{user_prompt} Answer:"
        inputs = processor(text=prompt, return_tensors="pt").to(device)

    # Generate output with the model; do_sample=True is required for the
    # temperature/top_k/top_p sliders to take effect
    output = model.generate(
        **inputs,
        max_new_tokens=min(max_tokens, MAX_OUTPUT_TOKENS),
        do_sample=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )

    # Decode the raw output
    raw_output = processor.decode(output[0])

    # Clean up the output to remove special tokens and the prompt scaffold
    cleaned_output = raw_output.replace("<|image|><|begin_of_text|>", "").strip().replace(" Answer:", "")

    # Ensure the prompt is not repeated in the output
    if cleaned_output.startswith(user_prompt):
        cleaned_output = cleaned_output[len(user_prompt):].strip()

    # Append the new exchange to the chat history
    history.append((user_prompt, cleaned_output))

    return history


# Function to clear the chat history
def clear_chat():
    return []

# Gradio interface
def gradio_interface():
    with gr.Blocks(visual_theme) as demo:
        gr.HTML(
            """
            <h1 style='text-align: center'>
            meta-llama/Llama-3.2-11B-Vision-Instruct
            </h1>
            """)
        with gr.Row():
            # Left column with image and parameter inputs
            with gr.Column(scale=1):
                image_input = gr.Image(
                    label="Image",
                    type="pil",
                    image_mode="RGB",
                    height=512,  # Set the height
                    width=512    # Set the width
                )

                # Parameter sliders
                temperature = gr.Slider(
                    label="Temperature", minimum=0.1, maximum=1.0, value=0.6, step=0.1, interactive=True)
                top_k = gr.Slider(
                    label="Top-k", minimum=1, maximum=100, value=50, step=1, interactive=True)
                top_p = gr.Slider(
                    label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.1, interactive=True)
                max_tokens = gr.Slider(
                    label="Max Tokens", minimum=50, maximum=MAX_OUTPUT_TOKENS, value=100, step=50, interactive=True)

            # Right column with the chat interface
            with gr.Column(scale=2):
                chat_history = gr.Chatbot(label="Chat", height=512)

                # User input box for the prompt
                user_prompt = gr.Textbox(
                    show_label=False,
                    container=False,
                    placeholder="Enter your prompt",
                    lines=2
                )

                # Generate and Clear buttons
                with gr.Row():
                    generate_button = gr.Button("Generate")
                    clear_button = gr.Button("Clear")

                # Wire the Generate button to the inference function
                generate_button.click(
                    fn=describe_image,
                    inputs=[image_input, user_prompt, temperature, top_k, top_p, max_tokens, chat_history],
                    outputs=[chat_history]
                )

                # Wire the Clear button to reset the chat history
                clear_button.click(
                    fn=clear_chat,
                    inputs=[],
                    outputs=[chat_history]
                )

    return demo

# Launch the interface
demo = gradio_interface()
# demo.launch(server_name="0.0.0.0", server_port=12003)
demo.launch()
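
For a quick smoke test without the browser UI, the inference path can be exercised directly. A minimal sketch (the `test.jpg` file name is an assumption) that could replace the `demo.launch()` call at the bottom of the script:

```python
# Hypothetical smoke test: call describe_image directly instead of launching
# the Gradio app. Assumes a local image file named test.jpg exists.
from PIL import Image

history = describe_image(
    image=Image.open("test.jpg"),
    user_prompt="Describe this image",
    temperature=0.6,
    top_k=50,
    top_p=0.9,
    max_tokens=100,
    history=[],
)
print(history[-1][1])  # print the model's reply from the (prompt, reply) pair
```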
2 changes: 2 additions & 0 deletions requirements.txt
@@ -25,3 +25,5 @@ faiss-gpu; python_version < '3.11'
unstructured[pdf]
sentence_transformers
codeshield
gradio
markupsafe==2.0.1