diff --git a/candle-transformers/src/models/blip.rs b/candle-transformers/src/models/blip.rs
index 0330386574..a391daacbf 100644
--- a/candle-transformers/src/models/blip.rs
+++ b/candle-transformers/src/models/blip.rs
@@ -1,8 +1,11 @@
 //! Based on the BLIP paper from Salesforce Research.
 //!
-//! See "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation"
-//! - [Arxiv](https://arxiv.org/abs/2201.12086)
-//! - [Github](https://github.com/salesforce/BLIP)
+//! The blip-image-captioning model can generate captions for an input image.
+//!
+//! - ⚑ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning)
+//! - πŸ’» [GH Link](https://github.com/salesforce/BLIP)
+//! - πŸ€— [HF Link](https://huggingface.co/Salesforce/blip-image-captioning-base)
+//! - πŸ“ [Paper](https://arxiv.org/abs/2201.12086)
 //!
 use super::blip_text;
diff --git a/candle-transformers/src/models/blip_text.rs b/candle-transformers/src/models/blip_text.rs
index aceaf4ac1b..ad28193b16 100644
--- a/candle-transformers/src/models/blip_text.rs
+++ b/candle-transformers/src/models/blip_text.rs
@@ -1,9 +1,12 @@
 //! Implementation of BLIP text encoder/decoder.
 //!
-//! See "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation"
-//! https://arxiv.org/abs/2201.12086
+//! BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
+//!
+//! - ⚑ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-BLIP-Image-Captioning)
+//! - πŸ’» [GH Link](https://github.com/salesforce/BLIP)
+//! - πŸ€— [HF Link](https://huggingface.co/Salesforce/blip-image-captioning-base)
+//! - πŸ“ [Paper](https://arxiv.org/abs/2201.12086)
 //!
-
 use super::with_tracing::{linear, Embedding, Linear};
 use candle::{Module, Result, Tensor, D};
 use candle_nn::{layer_norm, LayerNorm, VarBuilder};
diff --git a/candle-transformers/src/models/chatglm.rs b/candle-transformers/src/models/chatglm.rs
index 8d5d9ec601..a115c7fef2 100644
--- a/candle-transformers/src/models/chatglm.rs
+++ b/candle-transformers/src/models/chatglm.rs
@@ -1,10 +1,8 @@
 //! Implementation of the ChatGLM2/3 models from THUDM.
 //!
-//! See:
-//! - ChatGLM3: ["ChatGLM3: Advancing Multilingual Conversational Language Models with High-Quality Data"](https://github.com/THUDM/ChatGLM3)
-//! - ChatGLM2: ["ChatGLM2: An Open Bilingual Chat LLM"](https://github.com/THUDM/ChatGLM2-6B)
+//! - πŸ’» [Github](https://github.com/THUDM/ChatGLM3) ChatGLM3: Advancing Multilingual Conversational Language Models with High-Quality Data
+//! - πŸ’» [Github](https://github.com/THUDM/ChatGLM2-6B) ChatGLM2: An Open Bilingual Chat LLM
 //!
-
 use crate::models::with_tracing::{linear_b as linear, Linear};
 use candle::{DType, Device, IndexOp, Module, Result, Tensor, D};
 use candle_nn::VarBuilder;
diff --git a/candle-transformers/src/models/chinese_clip/mod.rs b/candle-transformers/src/models/chinese_clip/mod.rs
index 86616baa1c..1edc903179 100644
--- a/candle-transformers/src/models/chinese_clip/mod.rs
+++ b/candle-transformers/src/models/chinese_clip/mod.rs
@@ -3,10 +3,9 @@
 //! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! - [GH Link](https://github.com/OFA-Sys/Chinese-CLIP)
-//! - Transformers Python [reference implementation](https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py)
+//! - πŸ’» [GH Link](https://github.com/OFA-Sys/Chinese-CLIP)
+//! - πŸ’» Transformers Python [reference implementation](https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py)
 //!
-
 use candle::{Module, Result, Tensor, D};
 use candle_nn as nn;
diff --git a/candle-transformers/src/models/chinese_clip/text_model.rs b/candle-transformers/src/models/chinese_clip/text_model.rs
index 19499709a7..1cbf7c914e 100644
--- a/candle-transformers/src/models/chinese_clip/text_model.rs
+++ b/candle-transformers/src/models/chinese_clip/text_model.rs
@@ -3,8 +3,8 @@
 //! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! https://github.com/OFA-Sys/Chinese-CLIP
-//! https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py
+//! - πŸ’» [Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)
+//! - πŸ’» [HF](https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py)
 
 use candle::{DType, Device, IndexOp, Module, Result, Tensor};
 use candle_nn as nn;
@@ -67,7 +67,7 @@ impl Default for ChineseClipTextConfig {
 }
 
 impl ChineseClipTextConfig {
-    /// referer: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json
+    /// [Reference](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json)
     pub fn clip_vit_base_patch16() -> Self {
         Self {
             vocab_size: 21128,
diff --git a/candle-transformers/src/models/chinese_clip/vision_model.rs b/candle-transformers/src/models/chinese_clip/vision_model.rs
index 2d345e0f4a..a20535c40e 100644
--- a/candle-transformers/src/models/chinese_clip/vision_model.rs
+++ b/candle-transformers/src/models/chinese_clip/vision_model.rs
@@ -3,8 +3,8 @@
 //! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! https://github.com/OFA-Sys/Chinese-CLIP
-//! https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py
+//! - πŸ’» [Chinese-CLIP](https://github.com/OFA-Sys/Chinese-CLIP)
+//! - πŸ’» [GH](https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py)
 
 use candle::{DType, IndexOp, Module, Result, Shape, Tensor, D};
 use candle_nn as nn;
@@ -49,7 +49,7 @@ impl Default for ChineseClipVisionConfig {
 }
 
 impl ChineseClipVisionConfig {
-    /// referer: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json
+    /// [Reference](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json)
     pub fn clip_vit_base_patch16() -> Self {
         Self {
             hidden_size: 768,
diff --git a/candle-transformers/src/models/clip/mod.rs b/candle-transformers/src/models/clip/mod.rs
index e83f27e388..2b00267317 100644
--- a/candle-transformers/src/models/clip/mod.rs
+++ b/candle-transformers/src/models/clip/mod.rs
@@ -3,8 +3,10 @@
 //! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! - [GH Link](https://github.com/openai/CLIP)
-//! - Transformers Python [reference implementation](https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip)
+//! - πŸ’» [GH Link](https://github.com/openai/CLIP)
+//! - πŸ’» Transformers Python [reference implementation](https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip)
+//! - πŸ€— [HF Model](https://huggingface.co/openai/clip-vit-large-patch14-336)
+//!
 use self::{
     text_model::{Activation, ClipTextTransformer},
diff --git a/candle-transformers/src/models/clip/text_model.rs b/candle-transformers/src/models/clip/text_model.rs
index 4662f65fda..eb103bd29a 100644
--- a/candle-transformers/src/models/clip/text_model.rs
+++ b/candle-transformers/src/models/clip/text_model.rs
@@ -3,8 +3,8 @@
 //! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! https://github.com/openai/CLIP
-//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip
+//! - [GH](https://github.com/openai/CLIP)
+//! - [Code](https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip)
 
 use candle::{DType, Device, IndexOp, Result, Tensor, D};
 use candle_nn as nn;
diff --git a/candle-transformers/src/models/codegeex4_9b.rs b/candle-transformers/src/models/codegeex4_9b.rs
index baf4745922..c37a97d57e 100644
--- a/candle-transformers/src/models/codegeex4_9b.rs
+++ b/candle-transformers/src/models/codegeex4_9b.rs
@@ -1,8 +1,9 @@
 //! CodeGeeX4 - A multi-language code generation model
 //!
-//! See "CodeGeeX: A Pre-Trained Model For Code Generation with Multilingual Evaluations on HumanEval-X", Qian et al. 2023
-//! - [Arxiv](https://arxiv.org/abs/2303.17568)
-//! - [Github](https://github.com/THUDM/CodeGeeX)
+//! CodeGeeX: A Pre-Trained Model For Code Generation with Multilingual Evaluations on HumanEval-X
+//!
+//! - πŸ“ [Arxiv](https://arxiv.org/abs/2303.17568)
+//! - πŸ’» [Github](https://github.com/THUDM/CodeGeeX)
 //!
 use crate::models::with_tracing::{linear_b as linear, Linear};
diff --git a/candle-transformers/src/models/convmixer.rs b/candle-transformers/src/models/convmixer.rs
index e095f793a4..7f1b75ebc4 100644
--- a/candle-transformers/src/models/convmixer.rs
+++ b/candle-transformers/src/models/convmixer.rs
@@ -1,10 +1,10 @@
 //! ConvMixer implementation.
 //!
 //! See "Patches Are All You Need?" by Trockman et al. 2022
-//! - [Arxiv](https://arxiv.org/abs/2201.09792)
-//! - [Github](https://github.com/locuslab/convmixer)
 //!
-
+//! - πŸ“ [Arxiv](https://arxiv.org/abs/2201.09792)
+//! - πŸ’» [Github](https://github.com/locuslab/convmixer)
+//!
 use candle::Result;
 use candle_nn::{batch_norm, Conv2dConfig, Module, VarBuilder};
diff --git a/candle-transformers/src/models/convnext.rs b/candle-transformers/src/models/convnext.rs
index d791895f1d..727e11381c 100644
--- a/candle-transformers/src/models/convnext.rs
+++ b/candle-transformers/src/models/convnext.rs
@@ -1,13 +1,16 @@
 //! ConvNeXt implementation.
 //!
-//! See ["A ConvNet for the 2020s" Liu et al. 2022](https://arxiv.org/abs/2201.03545)
-//! and
-//! ["ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders" Woo et al. 2023](https://arxiv.org/abs/2301.00808)
+//! This candle implementation uses a pre-trained ConvNeXt network for inference. The
+//!
classification head has been trained on the ImageNet dataset and returns the +//! probabilities for the top-5 classes. //! //! Original code: -//! - [ConvNeXt](https://github.com/facebookresearch/ConvNeXt/) -//! - [ConvNeXt-V2](https://github.com/facebookresearch/ConvNeXt-V2/) -//! - [timm](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py) +//! - πŸ’» [ConvNeXt](https://github.com/facebookresearch/ConvNeXt/) +//! - πŸ’» [ConvNeXt-V2](https://github.com/facebookresearch/ConvNeXt-V2/) +//! - πŸ’» [timm](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/convnext.py) +//! - πŸ“ [Paper](https://arxiv.org/abs/2201.03545) A ConvNet for the 2020s +//! - πŸ“ [Paper](https://arxiv.org/abs/2301.00808) ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders +//! use candle::shape::ShapeWithOneHole; use candle::{Result, D}; diff --git a/candle-transformers/src/models/flux/mod.rs b/candle-transformers/src/models/flux/mod.rs index 064c5130f5..1d2fa4ef33 100644 --- a/candle-transformers/src/models/flux/mod.rs +++ b/candle-transformers/src/models/flux/mod.rs @@ -2,9 +2,9 @@ //! //! Flux is a 12B rectified flow transformer capable of generating images from text descriptions. //! -//! - [Hugging Face Model](https://huggingface.co/black-forest-labs/FLUX.1-schnell) -//! - [GitHub Repository](https://github.com/black-forest-labs/flux) -//! - [Blog Post](https://blackforestlabs.ai/announcing-black-forest-labs/) +//! - πŸ€— [Hugging Face Model](https://huggingface.co/black-forest-labs/FLUX.1-schnell) +//! - πŸ’» [GitHub Repository](https://github.com/black-forest-labs/flux) +//! - πŸ“ [Blog Post](https://blackforestlabs.ai/announcing-black-forest-labs/) //! //! # Usage //! diff --git a/candle-transformers/src/models/hiera.rs b/candle-transformers/src/models/hiera.rs index 39f8d639b6..98ad825737 100644 --- a/candle-transformers/src/models/hiera.rs +++ b/candle-transformers/src/models/hiera.rs @@ -1,9 +1,8 @@ -//! [Hiera] inference implementation based on timm. +//! Hiera inference implementation based on timm. //! -//! See "[Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles]" -//! [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles]: https://arxiv.org/abs/2306.00989 //! -//! [Hiera]: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/hiera.py +//! - πŸ’» [Hiera](https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/hiera.py) +//! - πŸ“ [Paper](https://arxiv.org/abs/2306.00989). Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles use candle::{Result, D}; use candle_nn::{conv2d, layer_norm, linear, ops::softmax, Conv2dConfig, Func, VarBuilder}; diff --git a/candle-transformers/src/models/llama2_c.rs b/candle-transformers/src/models/llama2_c.rs index d825d8e4dd..930c8b8aa6 100644 --- a/candle-transformers/src/models/llama2_c.rs +++ b/candle-transformers/src/models/llama2_c.rs @@ -2,7 +2,9 @@ //! //! See ["LLaMA 2: Open Foundation and Fine-Tuned Chat Models"](https://arxiv.org/abs/2307.09288) //! -//! Based on the [llama2.c](https://github.com/karpathy/llama2.c) implementation +//! - ⚑ [Interactive Wasm Example](https://huggingface.co/spaces/lmz/candle-llama2) +//! - πŸ’» llama2.c [GH Link](https://github.com/karpathy/llama2.c) +//! 
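+//! A minimal sketch of running the matching example from the candle repository follows. The
+//! exact flags and default checkpoint are assumptions rather than something taken from this
+//! change; check `cargo run --example llama2-c -- --help` for the actual interface.
+//!
+//! ```bash
+//! # Run the llama2.c-style example with its defaults (assumed to fetch a small
+//! # tinystories checkpoint and generate a short sample).
+//! cargo run --example llama2-c --release
+//! ```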
 use candle::{DType, Device, IndexOp, Result, Tensor, D};
 use candle_nn::linear_no_bias as linear;
diff --git a/candle-transformers/src/models/llava/mod.rs b/candle-transformers/src/models/llava/mod.rs
index 44a00bf9a1..c252dbed56 100644
--- a/candle-transformers/src/models/llava/mod.rs
+++ b/candle-transformers/src/models/llava/mod.rs
@@ -1,13 +1,12 @@
 //! The LLaVA (Large Language and Vision Assistant) model.
 //!
 //! This provides the main model implementation combining a vision tower (CLIP) with
-//! language model (Llama) for multimodal capabilities.
+//! language model (Llama) for multimodal capabilities. The architecture implements the training-free projection technique.
 //!
-//! The architecture implements the training-free projection technique from the paper:
-//! [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485).
-//!
-//! - [GH Link](https://github.com/haotian-liu/LLaVA/tree/main)
+//! - πŸ’» [GH Link](https://github.com/haotian-liu/LLaVA/tree/main)
+//! - πŸ“ [Paper](https://arxiv.org/abs/2304.08485) Visual Instruction Tuning
 //!
+
 pub mod config;
 pub mod utils;
diff --git a/candle-transformers/src/models/mimi/mod.rs b/candle-transformers/src/models/mimi/mod.rs
index f19f9ae5fa..8945abfb03 100644
--- a/candle-transformers/src/models/mimi/mod.rs
+++ b/candle-transformers/src/models/mimi/mod.rs
@@ -1,9 +1,27 @@
 //! mimi model
 //!
-//! Mimi is a state-of-the-art audio neural codec.
+//! [Mimi](https://huggingface.co/kyutai/mimi) is a state-of-the-art audio
+//! compression model using an encoder/decoder architecture with residual vector
+//! quantization. The candle implementation supports streaming, meaning that it's
+//! possible to encode or decode a stream of audio tokens on the fly to provide
+//! low-latency interaction with an audio model.
 //!
-//! - [HuggingFace Model Card](https://huggingface.co/kyutai/mimi)
-//! - [GitHub](https://github.com/kyutai-labs/moshi)
+//! - πŸ€— [HuggingFace Model Card](https://huggingface.co/kyutai/mimi)
+//! - πŸ’» [GitHub](https://github.com/kyutai-labs/moshi)
+//!
+//!
+//! # Example
+//! ```bash
+//! # Generating some audio tokens from an audio file.
+//! wget https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3
+//! cargo run --example mimi \
+//! --features mimi --release -- \
+//! audio-to-code bria.mp3 bria.safetensors
+//!
+//! # And decoding the audio tokens back into a sound file.
+//! cargo run --example mimi \
+//! --features mimi --release -- \
+//! code-to-audio bria.safetensors bria.wav
+//! ```
+//!
 
 // Copyright (c) Kyutai, all rights reserved.
diff --git a/candle-transformers/src/models/mmdit/mod.rs b/candle-transformers/src/models/mmdit/mod.rs
index ce4872e0b2..88e73e1e3d 100644
--- a/candle-transformers/src/models/mmdit/mod.rs
+++ b/candle-transformers/src/models/mmdit/mod.rs
@@ -3,9 +3,15 @@
 //! Mix of Multi-scale Dilated and Traditional Convolutions (MMDiT) is an architecture
 //! introduced for Stable Diffusion 3, with the MMDiT-X variant used in Stable Diffusion 3.5.
 //!
-//! - [Research Paper](https://arxiv.org/abs/2403.03206)
-//! - ComfyUI [reference implementation](https://github.com/comfyanonymous/ComfyUI/blob/78e133d0415784924cd2674e2ee48f3eeca8a2aa/comfy/ldm/modules/diffusionmodules/mmdit.py)
-//! - Stability-AI [MMDiT-X implementation](https://github.com/Stability-AI/sd3.5/blob/4e484e05308d83fb77ae6f680028e6c313f9da54/mmditx.py)
+//! - πŸ“ [Research Paper](https://arxiv.org/abs/2403.03206)
+//! - πŸ’» ComfyUI [reference implementation](https://github.com/comfyanonymous/ComfyUI/blob/78e133d0415784924cd2674e2ee48f3eeca8a2aa/comfy/ldm/modules/diffusionmodules/mmdit.py)
+//! - πŸ’» Stability-AI [MMDiT-X implementation](https://github.com/Stability-AI/sd3.5/blob/4e484e05308d83fb77ae6f680028e6c313f9da54/mmditx.py)
 //!
 
 pub mod blocks;
 pub mod embedding;
diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs
index 23edf349ad..571a88614d 100644
--- a/candle-transformers/src/models/mod.rs
+++ b/candle-transformers/src/models/mod.rs
@@ -1,3 +1,19 @@
+//! Candle implementations for various deep learning models
+//!
+//! This crate provides implementations of popular machine learning models and architectures for different modalities.
+//!
+//! - Large language models: [`llama`], [`phi3`], [`mamba`], [`mixtral`], [`bert`], ...
+//! - Text-to-text models: [`t5`], ...
+//! - Image-to-text models: [`blip`], ...
+//! - Text-to-image models: [`stable_diffusion`] and [`wuerstchen`], ...
+//! - Audio models: [`whisper`], [`encodec`], [`metavoice`], [`parler_tts`], ...
+//! - Computer vision models: [`dinov2`], [`convmixer`], [`efficientnet`], ...
+//!
+//! Some of the models also have quantized variants, e.g. [`quantized_blip`], [`quantized_llama`] and [`quantized_qwen2`].
+//!
+//! The implementations aim to be readable while maintaining good performance. For more information
+//! on each model see the model's module docs in the links below.
+
 pub mod based;
 pub mod beit;
 pub mod bert;
diff --git a/candle-transformers/src/models/openclip/mod.rs b/candle-transformers/src/models/openclip/mod.rs
index dacb627f9e..b3864b815e 100644
--- a/candle-transformers/src/models/openclip/mod.rs
+++ b/candle-transformers/src/models/openclip/mod.rs
@@ -3,7 +3,11 @@
 //! Open Contrastive Language-Image Pre-Training (OpenCLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! - [GH Link](https://github.com/mlfoundations/open_clip)
+//! - πŸ’» [GH Link](https://github.com/mlfoundations/open_clip)
+//! - πŸ“ [Paper](https://arxiv.org/abs/2212.07143)
 //!
+//! ## Overview
+//!
+//! ![](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/CLIP.png)
 
 pub mod text_model;
diff --git a/candle-transformers/src/models/persimmon.rs b/candle-transformers/src/models/persimmon.rs
index 0996decf55..d1e3db316f 100644
--- a/candle-transformers/src/models/persimmon.rs
+++ b/candle-transformers/src/models/persimmon.rs
@@ -1,17 +1,15 @@
 //! Persimmon Model
 //!
-//! A transformer language model for efficient inference and general-purpose tasks. See Persimmon model details at:
-//! - [Hugging Face](https://huggingface.co/adept/persimmon-8b-base)
-//!
-//! The model uses a standard transformer architecture with:
+//! A transformer language model for efficient inference and general-purpose tasks. The model uses a standard transformer architecture with:
 //! - Layer normalization for Q/K attention
 //! - RoPE embeddings with partial rotary factor
 //! - ReLU activation
 //! - Separate number of attention heads and KV heads
 //!
 //! References:
-//!
- [Hugging Face Implementation](https://github.com/huggingface/transformers/blob/main/src/transformers/models/persimmon/modeling_persimmon.py) -//! - [Persimmon Config](https://github.com/huggingface/transformers/blob/main/src/transformers/models/persimmon/configuration_persimmon.py) +//! - πŸ’» [Hugging Face Implementation](https://github.com/huggingface/transformers/blob/main/src/transformers/models/persimmon/modeling_persimmon.py) +//! - πŸ’» [Persimmon Config](https://github.com/huggingface/transformers/blob/main/src/transformers/models/persimmon/configuration_persimmon.py) +//! - πŸ€— [Hugging Face](https://huggingface.co/adept/persimmon-8b-base) //! use candle::DType; diff --git a/candle-transformers/src/models/phi.rs b/candle-transformers/src/models/phi.rs index 36a08bb3c6..c94ef6686b 100644 --- a/candle-transformers/src/models/phi.rs +++ b/candle-transformers/src/models/phi.rs @@ -1,18 +1,15 @@ //! Microsoft Phi model implementation //! -//! See Phi model details at: -//! - [Phi-2 Model](https://huggingface.co/microsoft/phi-2) -//! //! The Phi series are decoder-only transformers designed for code and language tasks. +//! //! Key characteristics: //! - Decoder-only transformer architecture //! - RoPE embeddings //! - Layer normalization //! - QK normalization //! -//! References: -//! - [Hugging Face Implementation](https://huggingface.co/microsoft/phi-2) -//! - [Alternative Implementation](https://huggingface.co/microsoft/phi-2/tree/main) +//! - ⚑ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-phi1-phi2-wasm-demo) +//! - πŸ€— [HF Link](https://huggingface.co/microsoft/phi-2) //! use crate::models::with_tracing::{layer_norm, linear, Embedding, LayerNorm, Linear}; diff --git a/candle-transformers/src/models/pixtral/mod.rs b/candle-transformers/src/models/pixtral/mod.rs index e722ffcfd2..18bcc5f793 100644 --- a/candle-transformers/src/models/pixtral/mod.rs +++ b/candle-transformers/src/models/pixtral/mod.rs @@ -3,10 +3,10 @@ //! Pixtral is an architecture trained for multimodal learning //! using images paired with text descriptions. //! -//! - Transformers Python [reference implementation](https://github.com/huggingface/transformers/tree/main/src/transformers/models/pixtral) -//! - [Blog Post](https://mistral.ai/news/pixtral-12b/) - -//! - [HF Model Card](https://huggingface.co/mistralai/Pixtral-12B-2409) - -//! - [HF Community Model Card](https://huggingface.co/mistral-community/pixtral-12b). +//! - πŸ’» Transformers Python [reference implementation](https://github.com/huggingface/transformers/tree/main/src/transformers/models/pixtral) +//! - πŸ“ [Blog Post](https://mistral.ai/news/pixtral-12b/) +//! - πŸ€— [HF Model Card](https://huggingface.co/mistralai/Pixtral-12B-2409) +//! - πŸ€— [HF Community Model Card](https://huggingface.co/mistral-community/pixtral-12b) //! //! # Example //! diff --git a/candle-transformers/src/models/quantized_llama.rs b/candle-transformers/src/models/quantized_llama.rs index 7efd385d61..e171b54fd8 100644 --- a/candle-transformers/src/models/quantized_llama.rs +++ b/candle-transformers/src/models/quantized_llama.rs @@ -10,9 +10,10 @@ //! - Optimized memory usage through quantization //! - Configurable model sizes and parameter counts //! -//! References: -//! - [LLaMA Paper](https://arxiv.org/abs/2302.13971) -//! - [LLaMA Model](https://github.com/facebookresearch/llama) +//! - πŸ’» [GH Link](https://github.com/facebookresearch/llama) +//! - πŸ“ [Paper](https://arxiv.org/abs/2302.13971) +//! +//! 
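+//! A rough usage sketch through the repository's `quantized` example is shown below; the
+//! `--prompt` flag is an assumption, so consult the example's `--help` output for the real
+//! command-line interface.
+//!
+//! ```bash
+//! cargo run --example quantized --release -- \
+//!   --prompt "The best thing about coding in rust is "
+//! ```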
![](https://raw.githubusercontent.com/huggingface/candle/main/candle-examples/examples/quantized/assets/aoc.gif) //! use std::collections::HashMap; diff --git a/candle-transformers/src/models/quantized_t5.rs b/candle-transformers/src/models/quantized_t5.rs index 9f770d69d9..4fc9c537f8 100644 --- a/candle-transformers/src/models/quantized_t5.rs +++ b/candle-transformers/src/models/quantized_t5.rs @@ -11,9 +11,9 @@ //! - Support for 8-bit quantization //! //! References: -//! - [T5 Paper](https://arxiv.org/abs/1910.10683) -//! - [Model Card](https://huggingface.co/t5-base) -//! - Original model from [T5](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py) +//! - πŸ“ [T5 Paper](https://arxiv.org/abs/1910.10683) +//! - πŸ€— [Model Card](https://huggingface.co/t5-base) +//! - πŸ€— Original model from [T5](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py) use crate::models::t5::{deserialize_feed_forward_proj_activation, ActivationWithOptionalGating}; use crate::models::with_tracing::QMatMul; diff --git a/candle-transformers/src/models/qwen2.rs b/candle-transformers/src/models/qwen2.rs index 8dbca36b3e..8a29646efe 100644 --- a/candle-transformers/src/models/qwen2.rs +++ b/candle-transformers/src/models/qwen2.rs @@ -11,8 +11,7 @@ //! - Support for 8-bit quantization //! //! References: -//! - [Qwen2 Model](https://huggingface.co/Qwen/Qwen2-7B) -//! - [Model Card](https://huggingface.co/Qwen/Qwen2-7B) +//! - πŸ€— [Qwen2 Model](https://huggingface.co/Qwen/Qwen2-7B) //! use crate::models::with_tracing::{linear, linear_no_bias, Linear, RmsNorm}; diff --git a/candle-transformers/src/models/repvgg.rs b/candle-transformers/src/models/repvgg.rs index a6ffce0d6d..6e45c2d68c 100644 --- a/candle-transformers/src/models/repvgg.rs +++ b/candle-transformers/src/models/repvgg.rs @@ -1,8 +1,5 @@ //! RepVGG inference implementation //! -//! See "RepVGG: Making VGG-style ConvNets Great Again" Ding et al. 2021 -//! https://arxiv.org/abs/2101.03697 -//! //! Key characteristics: //! - Efficient inference architecture through structural reparameterization //! - Single 3x3 conv layer after fusing 3x3 branch, 1x1 branch and identity branch @@ -10,7 +7,7 @@ //! - High accuracy with VGG-like plain architecture and training //! //! References: -//! - [RepVGG Paper](https://arxiv.org/abs/2101.03697) +//! - [RepVGG Paper](https://arxiv.org/abs/2101.03697). RepVGG: Making VGG-style ConvNets Great Again //! - [Official Implementation](https://github.com/DingXiaoH/RepVGG) //! diff --git a/candle-transformers/src/models/siglip.rs b/candle-transformers/src/models/siglip.rs index 2046401428..932970ed3b 100644 --- a/candle-transformers/src/models/siglip.rs +++ b/candle-transformers/src/models/siglip.rs @@ -3,7 +3,7 @@ //! Siglip architecture combining vision and language for zero-shot tasks. //! //! References: -//! - [Model Card](https://huggingface.co/google/siglip-base-patch16-224) +//! - πŸ€— [Model Card](https://huggingface.co/google/siglip-base-patch16-224) //! use crate::models::clip::div_l2_norm; diff --git a/candle-transformers/src/models/stable_diffusion/clip.rs b/candle-transformers/src/models/stable_diffusion/clip.rs index 2f631248bc..4c3f9d512d 100644 --- a/candle-transformers/src/models/stable_diffusion/clip.rs +++ b/candle-transformers/src/models/stable_diffusion/clip.rs @@ -3,7 +3,7 @@ //! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on //! pairs of images with related texts. //! -//! 
https://github.com/openai/CLIP +//! - [CLIP](https://github.com/openai/CLIP) use candle::{DType, Device, Result, Tensor, D}; use candle_nn as nn; use candle_nn::Module; diff --git a/candle-transformers/src/models/stable_diffusion/ddpm.rs b/candle-transformers/src/models/stable_diffusion/ddpm.rs index d393f39aac..42a0dc7e17 100644 --- a/candle-transformers/src/models/stable_diffusion/ddpm.rs +++ b/candle-transformers/src/models/stable_diffusion/ddpm.rs @@ -104,7 +104,7 @@ impl DDPMScheduler { }; let current_beta_t = 1. - alpha_prod_t / alpha_prod_t_prev; - // For t > 0, compute predicted variance Ξ²t (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + // For t > 0, compute predicted variance Ξ²t (see formula (6) and (7) from [the pdf](https://arxiv.org/pdf/2006.11239.pdf)) // and sample from it to get previous sample // x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample let variance = (1. - alpha_prod_t_prev) / (1. - alpha_prod_t) * current_beta_t; diff --git a/candle-transformers/src/models/stable_diffusion/euler_ancestral_discrete.rs b/candle-transformers/src/models/stable_diffusion/euler_ancestral_discrete.rs index 9576c2de40..edd5eb508b 100644 --- a/candle-transformers/src/models/stable_diffusion/euler_ancestral_discrete.rs +++ b/candle-transformers/src/models/stable_diffusion/euler_ancestral_discrete.rs @@ -1,12 +1,7 @@ //! Ancestral sampling with Euler method steps. //! -//! Reference implementation in Rust: -//! -//! https://github.com/pykeio/diffusers/blob/250b9ad1898af41e76a74c0d8d4292652823338a/src/schedulers/euler_ancestral_discrete.rs -//! -//! Based on the original [`k-diffusion` implementation by Katherine Crowson][kd]. +//! Based on the original [`k-diffusion` implementation by Katherine Crowson]( https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72). /// -/// [kd]: https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72 use super::{ schedulers::{ betas_for_alpha_bar, BetaSchedule, PredictionType, Scheduler, SchedulerConfig, @@ -29,7 +24,7 @@ pub struct EulerAncestralDiscreteSchedulerConfig { pub steps_offset: usize, /// prediction type of the scheduler function, one of `epsilon` (predicting /// the noise of the diffusion process), `sample` (directly predicting the noisy sample`) - /// or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) + /// or `v_prediction` (see [section 2.4](https://imagen.research.google/video/paper.pdf)) pub prediction_type: PredictionType, /// number of diffusion steps used to train the model pub train_timesteps: usize, diff --git a/candle-transformers/src/models/stable_diffusion/mod.rs b/candle-transformers/src/models/stable_diffusion/mod.rs index 458a7de2d4..6d89f9cd43 100644 --- a/candle-transformers/src/models/stable_diffusion/mod.rs +++ b/candle-transformers/src/models/stable_diffusion/mod.rs @@ -3,9 +3,9 @@ //! Stable Diffusion is a latent text-to-image diffusion model capable of //! generating photo-realistic images given any text input. //! -//! - [Original Repository](https://github.com/CompVis/stable-diffusion) -//! - [Hugging Face](https://huggingface.co/runwayml/stable-diffusion-v1-5) -//! - The default scheduler for the v1.5, v2.1 and XL 1.0 version is the Denoising Diffusion Implicit Model scheduler (DDIM). The original paper and some code can be found in the [associated repo](https://github.com/ermongroup/ddim). 
The default scheduler for the XL Turbo version is the Euler Ancestral scheduler. +//! - πŸ’» [Original Repository](https://github.com/CompVis/stable-diffusion) +//! - πŸ€— [Hugging Face](https://huggingface.co/runwayml/stable-diffusion-v1-5) +//! - The default scheduler for the v1.5, v2.1 and XL 1.0 version is the Denoising Diffusion Implicit Model scheduler (DDIM). The original paper and some code can be found in the [associated repo](https://github.com/ermongroup/ddim). The default scheduler for the XL Turbo version is the Euler Ancestral scheduler. //! //! //! # Example diff --git a/candle-transformers/src/models/stable_diffusion/resnet.rs b/candle-transformers/src/models/stable_diffusion/resnet.rs index 5df04a8b44..5cca7edd30 100644 --- a/candle-transformers/src/models/stable_diffusion/resnet.rs +++ b/candle-transformers/src/models/stable_diffusion/resnet.rs @@ -3,7 +3,8 @@ //! Some Residual Network blocks used in UNet models. //! //! Denoising Diffusion Implicit Models, K. He and al, 2015. -//! https://arxiv.org/abs/1512.03385 +//! - [Paper](https://arxiv.org/abs/1512.03385) +//! use crate::models::with_tracing::{conv2d, Conv2d}; use candle::{Result, Tensor, D}; use candle_nn as nn; diff --git a/candle-transformers/src/models/stable_diffusion/schedulers.rs b/candle-transformers/src/models/stable_diffusion/schedulers.rs index 94f8ab86f7..1d39037f8f 100644 --- a/candle-transformers/src/models/stable_diffusion/schedulers.rs +++ b/candle-transformers/src/models/stable_diffusion/schedulers.rs @@ -43,7 +43,7 @@ pub enum PredictionType { /// Time step spacing for the diffusion process. /// -/// "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 +/// "linspace", "leading", "trailing" corresponds to annotation of Table 2. of the [paper](https://arxiv.org/abs/2305.08891) #[derive(Debug, Clone, Copy)] pub enum TimestepSpacing { Leading, diff --git a/candle-transformers/src/models/stable_lm.rs b/candle-transformers/src/models/stable_lm.rs index c5dbd3958d..536f7727e4 100644 --- a/candle-transformers/src/models/stable_lm.rs +++ b/candle-transformers/src/models/stable_lm.rs @@ -10,7 +10,7 @@ //! - Support for different model sizes (3B, 7B) //! //! References: -//! - [Model Card](https://huggingface.co/stabilityai/stablelm-3b-4e1t) +//! - πŸ€— [Model Card](https://huggingface.co/stabilityai/stablelm-3b-4e1t) //! use crate::models::with_tracing::{linear, linear_no_bias, Linear}; diff --git a/candle-transformers/src/models/starcoder2.rs b/candle-transformers/src/models/starcoder2.rs index 0df5990b89..266221e5c8 100644 --- a/candle-transformers/src/models/starcoder2.rs +++ b/candle-transformers/src/models/starcoder2.rs @@ -11,8 +11,8 @@ //! - Support for 8-bit quantization //! //! References: -//! - [StarCoder Paper](https://arxiv.org/abs/2305.06161) -//! - [Model Card](https://huggingface.co/bigcode/starcoder) +//! - πŸ“ [StarCoder Paper](https://arxiv.org/abs/2305.06161) +//! - πŸ€— [Model Card](https://huggingface.co/bigcode/starcoder) //! use candle::{DType, Device, Module, Result, Tensor, D}; diff --git a/candle-transformers/src/models/t5.rs b/candle-transformers/src/models/t5.rs index d3fd2ba686..5d23549f21 100644 --- a/candle-transformers/src/models/t5.rs +++ b/candle-transformers/src/models/t5.rs @@ -11,9 +11,10 @@ //! - Support for sequence-to-sequence tasks //! //! References: -//! - [T5 Paper](https://arxiv.org/abs/1910.10683) -//! - [HuggingFace T5](https://huggingface.co/docs/transformers/model_doc/t5) -//! 
- [GH Model](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py) +//! - ⚑ [Interactive Wasm Example](https://huggingface.co/spaces/radames/Candle-T5-Generation-Wasm) +//! - πŸ’»[GH Model](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py) +//! - πŸ€— [HF Link](https://huggingface.co/docs/transformers/model_doc/t5) +//! - πŸ“ [T5 Paper](https://arxiv.org/abs/1910.10683) //! //! # Encoder-decoder example: //! diff --git a/candle-transformers/src/models/whisper/mod.rs b/candle-transformers/src/models/whisper/mod.rs index 6123884ae4..d7082ea6d8 100644 --- a/candle-transformers/src/models/whisper/mod.rs +++ b/candle-transformers/src/models/whisper/mod.rs @@ -1,10 +1,14 @@ //! Whisper Model Implementation //! //! Whisper is an automatic speech recognition (ASR) system trained on large amounts -//! of multilingual and multitask supervised data collected from the web. +//! of multilingual and multitask supervised data collected from the web. It can be used to +//! convert audio files (in the `.wav` format) to text. Supported features include +//! language detection as well as multilingual speech recognition. +//! +//! - ⚑ [Interactive Wasm Example](https://huggingface.co/spaces/lmz/candle-whisper) +//! - πŸ’» [GH Link](https://github.com/openai/whisper) +//! - πŸ’» Transformers Python [reference implementation](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py) //! -//! - [GH Link](https://github.com/openai/whisper) -//! - Transformers Python [reference implementation](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py) //! pub mod audio; pub mod model; diff --git a/candle-transformers/src/models/wuerstchen/mod.rs b/candle-transformers/src/models/wuerstchen/mod.rs index 9bb37a3bcc..ae42c4a884 100644 --- a/candle-transformers/src/models/wuerstchen/mod.rs +++ b/candle-transformers/src/models/wuerstchen/mod.rs @@ -3,10 +3,17 @@ //! WΓΌrstchen is an efficient diffusion model architecture for generating images using //! a two-stage approach with a small decoder and prior network. //! -//! - [Paper Link](https://openreview.net/pdf?id=gU58AyJlYz) -//! - [GH Link](https://github.com/dome272/Wuerstchen) -//! - [Reference Implementation](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py) +//! - πŸ’» [GH Link](https://github.com/dome272/Wuerstchen) +//! - πŸ€— [HF Link](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py) +//! - πŸ“ [Paper](https://openreview.net/pdf?id=gU58AyJlYz) //! +//! ## Example +//! +//!
+//! +//!

"Anthropomorphic cat dressed as a fire fighter"

+//!
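+//! The candle repository ships a `wuerstchen` example that generates images like the one
+//! captioned above. The invocation below is only a sketch; the `--prompt` flag and any
+//! defaults are assumptions, so check the example's `--help` output for the exact CLI.
+//!
+//! ```bash
+//! cargo run --example wuerstchen --release -- \
+//!   --prompt "Anthropomorphic cat dressed as a fire fighter"
+//! ```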
+ pub mod attention_processor; pub mod common; pub mod ddpm; diff --git a/candle-transformers/src/models/yi.rs b/candle-transformers/src/models/yi.rs index 047ea77046..8a2fb111be 100644 --- a/candle-transformers/src/models/yi.rs +++ b/candle-transformers/src/models/yi.rs @@ -1,7 +1,12 @@ //! Yi model implementation. //! -//! Yi is a decoder-only large language model trained by 01.AI. -//! It follows a standard transformer architecture similar to Llama. +//! This candle implementation uses a pre-trained Yi decoder-only large language model for inference. +//! The model was trained by 01.AI and follows a standard transformer architecture similar to LLaMA. +//! +//! Original code: +//! - πŸ’» [Yi Model](https://huggingface.co/01-ai/Yi-6B) +//! - πŸ’» [Yi Modeling Code](https://huggingface.co/01-ai/Yi-6B/blob/main/modeling_yi.py) +//! - πŸ“ [Technical Report](https://arxiv.org/abs/2403.04652) Yi: Open Foundation Models by 01.AI //! //! Key characteristics: //! - Multi-head attention with rotary positional embeddings @@ -9,9 +14,6 @@ //! - SwiGLU activation in feed-forward layers //! - Grouped-query attention for efficient inference //! -//! References: -//! - [Yi Model](https://huggingface.co/01-ai/Yi-6B) -//! - [Hugging Face](https://huggingface.co/01-ai/Yi-6B/blob/main/modeling_yi.py) use crate::models::with_tracing::{linear_no_bias, Linear, RmsNorm}; use candle::{DType, Device, Module, Result, Tensor, D};