From 88419cfc683bb9b680d51fb958830f5fa8e51f0c Mon Sep 17 00:00:00 2001
From: Christos Hadjiaslanis
Date: Wed, 15 May 2024 21:24:34 +0100
Subject: [PATCH] Added documentation for ML on Linera

---
 src/SUMMARY.md         |   4 ++
 src/experimental.md    |   5 ++
 src/experimental/ml.md | 154 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 163 insertions(+)
 create mode 100644 src/experimental.md
 create mode 100644 src/experimental/ml.md

diff --git a/src/SUMMARY.md b/src/SUMMARY.md
index 87126ec8..aaa1a6d1 100644
--- a/src/SUMMARY.md
+++ b/src/SUMMARY.md
@@ -37,6 +37,10 @@
 - [Creating New Blocks](advanced_topics/block_creation.md)
 - [Applications that Handle Assets](advanced_topics/assets.md)
 
+- [Experimental](experimental.md)
+
+  - [Machine Learning](experimental/ml.md)
+
 - [Appendix](appendix.md)
   - [Glossary](appendix/glossary.md)
   - [Videos](appendix/videos.md)

diff --git a/src/experimental.md b/src/experimental.md
new file mode 100644
index 00000000..f12065f8
--- /dev/null
+++ b/src/experimental.md
@@ -0,0 +1,5 @@

# Experimental Topics

In this section, we present experimental topics related to the Linera protocol.

These are still in the works and subject to frequent breaking changes.

diff --git a/src/experimental/ml.md b/src/experimental/ml.md
new file mode 100644
index 00000000..a59ed3ef
--- /dev/null
+++ b/src/experimental/ml.md
@@ -0,0 +1,154 @@

# Machine Learning on Linera

The Linera application contract / service split allows for securely and
efficiently running machine learning models on the edge.

The application's contract retrieves the correct model with all the correctness
guarantees enforced by the consensus algorithm, while the client performs
inference off-chain, in the unmetered service. Since the service runs on the
user's own hardware, it can be implicitly trusted.

## Guidelines

The existing examples use the [`candle`](https://github.com/huggingface/candle)
framework by [Hugging Face](https://huggingface.co/) as the underlying ML
framework.

`candle` is a minimalist ML framework for Rust with a focus on performance and
usability. It also compiles to Wasm and has great support for Wasm both inside
and outside the browser. Check candle's
[examples](https://github.com/huggingface/candle/tree/main/candle-wasm-examples)
for inspiration on the types of models which are supported.

### Getting Started

To add ML capabilities to your existing Linera project, you'll need to add the
`candle-core`, `getrandom` and `rand` dependencies to your Linera project:

```toml
candle-core = "0.4.1"
getrandom = { version = "0.2.12", default-features = false, features = ["custom"] }
rand = "0.8.5"
```

Optionally, to run Large Language Models, you'll also need the
`candle-transformers` and `tokenizers` crates:

```toml
candle-transformers = "0.4.1"
tokenizers = { git = "https://github.com/christos-h/tokenizers", default-features = false, features = ["unstable_wasm"] }
```
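Once these dependencies are in place, a quick way to confirm that `candle`
compiles and runs in your project is a tiny tensor computation. The following
is a minimal sketch using only `candle-core`; the function name is illustrative
and not part of the Linera SDK or the examples:

```rust
use candle_core::{Device, Tensor};

// Minimal smoke test for `candle-core`: multiply two small matrices on the CPU.
// Illustrative sketch only; not part of the Linera SDK.
fn candle_smoke_test() -> candle_core::Result<Tensor> {
    let a = Tensor::new(&[[1f32, 2.], [3., 4.]], &Device::Cpu)?;
    let b = Tensor::new(&[[5f32, 6.], [7., 8.]], &Device::Cpu)?;
    a.matmul(&b)
}
```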
### Providing Randomness

ML frameworks use random numbers to perform inference. Linera services run in a
Wasm VM which does not have access to the operating system's random number
generator. For this reason, we need to manually seed the RNG used by `candle`.
We do this by registering a custom `getrandom` implementation.

Create a file under `src/random.rs` and add the following:

```rust
use std::sync::{Mutex, OnceLock};

use rand::{rngs::StdRng, Rng, SeedableRng};

static RNG: OnceLock<Mutex<StdRng>> = OnceLock::new();

fn custom_getrandom(buf: &mut [u8]) -> Result<(), getrandom::Error> {
    let seed = [0u8; 32];
    RNG.get_or_init(|| Mutex::new(StdRng::from_seed(seed)))
        .lock()
        .expect("failed to get RNG lock")
        .fill(buf);
    Ok(())
}

getrandom::register_custom_getrandom!(custom_getrandom);
```

This gives `candle`, and any other crate which relies on `getrandom`, access to
a deterministic RNG. If deterministic behaviour is not desired, the System API
can be used to seed the RNG from a timestamp.

### Loading the model into the Service

Models cannot currently be saved on-chain; for more information see the
Limitations section below.

To perform model inference, the model must be loaded into the service. To do
this, we'll use the `fetch_url` API when a query is made against the service:

```rust
impl Service for MyService {
    async fn handle_query(&self, request: Request) -> Response {
        // do some stuff here
        let raw_weights = self.runtime.fetch_url("https://my-model-provider.com/model.bin");
        // do more stuff here
    }
}
```

The model can be served from a local webserver or pulled directly from a model
provider such as Hugging Face.

At this point, we have the raw bytes which correspond to the model and
tokenizer. `candle` supports multiple formats for storing model weights, both
quantized and not (`gguf`, `ggml`, `safetensors`, etc.).

Depending on the model format that you're using, `candle` exposes convenience
functions to convert the bytes into a typed `struct` which can then be used to
perform inference. Below is an example for a non-quantized Llama 2 model:

```rust
fn load_llama_model(cursor: &mut Cursor<Vec<u8>>) -> Result<(Llama, Cache), candle_core::Error> {
    let config = llama2_c::Config::from_reader(cursor)?;
    let weights =
        llama2_c_weights::TransformerWeights::from_reader(cursor, &config, &Device::Cpu)?;
    let vb = weights.var_builder(&config, &Device::Cpu)?;
    let cache = llama2_c::Cache::new(true, &config, vb.pp("rot"))?;
    let llama = Llama::load(vb, config.clone())?;
    Ok((llama, cache))
}
```

### Inference

Performing inference using `candle` is not a 'one-size-fits-all' process.
Different models require different logic to perform inference, so the specifics
are beyond the scope of this document; a rough skeleton of the common
service-side flow is sketched after the list below.

Luckily, there are multiple examples which can be used as guidelines on how to
perform inference in Wasm:

- [LLM Stories](https://github.com/linera-io/linera-protocol/tree/main/examples/llm)
- [Generative NFTs](https://github.com/linera-io/linera-protocol/tree/main/examples/gen-nft)
- [Candle Wasm Examples](https://github.com/huggingface/candle/tree/main/candle-wasm-examples)
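Even so, the service-side flow usually has the same shape: turn the fetched
bytes into a typed model and tokenizer, encode the prompt, then run the
model-specific generation loop. The sketch below ties the earlier snippets
together under some assumptions: it reuses the `load_llama_model` helper shown
above, the function name and error handling are illustrative, and the
generation loop itself is elided because it differs per model:

```rust
use std::io::Cursor;

use candle_transformers::models::llama2_c::{Cache, Llama};
use tokenizers::Tokenizer;

// Illustrative sketch of the service-side flow. `raw_weights` and
// `raw_tokenizer` are assumed to be the bytes returned by `fetch_url` above,
// and `load_llama_model` is the helper defined earlier in this document.
fn prepare_inference(
    raw_weights: Vec<u8>,
    raw_tokenizer: Vec<u8>,
    prompt: &str,
) -> Result<Vec<u32>, Box<dyn std::error::Error>> {
    // Convert the raw weights into a typed model.
    let mut cursor = Cursor::new(raw_weights);
    let (_llama, _cache): (Llama, Cache) = load_llama_model(&mut cursor)?;

    // Build the tokenizer and encode the prompt into token ids.
    let tokenizer = Tokenizer::from_bytes(&raw_tokenizer)?;
    let prompt_ids = tokenizer.encode(prompt, true)?.get_ids().to_vec();

    // The model-specific generation loop goes here: repeatedly run the model's
    // forward pass over the accumulated ids, sample the next token from the
    // logits, and stop at the end-of-sequence token. See the linked examples
    // for complete implementations.
    Ok(prompt_ids)
}
```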
## Limitations

### Hardware Acceleration

Although SIMD instructions _are_ supported by the service runtime,
general-purpose GPU hardware acceleration is
[currently not supported](https://github.com/linera-io/linera-protocol/issues/1931).
Therefore, local inference performance degrades for larger models.

### On-Chain Models

Due to block-size constraints, models need to be stored off-chain until the
introduction of the
[Blob API](https://github.com/linera-io/linera-protocol/issues/1981). The Blob
API will enable large binary blobs to be stored on-chain, with their
correctness and availability guaranteed by the validators.

### Maximum Model Size

The maximum size of a model which can be loaded into an application's service
is currently constrained by:

1. The addressable memory of the service's Wasm runtime being 4 GiB.
2. The inability to load models directly onto the GPU.

It is recommended to use smaller models (50 MB to 100 MB) at the current stage
of development.