[DOCS] Add weightless caching docs - porting 28253 (#28990)
Porting: #28253
sgolebiewski-intel authored Feb 14, 2025
1 parent 98ad409 commit 332e2ba
Showing 3 changed files with 77 additions and 13 deletions.
35 changes: 30 additions & 5 deletions docs/articles_en/assets/snippets/ov_caching.cpp
@@ -61,12 +61,36 @@ bool cachingSupported = std::find(caps.begin(), caps.end(), ov::device::capabili
}

void part4() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
ov::Core core; // Step 1: create ov::Core object
bool hasGPU = false; // Step 1a: Check if GPU is available
auto devices = core.get_available_devices();
for (auto&& supported : devices) {
hasGPU |= supported.find(device) != std::string::npos;
}
if(!hasGPU) {
return;
}
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
//! [ov:caching:part4]
// Note: model path needs to point to the *.xml file, not *.bin when using the IR model format.
auto compiled = core.compile_model(modelPath,
device,
ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
//! [ov:caching:part4]
if (!compiled) {
throw std::runtime_error("error");
}
}

void part5() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "CPU";
ov::Core core; // Step 1: create ov::Core object
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
auto model = core.read_model(modelPath); // Step 2: Read Model
//! [ov:caching:part4]
//! [ov:caching:part5]
ov::AnyMap config;
ov::EncryptionCallbacks encryption_callbacks;
static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F};
@@ -84,13 +108,13 @@ encryption_callbacks.encrypt = codec_xor;
encryption_callbacks.decrypt = codec_xor;
config.insert(ov::cache_encryption_callbacks(encryption_callbacks)); // Step 4: Set device configuration
auto compiled = core.compile_model(model, device, config); // Step 5: LoadNetwork
//! [ov:caching:part4]
//! [ov:caching:part5]
if (!compiled) {
throw std::runtime_error("error");
}
}

void part5() {
void part6() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
ov::Core core; // Step 1: create ov::Core object
@@ -103,7 +127,7 @@ void part5() {
return;
}
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
//! [ov:caching:part5]
//! [ov:caching:part6]
static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F};
auto codec_xor = [&](const std::string& source_str) {
auto key_size = sizeof(codec_key);
@@ -119,7 +143,7 @@ auto compiled = core.compile_model(modelPath,
device,
ov::cache_encryption_callbacks(ov::EncryptionCallbacks{codec_xor, codec_xor}),
ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); // Step 5: Compile model
//! [ov:caching:part5]
//! [ov:caching:part6]
if (!compiled) {
throw std::runtime_error("error");
}
@@ -133,6 +157,7 @@ int main() {
part3();
part4();
part5();
part6();
} catch (...) {
}
return 0;
16 changes: 13 additions & 3 deletions docs/articles_en/assets/snippets/ov_caching.py
@@ -44,6 +44,16 @@
# ! [ov:caching:part3]

# ! [ov:caching:part4]
core = ov.Core()
if "GPU" in core.available_devices:
core.set_property({props.cache_dir: path_to_cache_dir})
config_cache = {}
config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE"
# Note: model path needs to point to the *.xml file, not *.bin when using the IR model format.
compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache)
# ! [ov:caching:part4]

# ! [ov:caching:part5]
import base64

def encrypt_base64(src):
@@ -58,9 +68,9 @@ def decrypt_base64(src):
config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64]
model = core.read_model(model=model_path)
compiled_model = core.compile_model(model=model, device_name=device_name, config=config_cache)
# ! [ov:caching:part4]

# ! [ov:caching:part5]

# ! [ov:caching:part6]
import base64

def encrypt_base64(src):
@@ -76,4 +86,4 @@ def decrypt_base64(src):
config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64]
config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE"
compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache)
# ! [ov:caching:part5]
# ! [ov:caching:part6]
@@ -140,6 +140,35 @@ model caching, use the following code in your application:
:language: cpp
:fragment: [ov:caching:part3]

Set the ``CacheMode`` property to ``OPTIMIZE_SIZE`` to enable weightless caching
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Weightless caching creates a cache file that does not contain the model weights. Instead, the weights are loaded from the original model file, which significantly reduces the size of the cache file.

.. tab-set::

.. tab-item:: Python
:sync: py

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part4]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part4]

.. important::

   Currently, this property is supported only by the GPU Plugin and only with the IR model format.

.. important::

   Some weights that undergo transformations during model compilation may not be eligible for weightless caching. In such cases, the cache file contains these weights, while the weightless caching mechanism is still used for the rest. The feature supports some of the common transformations and replicates them after the model is loaded from the cache.
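
The flow in the snippets above can be sketched end to end as follows. This is a minimal sketch, not part of the commit: the helper name and paths are hypothetical, and an installed OpenVINO runtime with an available GPU device is assumed.

```python
# Weightless cache mode: the topology is cached, the weights are not.
WEIGHTLESS_CONFIG = {"CACHE_MODE": "OPTIMIZE_SIZE"}

def compile_weightless(model_xml: str, cache_dir: str):
    """Compile `model_xml` on GPU with a weightless model cache (hypothetical helper)."""
    import openvino as ov                    # imported lazily; requires an OpenVINO install
    import openvino.properties as props

    core = ov.Core()
    if "GPU" not in core.available_devices:  # the property is currently GPU-only
        raise RuntimeError("No GPU device available")
    core.set_property({props.cache_dir: cache_dir})
    # Pass the *.xml path (not *.bin): a weightless cache re-reads the weights
    # from the original IR every time the model is loaded from the cache.
    return core.compile_model(model_xml, "GPU", WEIGHTLESS_CONFIG)
```

On the first call the compiled blob is written to ``cache_dir``; subsequent calls load it from there, pulling weights from the original IR.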

Enable cache encryption
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

@@ -154,16 +183,16 @@ loading it from the cache. Currently, this property can be set only in ``compile_model``.

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part4]
:fragment: [ov:caching:part5]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part4]
:fragment: [ov:caching:part5]

Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``.
If model caching is enabled in the GPU Plugin, the model topology can be encrypted while it is saved to the cache and decrypted when it is loaded from the cache. Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``.
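
As a toy illustration of the callback contract (a sketch only, mirroring the base64 pair from the Python snippet in this commit; base64 is an encoding, not real encryption), the two callbacks simply need to be inverses of each other:

```python
import base64

def encrypt_base64(src: str) -> str:
    # Invoked by the runtime before the cache blob is written to disk.
    return base64.b64encode(src.encode("utf-8")).decode("ascii")

def decrypt_base64(src: str) -> str:
    # Invoked by the runtime after the cache blob is read back.
    return base64.b64decode(src.encode("ascii")).decode("utf-8")

# Passed via compile_model(..., config=config_cache); full encryption of the
# topology additionally requires CACHE_MODE=OPTIMIZE_SIZE on the GPU Plugin.
config_cache = {
    "CACHE_ENCRYPTION_CALLBACKS": [encrypt_base64, decrypt_base64],
    "CACHE_MODE": "OPTIMIZE_SIZE",
}
```

For real deployments, swap the base64 pair for an actual cipher; any pair of functions satisfying ``decrypt(encrypt(s)) == s`` fits the interface.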

.. tab-set::

@@ -172,14 +201,14 @@ Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part5]
:fragment: [ov:caching:part6]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part5]
:fragment: [ov:caching:part6]

.. important::

