From d109b365824c36b418ee58e987fb51063caba98e Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Wed, 27 Dec 2023 16:07:28 +0100 Subject: [PATCH 1/8] Refactor the agent This commit broadly refactors the agent to: - use kube Controller construct - take advantage of Server Side Apply - prepare for resource split and CDI+DRA - don't put everything under a util directory - use closer to kube upstream kube client - update proto definitions for device plugins - use kubelet pod resources monitoring interface rather than CRI to do slot reconciliation - Use CRD definition in Rust code to generate yaml file Signed-off-by: Nicolas Belouin --- Cargo.lock | 321 +- agent/Cargo.toml | 12 +- agent/build.rs | 12 +- agent/proto/pluginapi.proto | 352 +- agent/proto/pluginregistration.proto | 50 + agent/proto/podresources.proto | 67 + agent/src/device_manager/cdi.rs | 169 + agent/src/device_manager/in_memory.rs | 175 + agent/src/device_manager/mod.rs | 12 + .../discovery_handler_registry.rs | 983 ++++++ .../discovery_property_solver.rs | 727 ++++ .../embedded_handler.rs | 174 + agent/src/discovery_handler_manager/mod.rs | 105 + .../registration_socket.rs | 345 ++ agent/src/main.rs | 111 +- .../device_plugin_instance_controller.rs | 1517 ++++++++ .../plugin_manager/device_plugin_runner.rs | 248 ++ .../device_plugin_slot_reclaimer.rs | 98 + agent/src/plugin_manager/mod.rs | 7 + .../src/plugin_manager/pluginregistration.rs | 260 ++ agent/src/plugin_manager/v1.rs | 432 +++ agent/src/{util => plugin_manager}/v1beta1.rs | 150 +- agent/src/util/config_action.rs | 684 ---- agent/src/util/constants.rs | 76 - agent/src/util/crictl_containers.rs | 160 - agent/src/util/device_plugin_builder.rs | 437 --- agent/src/util/device_plugin_service.rs | 3129 ----------------- .../discovery_configuration_controller.rs | 638 ++++ agent/src/util/discovery_operator.rs | 2960 ---------------- agent/src/util/embedded_discovery_handlers.rs | 138 - agent/src/util/mod.rs | 16 +- agent/src/util/registration.rs | 
460 --- agent/src/util/slot_reconciliation.rs | 769 ---- agent/src/util/stopper.rs | 99 + agent/src/util/streaming_extension.rs | 31 - build/containers/Dockerfile.rust | 2 +- controller/Cargo.toml | 6 +- controller/src/util/instance_action.rs | 6 +- controller/src/util/node_watcher.rs | 8 +- controller/src/util/pod_watcher.rs | 8 +- deployment/helm/crds/akri-instance-crd.yaml | 104 +- deployment/helm/templates/agent.yaml | 5 + deployment/helm/templates/rbac.yaml | 2 +- deployment/helm/values.yaml | 2 + shared/Cargo.toml | 7 +- shared/src/akri/configuration.rs | 16 +- shared/src/akri/instance.rs | 75 +- shared/src/gen_crds.rs | 9 + shared/src/k8s/crud.rs | 147 + shared/src/k8s/job.rs | 2 + shared/src/k8s/mod.rs | 5 + shared/src/os/env_var.rs | 2 +- test/e2e/test_core.py | 3 +- test/json/local-instance-list.json | 2 + test/json/local-instance.json | 2 + test/json/shared-instance-list-slots.json | 2 + test/json/shared-instance-list.json | 2 + test/json/shared-instance-update.json | 2 + test/json/shared-instance.json | 2 + test/yaml/akri-instance-onvif-camera.yaml | 2 + test/yaml/akri-instance-usb-camera.yaml | 2 + 61 files changed, 7187 insertions(+), 9162 deletions(-) create mode 100644 agent/proto/pluginregistration.proto create mode 100644 agent/proto/podresources.proto create mode 100644 agent/src/device_manager/cdi.rs create mode 100644 agent/src/device_manager/in_memory.rs create mode 100644 agent/src/device_manager/mod.rs create mode 100644 agent/src/discovery_handler_manager/discovery_handler_registry.rs create mode 100644 agent/src/discovery_handler_manager/discovery_property_solver.rs create mode 100644 agent/src/discovery_handler_manager/embedded_handler.rs create mode 100644 agent/src/discovery_handler_manager/mod.rs create mode 100644 agent/src/discovery_handler_manager/registration_socket.rs create mode 100644 agent/src/plugin_manager/device_plugin_instance_controller.rs create mode 100644 agent/src/plugin_manager/device_plugin_runner.rs create mode 
100644 agent/src/plugin_manager/device_plugin_slot_reclaimer.rs create mode 100644 agent/src/plugin_manager/mod.rs create mode 100644 agent/src/plugin_manager/pluginregistration.rs create mode 100644 agent/src/plugin_manager/v1.rs rename agent/src/{util => plugin_manager}/v1beta1.rs (84%) delete mode 100644 agent/src/util/config_action.rs delete mode 100644 agent/src/util/constants.rs delete mode 100644 agent/src/util/crictl_containers.rs delete mode 100644 agent/src/util/device_plugin_builder.rs delete mode 100644 agent/src/util/device_plugin_service.rs create mode 100644 agent/src/util/discovery_configuration_controller.rs delete mode 100644 agent/src/util/discovery_operator.rs delete mode 100644 agent/src/util/embedded_discovery_handlers.rs delete mode 100644 agent/src/util/registration.rs delete mode 100644 agent/src/util/slot_reconciliation.rs create mode 100644 agent/src/util/stopper.rs delete mode 100644 agent/src/util/streaming_extension.rs create mode 100644 shared/src/gen_crds.rs create mode 100644 shared/src/k8s/crud.rs diff --git a/Cargo.lock b/Cargo.lock index 335c5ad79..20d85662b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -237,8 +237,9 @@ dependencies = [ "env_logger", "futures", "hyper", - "k8s-openapi", - "kube", + "itertools 0.12.1", + "k8s-openapi 0.20.0", + "kube 0.87.2", "kube-runtime", "lazy_static", "log", @@ -252,6 +253,7 @@ dependencies = [ "serde_json", "serde_yaml 0.8.26", "tempfile", + "thiserror", "tokio", "tokio-stream", "tonic", @@ -377,8 +379,8 @@ dependencies = [ "async-trait", "either", "env_logger", - "k8s-openapi", - "kube", + "k8s-openapi 0.20.0", + "kube 0.87.2", "log", "mockall", "prometheus", @@ -431,6 +433,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -957,8 +965,8 @@ 
dependencies = [ "chrono", "env_logger", "futures", - "k8s-openapi", - "kube", + "k8s-openapi 0.20.0", + "kube 0.87.2", "kube-runtime", "lazy_static", "log", @@ -1051,8 +1059,18 @@ version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.14.4", + "darling_macro 0.14.4", +] + +[[package]] +name = "darling" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5d6b04b3fd0ba9926f945895de7d806260a2d7431ba82e7edaecb043c4c6b8" +dependencies = [ + "darling_core 0.20.5", + "darling_macro 0.20.5", ] [[package]] @@ -1069,17 +1087,42 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "darling_core" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e48a959bcd5c761246f5d090ebc2fbf7b9cd527a492b07a67510c108f1e7e3" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.48", +] + [[package]] name = "darling_macro" version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" dependencies = [ - "darling_core", + "darling_core 0.14.4", "quote", "syn 1.0.109", ] +[[package]] +name = "darling_macro" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1545d67a2149e1d93b7e5c7752dce5a7426eb5d1357ddcfd89336b94444f77" +dependencies = [ + "darling_core 0.20.5", + "quote", + "syn 2.0.48", +] + [[package]] name = "data-encoding" version = "2.5.0" @@ -1515,6 +1558,10 @@ name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "headers" @@ 
-1664,6 +1711,22 @@ dependencies = [ "tower-layer", ] +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "log", + "rustls", + "rustls-native-certs", + "tokio", + "tokio-rustls", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -1798,6 +1861,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.10" @@ -1824,9 +1896,9 @@ dependencies = [ [[package]] name = "json-patch" -version = "0.3.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e712e62827c382a77b87f590532febb1f8b2fdbc3eefa1ee37fe7281687075ef" +checksum = "55ff1e1486799e3f64129f8ccad108b38290df9cd7015cd31bed17239f0789d6" dependencies = [ "serde", "serde_json", @@ -1834,6 +1906,19 @@ dependencies = [ "treediff", ] +[[package]] +name = "jsonpath-rust" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" +dependencies = [ + "pest", + "pest_derive", + "regex", + "serde_json", + "thiserror", +] + [[package]] name = "jsonpath_lib" version = "0.3.0" @@ -1860,16 +1945,43 @@ dependencies = [ "serde_json", ] +[[package]] +name = "k8s-openapi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +dependencies = [ + "base64 0.21.6", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + [[package]] name = "kube" version = "0.80.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "414d80c69906a91e8ecf4ae16d0fb504e19aa6b099135d35d85298b4e4be3ed3" dependencies = [ - "k8s-openapi", - "kube-client", - "kube-core", - "kube-derive", + "k8s-openapi 0.17.0", + "kube-client 0.80.0", + "kube-core 0.80.0", + "kube-derive 0.80.0", +] + +[[package]] +name = "kube" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3499c8d60c763246c7a213f51caac1e9033f46026904cb89bc8951ae8601f26e" +dependencies = [ + "k8s-openapi 0.20.0", + "kube-client 0.87.2", + "kube-core 0.87.2", + "kube-derive 0.87.2", ] [[package]] @@ -1890,10 +2002,10 @@ dependencies = [ "hyper-openssl", "hyper-timeout", "jsonpath_lib", - "k8s-openapi", - "kube-core", + "k8s-openapi 0.17.0", + "kube-core 0.80.0", "openssl", - "pem", + "pem 1.1.1", "pin-project", "secrecy", "serde", @@ -1903,7 +2015,43 @@ dependencies = [ "tokio", "tokio-util", "tower", - "tower-http", + "tower-http 0.3.5", + "tracing", +] + +[[package]] +name = "kube-client" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033450dfa0762130565890dadf2f8835faedf749376ca13345bcd8ecd6b5f29f" +dependencies = [ + "base64 0.21.6", + "bytes", + "chrono", + "either", + "futures", + "home", + "http", + "http-body", + "hyper", + "hyper-rustls", + "hyper-timeout", + "jsonpath-rust", + "k8s-openapi 0.20.0", + "kube-core 0.87.2", + "pem 3.0.3", + "pin-project", + "rustls", + "rustls-pemfile", + "secrecy", + "serde", + "serde_json", + "serde_yaml 0.9.30", + "thiserror", + "tokio", + "tokio-util", + "tower", + "tower-http 0.4.4", "tracing", ] @@ -1912,12 +2060,29 @@ name = "kube-core" version = "0.80.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98331c6f1354893f7c50da069e43a3fd1c84e55bbedc7765d9db22ec3291d07d" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "k8s-openapi 0.17.0", + "once_cell", + "schemars", + "serde", + "serde_json", + 
"thiserror", +] + +[[package]] +name = "kube-core" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" dependencies = [ "chrono", "form_urlencoded", "http", "json-patch", - "k8s-openapi", + "k8s-openapi 0.20.0", "once_cell", "schemars", "serde", @@ -1931,27 +2096,41 @@ version = "0.80.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4be6ff26b9a34ce831d341e8b33bc78986a33c1be88f5bf9ca84e92e98b1dfb" dependencies = [ - "darling", + "darling 0.14.4", "proc-macro2", "quote", "serde_json", "syn 1.0.109", ] +[[package]] +name = "kube-derive" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e98dd5e5767c7b894c1f0e41fd628b145f808e981feb8b08ed66455d47f1a4" +dependencies = [ + "darling 0.20.5", + "proc-macro2", + "quote", + "serde_json", + "syn 2.0.48", +] + [[package]] name = "kube-runtime" -version = "0.80.0" +version = "0.87.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b698eb8998b46683b0dc3c2ce72c80bc308fc8159f25afa719668c290a037a57" +checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae" dependencies = [ "ahash", "async-trait", "backoff", "derivative", "futures", + "hashbrown 0.14.3", "json-patch", - "k8s-openapi", - "kube-client", + "k8s-openapi 0.20.0", + "kube-client 0.87.2", "parking_lot 0.12.1", "pin-project", "serde", @@ -2333,6 +2512,12 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "openssl-sys" version = "0.9.98" @@ -2423,6 +2608,16 @@ dependencies = [ "base64 0.13.1", ] +[[package]] +name = "pem" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" +dependencies = [ + "base64 0.21.6", + "serde", +] + [[package]] name = "percent-encoding" version = "1.0.1" @@ -2594,7 +2789,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dfc28575c2e3f19cb3c73b93af36460ae898d426eba6fc15b9bd2a5220758a0" dependencies = [ "anstyle", - "itertools", + "itertools 0.11.0", "predicates-core", ] @@ -2682,7 +2877,7 @@ checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" dependencies = [ "bytes", "heck 0.4.1", - "itertools", + "itertools 0.11.0", "log", "multimap", "once_cell", @@ -2703,7 +2898,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" dependencies = [ "anyhow", - "itertools", + "itertools 0.11.0", "proc-macro2", "quote", "syn 2.0.48", @@ -2940,6 +3135,18 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pemfile" version = "1.0.4" @@ -2971,6 +3178,15 @@ version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "schemars" version = "0.8.16" @@ -3027,6 +3243,29 @@ dependencies = [ "zeroize", ] +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "1.0.21" @@ -3455,6 +3694,7 @@ dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", ] [[package]] @@ -3567,6 +3807,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower-http" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" +dependencies = [ + "base64 0.21.6", + "bitflags 2.4.1", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "mime", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tower-layer" version = "0.3.2" @@ -3955,8 +4216,8 @@ dependencies = [ "actix-web", "akri-shared", "clap", - "k8s-openapi", - "kube", + "k8s-openapi 0.17.0", + "kube 0.80.0", "openapi", "openssl", "serde_json", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 68765f289..1666e20c7 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -22,9 +22,10 @@ blake2 = "0.9.0" env_logger = "0.10.0" futures = { version = "0.3.1", package = "futures" } hyper = "0.14.2" -k8s-openapi = { version = "0.17.0", default-features = false, features = ["schemars", "v1_23"] } -kube = { version = "0.80.0", features = ["derive"] } -kube-runtime = "0.80.0" +itertools = "0.12.0" +k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] } +kube = { version = "0.87.1", features = ["derive"] } +kube-runtime = { version = "0.87.1", features = 
["unstable-runtime-reconcile-on"] } lazy_static = "1.4" log = "0.4" mockall_double = "0.3.1" @@ -34,8 +35,9 @@ serde = "1.0.104" serde_derive = "1.0.104" serde_json = "1.0.45" serde_yaml = { version = "0.8.11", optional = true } -tokio = { version = "1.0", features = ["rt-multi-thread", "time", "fs", "macros", "net"] } -tokio-stream = { version = "0.1", features = ["net"] } +thiserror = "1.0.50" +tokio = { version = "1.0", features = ["rt-multi-thread", "time", "fs", "macros", "net", "signal"] } +tokio-stream = { version = "0.1", features = ["net", "sync"] } tonic = "0.10" tower = "0.4.8" diff --git a/agent/build.rs b/agent/build.rs index f76ecc5be..34a7e1c1a 100644 --- a/agent/build.rs +++ b/agent/build.rs @@ -2,7 +2,15 @@ fn main() { tonic_build::configure() .build_client(true) - .out_dir("./src/util") - .compile(&["./proto/pluginapi.proto"], &["./proto"]) + .out_dir("./src/plugin_manager") + .compile( + &["./proto/pluginapi.proto", "./proto/podresources.proto"], + &["./proto"], + ) + .expect("failed to compile protos"); + tonic_build::configure() + .build_client(false) + .out_dir("./src/plugin_manager") + .compile(&["./proto/pluginregistration.proto"], &["./proto"]) .expect("failed to compile protos"); } diff --git a/agent/proto/pluginapi.proto b/agent/proto/pluginapi.proto index 883f9a991..ba49b88a1 100644 --- a/agent/proto/pluginapi.proto +++ b/agent/proto/pluginapi.proto @@ -1,152 +1,200 @@ - syntax = 'proto3'; - - package v1beta1; - - - // Registration is the service advertised by the Kubelet - // Only when Kubelet answers with a success code to a Register Request - // may Device Plugins start their service - // Registration may fail when device plugin version is not supported by - // Kubelet or the registered resourceName is already taken by another - // active device plugin. 
Device plugin is expected to terminate upon registration failure - service Registration { - rpc Register(RegisterRequest) returns (Empty) {} - } - - message DevicePluginOptions { - // Indicates if PreStartContainer call is required before each container start - bool pre_start_required = 1; - } - - message RegisterRequest { - // Version of the API the Device Plugin was built against - string version = 1; - // Name of the unix socket the device plugin is listening on - // PATH = path.Join(DevicePluginPath, endpoint) - string endpoint = 2; - // Schedulable resource name. As of now it's expected to be a DNS Label - string resource_name = 3; - // Options to be communicated with Device Manager - DevicePluginOptions options = 4; - } - - message Empty { - } - - // DevicePlugin is the service advertised by Device Plugins - service DevicePlugin { - // GetDevicePluginOptions returns options to be communicated with Device - // Manager - rpc GetDevicePluginOptions(Empty) returns (DevicePluginOptions) {} - - // ListAndWatch returns a stream of List of Devices - // Whenever a Device state change or a Device disapears, ListAndWatch - // returns the new list - rpc ListAndWatch(Empty) returns (stream ListAndWatchResponse) {} - - // Allocate is called during container creation so that the Device - // Plugin can run device specific operations and instruct Kubelet - // of the steps to make the Device available in the container - rpc Allocate(AllocateRequest) returns (AllocateResponse) {} - - // PreStartContainer is called, if indicated by Device Plugin during registeration phase, - // before each container start. 
Device plugin can run device specific operations - // such as reseting the device before making devices available to the container - rpc PreStartContainer(PreStartContainerRequest) returns (PreStartContainerResponse) {} - } - - // ListAndWatch returns a stream of List of Devices - // Whenever a Device state change or a Device disapears, ListAndWatch - // returns the new list - message ListAndWatchResponse { - repeated Device devices = 1; - } - - /* E.g: - * struct Device { - * ID: "GPU-fef8089b-4820-abfc-e83e-94318197576e", - * State: "Healthy", - *} */ - message Device { - // A unique ID assigned by the device plugin used - // to identify devices during the communication - // Max length of this field is 63 characters - string ID = 1; - // Health of the device, can be healthy or unhealthy, see constants.go - string health = 2; - } - - // - PreStartContainer is expected to be called before each container start if indicated by plugin during registration phase. - // - PreStartContainer allows kubelet to pass reinitialized devices to containers. - // - PreStartContainer allows Device Plugin to run device specific operations on - // the Devices requested - message PreStartContainerRequest { - repeated string devicesIDs = 1; - } - - // PreStartContainerResponse will be send by plugin in response to PreStartContainerRequest - message PreStartContainerResponse { - } - - // - Allocate is expected to be called during pod creation since allocation - // failures for any container would result in pod startup failure. - // - Allocate allows kubelet to exposes additional artifacts in a pod's - // environment as directed by the plugin. 
- // - Allocate allows Device Plugin to run device specific operations on - // the Devices requested - message AllocateRequest { - repeated ContainerAllocateRequest container_requests = 1; - } - - message ContainerAllocateRequest { - repeated string devicesIDs = 1; - } - - // AllocateResponse includes the artifacts that needs to be injected into - // a container for accessing 'deviceIDs' that were mentioned as part of - // 'AllocateRequest'. - // Failure Handling: - // if Kubelet sends an allocation request for dev1 and dev2. - // Allocation on dev1 succeeds but allocation on dev2 fails. - // The Device plugin should send a ListAndWatch update and fail the - // Allocation request - message AllocateResponse { - repeated ContainerAllocateResponse container_responses = 1; - } - - message ContainerAllocateResponse { - // List of environment variable to be set in the container to access one of more devices. - map envs = 1; - // Mounts for the container. - repeated Mount mounts = 2; - // Devices for the container. - repeated DeviceSpec devices = 3; - // Container annotations to pass to the container runtime - map annotations = 4; - } - - // Mount specifies a host volume to mount into a container. - // where device library or tools are installed on host and container - message Mount { - // Path of the mount within the container. - string container_path = 1; - // Path of the mount on the host. - string host_path = 2; - // If set, the mount is read-only. - bool read_only = 3; - } - - // DeviceSpec specifies a host device to mount into a container. - message DeviceSpec { - // Path of the device within the container. - string container_path = 1; - // Path of the device on the host. - string host_path = 2; - // Cgroups permissions of the device, candidates are one or more of - // * r - allows container to read from the specified device. - // * w - allows container to write to the specified device. - // * m - allows container to create device files that do not yet exist. 
- string permissions = 3; - } - - \ No newline at end of file +syntax = "proto3"; + +package v1beta1; + + +// Registration is the service advertised by the Kubelet +// Only when Kubelet answers with a success code to a Register Request +// may Device Plugins start their service +// Registration may fail when device plugin version is not supported by +// Kubelet or the registered resourceName is already taken by another +// active device plugin. Device plugin is expected to terminate upon registration failure +service Registration { + rpc Register(RegisterRequest) returns (Empty) {} +} + +message DevicePluginOptions { + // Indicates if PreStartContainer call is required before each container start + bool pre_start_required = 1; + // Indicates if GetPreferredAllocation is implemented and available for calling + bool get_preferred_allocation_available = 2; +} + +message RegisterRequest { + // Version of the API the Device Plugin was built against + string version = 1; + // Name of the unix socket the device plugin is listening on + // PATH = path.Join(DevicePluginPath, endpoint) + string endpoint = 2; + // Schedulable resource name. As of now it's expected to be a DNS Label + string resource_name = 3; + // Options to be communicated with Device Manager + DevicePluginOptions options = 4; +} + +message Empty { +} + +// DevicePlugin is the service advertised by Device Plugins +service DevicePlugin { + // GetDevicePluginOptions returns options to be communicated with Device + // Manager + rpc GetDevicePluginOptions(Empty) returns (DevicePluginOptions) {} + + // ListAndWatch returns a stream of List of Devices + // Whenever a Device state change or a Device disappears, ListAndWatch + // returns the new list + rpc ListAndWatch(Empty) returns (stream ListAndWatchResponse) {} + + // GetPreferredAllocation returns a preferred set of devices to allocate + // from a list of available ones. 
The resulting preferred allocation is not + // guaranteed to be the allocation ultimately performed by the + // devicemanager. It is only designed to help the devicemanager make a more + // informed allocation decision when possible. + rpc GetPreferredAllocation(PreferredAllocationRequest) returns (PreferredAllocationResponse) {} + + // Allocate is called during container creation so that the Device + // Plugin can run device specific operations and instruct Kubelet + // of the steps to make the Device available in the container + rpc Allocate(AllocateRequest) returns (AllocateResponse) {} + + // PreStartContainer is called, if indicated by Device Plugin during registeration phase, + // before each container start. Device plugin can run device specific operations + // such as resetting the device before making devices available to the container + rpc PreStartContainer(PreStartContainerRequest) returns (PreStartContainerResponse) {} +} + +// ListAndWatch returns a stream of List of Devices +// Whenever a Device state change or a Device disappears, ListAndWatch +// returns the new list +message ListAndWatchResponse { + repeated Device devices = 1; +} + +message TopologyInfo { + repeated NUMANode nodes = 1; +} + +message NUMANode { + int64 ID = 1; +} + +/* E.g: +* struct Device { +* ID: "GPU-fef8089b-4820-abfc-e83e-94318197576e", +* Health: "Healthy", +* Topology: +* Node: +* ID: 1 +*} */ +message Device { + // A unique ID assigned by the device plugin used + // to identify devices during the communication + // Max length of this field is 63 characters + string ID = 1; + // Health of the device, can be healthy or unhealthy, see constants.go + string health = 2; + // Topology for device + TopologyInfo topology = 3; +} + +// - PreStartContainer is expected to be called before each container start if indicated by plugin during registration phase. +// - PreStartContainer allows kubelet to pass reinitialized devices to containers. 
+// - PreStartContainer allows Device Plugin to run device specific operations on +// the Devices requested +message PreStartContainerRequest { + repeated string devicesIDs = 1; +} + +// PreStartContainerResponse will be send by plugin in response to PreStartContainerRequest +message PreStartContainerResponse { +} + +// PreferredAllocationRequest is passed via a call to GetPreferredAllocation() +// at pod admission time. The device plugin should take the list of +// `available_deviceIDs` and calculate a preferred allocation of size +// 'allocation_size' from them, making sure to include the set of devices +// listed in 'must_include_deviceIDs'. +message PreferredAllocationRequest { + repeated ContainerPreferredAllocationRequest container_requests = 1; +} + +message ContainerPreferredAllocationRequest { + // List of available deviceIDs from which to choose a preferred allocation + repeated string available_deviceIDs = 1; + // List of deviceIDs that must be included in the preferred allocation + repeated string must_include_deviceIDs = 2; + // Number of devices to include in the preferred allocation + int32 allocation_size = 3; +} + +// PreferredAllocationResponse returns a preferred allocation, +// resulting from a PreferredAllocationRequest. +message PreferredAllocationResponse { + repeated ContainerPreferredAllocationResponse container_responses = 1; +} + +message ContainerPreferredAllocationResponse { + repeated string deviceIDs = 1; +} + +// - Allocate is expected to be called during pod creation since allocation +// failures for any container would result in pod startup failure. +// - Allocate allows kubelet to exposes additional artifacts in a pod's +// environment as directed by the plugin. 
+// - Allocate allows Device Plugin to run device specific operations on +// the Devices requested +message AllocateRequest { + repeated ContainerAllocateRequest container_requests = 1; +} + +message ContainerAllocateRequest { + repeated string devicesIDs = 1; +} + +// AllocateResponse includes the artifacts that needs to be injected into +// a container for accessing 'deviceIDs' that were mentioned as part of +// 'AllocateRequest'. +// Failure Handling: +// if Kubelet sends an allocation request for dev1 and dev2. +// Allocation on dev1 succeeds but allocation on dev2 fails. +// The Device plugin should send a ListAndWatch update and fail the +// Allocation request +message AllocateResponse { + repeated ContainerAllocateResponse container_responses = 1; +} + +message ContainerAllocateResponse { + // List of environment variable to be set in the container to access one of more devices. + map envs = 1; + // Mounts for the container. + repeated Mount mounts = 2; + // Devices for the container. + repeated DeviceSpec devices = 3; + // Container annotations to pass to the container runtime + map annotations = 4; +} + +// Mount specifies a host volume to mount into a container. +// where device library or tools are installed on host and container +message Mount { + // Path of the mount within the container. + string container_path = 1; + // Path of the mount on the host. + string host_path = 2; + // If set, the mount is read-only. + bool read_only = 3; +} + +// DeviceSpec specifies a host device to mount into a container. +message DeviceSpec { + // Path of the device within the container. + string container_path = 1; + // Path of the device on the host. + string host_path = 2; + // Cgroups permissions of the device, candidates are one or more of + // * r - allows container to read from the specified device. + // * w - allows container to write to the specified device. + // * m - allows container to create device files that do not yet exist. 
+ string permissions = 3; +} \ No newline at end of file diff --git a/agent/proto/pluginregistration.proto b/agent/proto/pluginregistration.proto new file mode 100644 index 000000000..6e3be0d78 --- /dev/null +++ b/agent/proto/pluginregistration.proto @@ -0,0 +1,50 @@ +// To regenerate api.pb.go run `hack/update-codegen.sh protobindings` +syntax = "proto3"; + +package pluginregistration; // This should have been v1. + +// PluginInfo is the message sent from a plugin to the Kubelet pluginwatcher for plugin registration +message PluginInfo { + // Type of the Plugin. CSIPlugin or DevicePlugin + string type = 1; + // Plugin name that uniquely identifies the plugin for the given plugin type. + // For DevicePlugin, this is the resource name that the plugin manages and + // should follow the extended resource name convention. + // For CSI, this is the CSI driver registrar name. + string name = 2; + // Optional endpoint location. If found set by Kubelet component, + // Kubelet component will use this endpoint for specific requests. + // This allows the plugin to register using one endpoint and possibly use + // a different socket for control operations. CSI uses this model to delegate + // its registration external from the plugin. + string endpoint = 3; + // Plugin service API versions the plugin supports. + // For DevicePlugin, this maps to the deviceplugin API versions the + // plugin supports at the given socket. + // The Kubelet component communicating with the plugin should be able + // to choose any preferred version from this list, or returns an error + // if none of the listed versions is supported. 
+ repeated string supported_versions = 4; +} + +// RegistrationStatus is the message sent from Kubelet pluginwatcher to the plugin for notification on registration status +message RegistrationStatus { + // True if plugin gets registered successfully at Kubelet + bool plugin_registered = 1; + // Error message in case plugin fails to register, empty string otherwise + string error = 2; +} + +// RegistrationStatusResponse is sent by plugin to kubelet in response to RegistrationStatus RPC +message RegistrationStatusResponse { +} + +// InfoRequest is the empty request message from Kubelet +message InfoRequest { +} + +// Registration is the service advertised by the Plugins. +service Registration { + rpc GetInfo(InfoRequest) returns (PluginInfo) {} + rpc NotifyRegistrationStatus(RegistrationStatus) returns (RegistrationStatusResponse) {} +} diff --git a/agent/proto/podresources.proto b/agent/proto/podresources.proto new file mode 100644 index 000000000..0119b1421 --- /dev/null +++ b/agent/proto/podresources.proto @@ -0,0 +1,67 @@ +syntax = "proto3"; + +package v1; + + +// PodResourcesLister is a service provided by the kubelet that provides information about the +// node resources consumed by pods and containers on the node +service PodResourcesLister { + rpc List(ListPodResourcesRequest) returns (ListPodResourcesResponse) {} + rpc GetAllocatableResources(AllocatableResourcesRequest) returns (AllocatableResourcesResponse) {} +} + +message AllocatableResourcesRequest {} + +// AllocatableResourcesResponses contains informations about all the devices known by the kubelet +message AllocatableResourcesResponse { + repeated ContainerDevices devices = 1; + repeated int64 cpu_ids = 2; + repeated ContainerMemory memory = 3; +} + +// ListPodResourcesRequest is the request made to the PodResourcesLister service +message ListPodResourcesRequest {} + +// ListPodResourcesResponse is the response returned by List function +message ListPodResourcesResponse { + repeated PodResources 
pod_resources = 1; +} + +// PodResources contains information about the node resources assigned to a pod +message PodResources { + string name = 1; + string namespace = 2; + repeated ContainerResources containers = 3; +} + +// ContainerResources contains information about the resources assigned to a container +message ContainerResources { + string name = 1; + repeated ContainerDevices devices = 2; + repeated int64 cpu_ids = 3; + repeated ContainerMemory memory = 4; +} + +// ContainerMemory contains information about memory and hugepages assigned to a container +message ContainerMemory { + string memory_type = 1; + uint64 size = 2; + TopologyInfo topology = 3; +} + +// ContainerDevices contains information about the devices assigned to a container +message ContainerDevices { + string resource_name = 1; + repeated string device_ids = 2; + TopologyInfo topology = 3; +} + +// Topology describes hardware topology of the resource +message TopologyInfo { + repeated NUMANode nodes = 1; +} + +// NUMA representation of NUMA node +message NUMANode { + int64 ID = 1; +} \ No newline at end of file diff --git a/agent/src/device_manager/cdi.rs b/agent/src/device_manager/cdi.rs new file mode 100644 index 000000000..5380c9e45 --- /dev/null +++ b/agent/src/device_manager/cdi.rs @@ -0,0 +1,169 @@ +///This module represents the schema used by CDI in version 0.6.0: +/// https://github.com/cncf-tags/container-device-interface/blob/main/SPEC.md +/// +/// It provides helpers to convert from v0 discovery handler protocol +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct Device { + pub name: String, + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + pub annotations: HashMap, + #[serde(default)] + pub container_edits: ContainerEdit, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub 
struct ContainerEdit { + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub env: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub device_nodes: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub mounts: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub hooks: Vec, +} + +#[derive(Clone, Debug, Deserialize, Serialize, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct DeviceNode { + pub path: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub host_path: Option, + #[serde(rename = "type", default, skip_serializing_if = "Option::is_none")] + pub device_type: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub major: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub minor: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub file_mode: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub permissions: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub uid: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub gid: Option, +} + +impl From for DeviceNode { + fn from(value: akri_discovery_utils::discovery::v0::DeviceSpec) -> Self { + Self { + path: value.container_path, + host_path: Some(value.host_path), + permissions: Some(value.permissions), + ..Default::default() + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct Mount { + pub host_path: String, + pub container_path: String, + #[serde(rename = "type", default, skip_serializing_if = "Option::is_none")] + pub mount_type: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub options: Vec, +} + +impl From for Mount { + fn from(value: akri_discovery_utils::discovery::v0::Mount) -> Self { + let options = match value.read_only { + false => vec![], + true => vec!["ro".to_string()], + }; + Self 
{ + host_path: value.host_path, + container_path: value.container_path, + mount_type: None, + options, + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct Hook { + pub hook_name: String, + pub path: String, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub args: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub env: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub timeout: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize, Default, PartialEq)] +#[serde(rename_all = "camelCase")] +#[serde(tag = "cdiVersion", rename = "0.6.0")] +pub struct Kind { + pub kind: String, + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + pub annotations: HashMap, + pub devices: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub container_edits: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cdi_mount_from_discovery() { + let discovery_mount = akri_discovery_utils::discovery::v0::Mount { + container_path: "/path/in/container".to_string(), + host_path: "/path/in/host".to_string(), + read_only: true, + }; + let expected_mount = Mount { + host_path: "/path/in/host".to_string(), + container_path: "/path/in/container".to_string(), + mount_type: None, + options: vec!["ro".to_string()], + }; + assert_eq!(Mount::from(discovery_mount), expected_mount); + + let discovery_mount = akri_discovery_utils::discovery::v0::Mount { + container_path: "/path/in/container".to_string(), + host_path: "/path/in/host".to_string(), + read_only: false, + }; + let expected_mount = Mount { + host_path: "/path/in/host".to_string(), + container_path: "/path/in/container".to_string(), + mount_type: None, + options: vec![], + }; + assert_eq!(Mount::from(discovery_mount), expected_mount); + } + + #[test] + fn test_device_node_from_device_spec() { + let device_spec = akri_discovery_utils::discovery::v0::DeviceSpec { + 
container_path: "/path/in/container".to_string(), + host_path: "/path/in/host".to_string(), + permissions: "rw".to_string(), + }; + let expected_device_node = DeviceNode { + path: "/path/in/container".to_string(), + host_path: Some("/path/in/host".to_string()), + device_type: None, + major: None, + minor: None, + file_mode: None, + permissions: Some("rw".to_string()), + uid: None, + gid: None, + }; + assert_eq!(DeviceNode::from(device_spec), expected_device_node) + } +} diff --git a/agent/src/device_manager/in_memory.rs b/agent/src/device_manager/in_memory.rs new file mode 100644 index 000000000..61a458413 --- /dev/null +++ b/agent/src/device_manager/in_memory.rs @@ -0,0 +1,175 @@ +use std::collections::HashMap; + +use super::{cdi, DeviceManager}; +use tokio::sync::watch; + +pub struct InMemoryManager { + state: watch::Receiver>, +} + +impl InMemoryManager { + pub fn new(state: watch::Receiver>) -> Self { + InMemoryManager { state } + } +} + +impl DeviceManager for InMemoryManager { + /// This method resolves a device from its FQDN (i.e in the form akri.sh/configuration=id) + /// It returns None if the device is not registered to the device manager + /// If the device is registered, it resolves its properties by merging the device specific properties + /// with the configuration (kind) level properties + fn get(&self, fqdn: &str) -> Option { + let (kind, id) = fqdn.split_once('=').unwrap(); + let state = self.state.borrow(); + let cdi_kind = state.get(kind)?; + let mut device = cdi_kind.devices.iter().find(|dev| dev.name == id)?.clone(); + device.name = format!("{}-{}", kind, id); + device.annotations.extend( + cdi_kind + .annotations + .iter() + .map(|(k, v)| (k.clone(), v.clone())), + ); + for edit in cdi_kind.container_edits.iter().cloned() { + device.container_edits.env.extend(edit.env); + device + .container_edits + .device_nodes + .extend(edit.device_nodes); + device.container_edits.hooks.extend(edit.hooks); + 
device.container_edits.mounts.extend(edit.mounts); + } + Some(device) + } + + fn has_device(&self, fqdn: String) -> bool { + let (kind, id) = fqdn.split_once('=').unwrap(); + if let Some(k) = self.state.borrow().get(kind) { + return k.devices.iter().any(|dev| dev.name == id); + } + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_manager() { + let (sender, rec) = watch::channel(Default::default()); + let manager = InMemoryManager::new(rec); + + assert!(!manager.has_device("akri.sh/any=device".to_string())); + assert_eq!(manager.get("akri.sh/any=device"), None); + + let _ = sender.send(HashMap::from([( + "akri.sh/device".to_string(), + cdi::Kind { + kind: "akri.sh/device".to_string(), + annotations: HashMap::from([("config_level".to_owned(), "foo".to_owned())]), + devices: vec![cdi::Device { + name: "my-device".to_string(), + annotations: HashMap::from([("device_level".to_owned(), "bar".to_owned())]), + container_edits: cdi::ContainerEdit { + env: vec!["back=home".to_string()], + device_nodes: vec![cdi::DeviceNode { + path: "/device/level/path".to_string(), + ..Default::default() + }], + mounts: vec![cdi::Mount { + host_path: "/device/level/host/path".to_string(), + container_path: "/device/level/container/path".to_string(), + mount_type: None, + options: vec![], + }], + hooks: vec![cdi::Hook { + hook_name: "device_level".to_string(), + path: "some/path".to_string(), + args: vec![], + env: vec![], + timeout: None, + }], + }, + }], + container_edits: vec![cdi::ContainerEdit { + env: vec!["hello=world".to_string()], + device_nodes: vec![cdi::DeviceNode { + path: "/conf/level/path".to_string(), + ..Default::default() + }], + mounts: vec![cdi::Mount { + host_path: "/conf/level/host/path".to_string(), + container_path: "/conf/level/container/path".to_string(), + mount_type: None, + options: vec![], + }], + hooks: vec![cdi::Hook { + hook_name: "config_level".to_string(), + path: "some/path".to_string(), + args: vec![], + env: vec![], + 
timeout: None, + }], + }], + }, + )])); + + let expected_device = cdi::Device { + name: "akri.sh/device-my-device".to_string(), + annotations: HashMap::from([ + ("device_level".to_owned(), "bar".to_owned()), + ("config_level".to_owned(), "foo".to_owned()), + ]), + container_edits: cdi::ContainerEdit { + env: vec!["back=home".to_string(), "hello=world".to_string()], + device_nodes: vec![ + cdi::DeviceNode { + path: "/device/level/path".to_string(), + ..Default::default() + }, + cdi::DeviceNode { + path: "/conf/level/path".to_string(), + ..Default::default() + }, + ], + mounts: vec![ + cdi::Mount { + host_path: "/device/level/host/path".to_string(), + container_path: "/device/level/container/path".to_string(), + mount_type: None, + options: vec![], + }, + cdi::Mount { + host_path: "/conf/level/host/path".to_string(), + container_path: "/conf/level/container/path".to_string(), + mount_type: None, + options: vec![], + }, + ], + hooks: vec![ + cdi::Hook { + hook_name: "device_level".to_string(), + path: "some/path".to_string(), + args: vec![], + env: vec![], + timeout: None, + }, + cdi::Hook { + hook_name: "config_level".to_string(), + path: "some/path".to_string(), + args: vec![], + env: vec![], + timeout: None, + }, + ], + }, + }; + + assert!(manager.has_device("akri.sh/device=my-device".to_string())); + assert_eq!( + manager.get("akri.sh/device=my-device"), + Some(expected_device) + ); + } +} diff --git a/agent/src/device_manager/mod.rs b/agent/src/device_manager/mod.rs new file mode 100644 index 000000000..cb39b1e60 --- /dev/null +++ b/agent/src/device_manager/mod.rs @@ -0,0 +1,12 @@ +pub mod cdi; +mod in_memory; + +pub use in_memory::InMemoryManager; + +#[cfg(test)] +use mockall::automock; +#[cfg_attr(test, automock)] +pub trait DeviceManager: Send + Sync { + fn get(&self, fqdn: &str) -> Option; + fn has_device(&self, fqdn: String) -> bool; +} diff --git a/agent/src/discovery_handler_manager/discovery_handler_registry.rs 
b/agent/src/discovery_handler_manager/discovery_handler_registry.rs new file mode 100644 index 000000000..f12a24c06 --- /dev/null +++ b/agent/src/discovery_handler_manager/discovery_handler_registry.rs @@ -0,0 +1,983 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use akri_discovery_utils::discovery::v0::{ByteData, Device, DiscoverRequest}; +use akri_shared::akri::configuration::{Configuration, DiscoveryProperty}; +use akri_shared::akri::instance::Instance; + +use akri_shared::akri::instance::InstanceSpec; +use async_trait::async_trait; +use blake2::digest::{Update, VariableOutput}; +use blake2::VarBlake2b; +use futures::future::select_all; +use futures::future::try_join_all; +use futures::FutureExt; +use itertools::Itertools; +use kube::core::ObjectMeta; +use kube_runtime::reflector::ObjectRef; +use tokio::select; +use tokio::sync::mpsc; +use tokio::sync::watch; +use tokio::sync::RwLock; +use tokio::sync::{broadcast, Mutex, Notify}; + +use super::discovery_property_solver::PropertySolver; +use super::{DiscoveryError, DiscoveryManagerKubeInterface}; +use crate::device_manager::cdi::ContainerEdit; + +#[cfg(test)] +use mockall::automock; + +#[derive(Clone, Debug, PartialEq)] +pub enum DiscoveredDevice { + LocalDevice(Device, String), + SharedDevice(Device), +} + +impl DiscoveredDevice { + /// Generates a digest of an Instance's id. There should be a unique digest and Instance for each discovered device. + /// This means that the id of non-local devices that could be visible to multiple nodes should always resolve + /// to the same instance name (which is suffixed with this digest). + /// However, local devices' Instances should have unique hashes even if they have the same id. + /// To ensure this, the node's name is added to the id before it is hashed. 
+ fn device_hash(&self) -> String { + let (id_to_digest, shared, node_name) = match self { + DiscoveredDevice::LocalDevice(d, n) => (d.id.to_owned(), false, n.as_str()), + DiscoveredDevice::SharedDevice(d) => (d.id.to_owned(), true, ""), + }; + let mut id_to_digest = id_to_digest.to_string(); + // For local devices, include node hostname in id_to_digest so instances have unique names + if !shared { + id_to_digest = format!("{}{}", &id_to_digest, node_name,); + } + let mut digest = String::new(); + let mut hasher = VarBlake2b::new(3).unwrap(); + hasher.update(id_to_digest); + hasher.finalize_variable(|var| { + digest = var + .iter() + .map(|num| format!("{:02x}", num)) + .collect::>() + .join("") + }); + digest + } + + fn inner(self) -> Device { + match self { + DiscoveredDevice::LocalDevice(d, _) => d, + DiscoveredDevice::SharedDevice(d) => d, + } + } +} + +impl From for crate::device_manager::cdi::Device { + fn from(value: DiscoveredDevice) -> Self { + let hash = value.device_hash(); + let dev = value.inner(); + Self { + name: hash, + annotations: Default::default(), + container_edits: crate::device_manager::cdi::ContainerEdit { + env: dev + .properties + .into_iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect(), + device_nodes: dev.device_specs.into_iter().map_into().collect(), + mounts: dev.mounts.into_iter().map_into().collect(), + hooks: Default::default(), + }, + } + } +} + +/// This trait represents a discovery handler, no matter if it is an embedded or remote one +#[async_trait] +#[cfg_attr(test, automock)] +pub trait DiscoveryHandlerEndpoint: Send + Sync { + async fn query( + &self, + sender: watch::Sender>>, + query_body: DiscoverRequest, + ) -> Result<(), DiscoveryError>; + + fn get_name(&self) -> String; + fn get_uid(&self) -> String; + + async fn closed(&self); + fn is_closed(&self) -> bool; +} + +/// This trait is here to help with testing for code that interract with the discovery handler registry +#[cfg_attr(test, automock)] +pub trait 
DiscoveryHandlerRequest: Sync + Send { + fn get_instances(&self) -> Result, DiscoveryError>; +} + +/// This trait is here to help with testing for code that interract with the discovery handler registry +#[cfg_attr(test, automock)] +#[async_trait] +pub trait DiscoveryHandlerRegistry: Sync + Send { + async fn new_request( + &self, + key: &str, + dh_name: &str, + dh_details: &str, + dh_properties: &[DiscoveryProperty], + extra_device_properties: HashMap, + namespace: &str, + ) -> Result<(), DiscoveryError>; + + /// Get a reference to a specific request, allowing one to get the related Instances + async fn get_request(&self, key: &str) -> Option>; + + /// Terminate a specific request, will trigger removal of linked devices + async fn terminate_request(&self, key: &str); + + /// Register a new endpoint to make it available to all current and future queries + async fn register_endpoint(&self, endpoint: Arc); +} + +/// Real world implementation of the Discovery Handler Request +struct DHRequestImpl { + endpoints: RwLock>>>>, + notifier: watch::Sender>>, + key: String, + handler_name: String, + details: String, + properties: Vec, + extra_device_properties: HashMap, + kube_client: Arc, + termination_notifier: Arc, +} + +impl DiscoveryHandlerRequest for DHRequestImpl { + fn get_instances(&self) -> Result, DiscoveryError> { + Ok(self + .notifier + .borrow() + .iter() + .map(|i| self.device_to_instance(i)) + .collect()) + } +} + +impl DHRequestImpl { + fn device_to_instance(&self, dev: &DiscoveredDevice) -> Instance { + let (rdev, shared) = match dev { + DiscoveredDevice::LocalDevice(d, _) => (d, false), + DiscoveredDevice::SharedDevice(d) => (d, true), + }; + let mut properties = rdev.properties.clone(); + properties.extend(self.extra_device_properties.clone()); + Instance { + spec: InstanceSpec { + cdi_name: self.get_device_cdi_fqdn(dev), + configuration_name: self.key.clone(), + broker_properties: properties, + shared, + nodes: Default::default(), + device_usage: 
Default::default(), + capacity: Default::default(), + }, + metadata: ObjectMeta { + name: Some(format!("{}-{}", self.key, dev.device_hash())), + ..Default::default() + }, + } + } + + fn get_device_cdi_fqdn(&self, dev: &DiscoveredDevice) -> String { + format!("akri.sh/{}={}", self.key, dev.device_hash()) + } + + async fn watch_devices(&self, mut rec: broadcast::Receiver>) { + loop { + let mut local_endpoints = self.endpoints.write().await.clone(); + let futures = local_endpoints.iter_mut().map(|e| e.changed().boxed()); + select! { + (a, index, _) = select_all(futures) => { + if a.is_err() { + let mut write_endpoint = self.endpoints.write().await; + write_endpoint.remove(index); + if write_endpoint.is_empty() { + return; + } + } + }, + Ok(endpoint) = rec.recv() => { + if endpoint.get_name() != self.handler_name { + // We woke up for another kind of DH, let's get back to sleep + continue + } + if let Ok(q) = self.query(endpoint).await { + self.endpoints.write().await.push(q); + } + }, + _ = self.notifier.closed() => { + return; + }, + } + let devices: Vec> = self + .endpoints + .write() + .await + .iter_mut() + .flat_map(|r| r.borrow_and_update().clone().into_iter()) + .collect(); + self.notifier.send_replace( + devices + .into_iter() + .unique_by(|d| self.get_device_cdi_fqdn(d)) + .collect(), + ); + } + } + + async fn query( + &self, + endpoint: Arc, + ) -> Result>>, DiscoveryError> { + let (q_sender, q_receiver) = watch::channel(vec![]); + let query_body = DiscoverRequest { + discovery_details: self.details.clone(), + discovery_properties: self.solve_discovery_properties().await?, + }; + endpoint.query(q_sender, query_body).await?; + Ok(q_receiver) + } + + async fn solve_discovery_properties( + &self, + ) -> Result, DiscoveryError> { + let solved_properties_futures = self + .properties + .iter() + .map(|p| p.solve(self.kube_client.clone())); + Ok(try_join_all(solved_properties_futures) + .await? 
+ .into_iter() + .flatten() + .collect()) + } +} + +pub(super) type LockedMap = Arc>>; + +pub(super) struct DHRegistryImpl { + requests: LockedMap>, + handlers: LockedMap>>, + endpoint_notifier: broadcast::Sender>, + configuration_notifier: mpsc::Sender>, + cdi_notifier: Arc>>>, + kube_client: Arc, +} + +impl DHRegistryImpl { + pub(super) fn new( + kube_client: Arc, + cdi_notifier: watch::Sender>, + configuration_notifier: mpsc::Sender>, + ) -> Self { + let (endpoint_notifier, _) = broadcast::channel(10); + + Self { + requests: Default::default(), + handlers: Default::default(), + endpoint_notifier, + configuration_notifier, + cdi_notifier: Arc::new(Mutex::new(cdi_notifier)), + kube_client, + } + } +} + +#[async_trait] +impl DiscoveryHandlerRegistry for DHRegistryImpl { + async fn new_request( + &self, + key: &str, + dh_name: &str, + dh_details: &str, + dh_properties: &[DiscoveryProperty], + extra_device_properties: HashMap, + namespace: &str, + ) -> Result<(), DiscoveryError> { + match self.handlers.read().await.get(dh_name) { + Some(handlers) => { + let (notifier, _) = watch::channel(vec![]); + let terminated = Arc::new(Notify::new()); + let mut dh_req = DHRequestImpl { + endpoints: Default::default(), + notifier, + key: key.to_string(), + handler_name: dh_name.to_string(), + details: dh_details.to_string(), + properties: dh_properties.to_vec(), + extra_device_properties: extra_device_properties.clone(), + kube_client: self.kube_client.clone(), + termination_notifier: terminated.clone(), + }; + let dh_futures = handlers + .iter() + .map(|(_, handler)| dh_req.query(handler.clone())); + let dh_streams: Vec>>> = + try_join_all(dh_futures).await?; + dh_req.endpoints = RwLock::new(dh_streams); + { + let mut req_w = self.requests.write().await; + req_w.insert(key.to_string(), Arc::new(dh_req)); + } + let dh_req_ref = self.requests.read().await.get(key).unwrap().to_owned(); + let mut local_req_notifier = self + .requests + .read() + .await + .get(key) + .unwrap() + 
.notifier + .subscribe(); + let local_config_sender = self.configuration_notifier.to_owned(); + let local_cdi_sender = self.cdi_notifier.to_owned(); + let local_key = key.to_owned(); + let namespace = namespace.to_owned(); + tokio::spawn(async move { + let cdi_kind = format!("akri.sh/{}", local_key); + loop { + match local_req_notifier.changed().await { + Ok(_) => { + local_cdi_sender.lock().await.send_modify(|kind| { + kind.insert( + cdi_kind.clone(), + crate::device_manager::cdi::Kind { + kind: cdi_kind.clone(), + annotations: Default::default(), + devices: local_req_notifier + .borrow_and_update() + .iter() + .map(|d| d.as_ref().clone().into()) + .collect(), + container_edits: vec![ContainerEdit { + env: extra_device_properties + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect(), + ..Default::default() + }], + }, + ); + }); + trace!("Ask for reconciliation of {}::{}", namespace, local_key); + let res = local_config_sender + .send( + ObjectRef::::new(&local_key) + .within(&namespace), + ) + .await; + if res.is_err() { + local_cdi_sender.lock().await.send_modify(|kind| { + kind.remove(&cdi_kind); + }); + return; + } + } + Err(_) => { + trace!("Ask for reconciliation of {}::{}", namespace, local_key); + let _ = local_config_sender + .send( + ObjectRef::::new(&local_key) + .within(&namespace), + ) + .await; + local_cdi_sender.lock().await.send_modify(|kind| { + kind.remove(&cdi_kind); + }); + return; + } + } + } + }); + + let local_key = key.to_owned(); + let notifier_receiver = self.endpoint_notifier.subscribe(); + let local_req = self.requests.clone(); + tokio::spawn(async move { + let mut signal = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) + .unwrap(); + select! 
{ + _ = dh_req_ref + .watch_devices(notifier_receiver) => {}, + _ = terminated.notified() => {}, + _ = signal.recv() => {}, + } + local_req.write().await.remove(&local_key); + }); + Ok(()) + } + None => Err(DiscoveryError::NoHandler(dh_name.to_string())), + } + } + + async fn get_request(&self, key: &str) -> Option> { + let req_read = self.requests.read().await; + match req_read.get(key) { + Some(r) => Some(r.to_owned()), + None => None, + } + } + + async fn terminate_request(&self, key: &str) { + if let Some(r) = self.requests.write().await.remove(key) { + r.termination_notifier.notify_waiters() + } + } + + async fn register_endpoint(&self, endpoint: Arc) { + let name = endpoint.get_name(); + let uid = endpoint.get_uid(); + let _ = self.endpoint_notifier.send(endpoint.clone()); + { + let mut w_handlers = self.handlers.write().await; + match w_handlers.get_mut(&name) { + Some(v) => { + v.insert(uid.clone(), endpoint.clone()); + } + None => { + w_handlers.insert( + name.clone(), + HashMap::from([(uid.clone(), endpoint.clone())]), + ); + } + } + } + // Spawn a task to remove it from the list when it gets closed. It is the responsibility of the + // endpoint to close itself when it cannot accept new requests, it is ok for the endpoint to do so + // reactively after a failure on a new request. + let local_handlers = self.handlers.clone(); + tokio::spawn(async move { + endpoint.closed().await; + let mut w_handlers = local_handlers.write().await; + if let Some(v) = w_handlers.get_mut(&name) { + // Remove all closed endpoints, we can't remove just the one with our uid, as it + // may have registered again in the meantime. 
+ v.retain(|_, e| !e.is_closed()); + if v.is_empty() { + w_handlers.remove(&name); + } + } + }); + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use crate::{ + device_manager::cdi::{self, Kind}, + discovery_handler_manager::mock::MockDiscoveryManagerKubeInterface, + }; + use akri_discovery_utils::discovery::v0 as discovery_utils; + + use super::*; + + #[test] + fn test_discovered_device() { + let local_device = DiscoveredDevice::LocalDevice( + Device { + id: "my_local_device".to_owned(), + properties: Default::default(), + mounts: Default::default(), + device_specs: Default::default(), + }, + "my_node".to_owned(), + ); + let other_local_device = DiscoveredDevice::LocalDevice( + Device { + id: "my_local_device".to_owned(), + properties: Default::default(), + mounts: Default::default(), + device_specs: Default::default(), + }, + "my_other_node".to_owned(), + ); + let shared_device = DiscoveredDevice::SharedDevice(Device { + id: "my_shared_device".to_owned(), + properties: HashMap::from([("ENV_KEY".to_owned(), "env_value".to_owned())]), + mounts: vec![discovery_utils::Mount { + container_path: "container".to_owned(), + host_path: "host".to_owned(), + read_only: false, + }], + device_specs: vec![discovery_utils::DeviceSpec { + container_path: "container".to_owned(), + host_path: "host".to_owned(), + permissions: "perms".to_owned(), + }], + }); + + assert_eq!( + Into::::into(local_device), + cdi::Device { + name: "e77db4".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + env: vec![], + device_nodes: vec![], + mounts: vec![], + hooks: Default::default() + }, + } + ); + assert_eq!( + Into::::into(other_local_device), + cdi::Device { + name: "099763".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + env: vec![], + device_nodes: vec![], + mounts: vec![], + hooks: Default::default() + }, + } + ); + assert_eq!( + Into::::into(shared_device), + cdi::Device { + name: 
"4294ea".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + env: vec!["ENV_KEY=env_value".to_owned()], + device_nodes: vec![cdi::DeviceNode { + path: "container".to_owned(), + host_path: Some("host".to_owned()), + permissions: Some("perms".to_owned()), + ..Default::default() + }], + mounts: vec![cdi::Mount { + host_path: "host".to_owned(), + container_path: "container".to_owned(), + mount_type: None, + options: Default::default() + }], + hooks: Default::default() + }, + } + ); + } + + #[test] + fn test_dh_request_impl_get_instances() { + let (notifier, _) = watch::channel(vec![Arc::new(DiscoveredDevice::LocalDevice( + Device { + id: "my_local_device".to_owned(), + properties: HashMap::from([( + "MY_DEVICE_KEY".to_owned(), + "device_value".to_owned(), + )]), + mounts: Default::default(), + device_specs: Default::default(), + }, + "my_node".to_owned(), + ))]); + let req = DHRequestImpl { + endpoints: Default::default(), + notifier, + key: "my_config".to_owned(), + handler_name: "mock_handler".to_string(), + details: Default::default(), + properties: Default::default(), + extra_device_properties: HashMap::from([( + "MY_EXTRA_KEY".to_owned(), + "value".to_owned(), + )]), + kube_client: Arc::new(MockDiscoveryManagerKubeInterface::new()), + termination_notifier: Arc::new(Notify::new()), + }; + + assert_eq!( + req.get_instances().unwrap(), + vec![Instance { + metadata: ObjectMeta { + name: Some("my_config-e77db4".to_owned()), + ..Default::default() + }, + spec: InstanceSpec { + configuration_name: "my_config".to_owned(), + cdi_name: "akri.sh/my_config=e77db4".to_owned(), + capacity: 0, + broker_properties: HashMap::from([ + ("MY_EXTRA_KEY".to_owned(), "value".to_owned()), + ("MY_DEVICE_KEY".to_owned(), "device_value".to_owned()) + ]), + shared: false, + nodes: Default::default(), + device_usage: Default::default(), + } + }] + ); + } + + #[tokio::test] + async fn test_dh_request_impl_watch_devices() { + let (notifier, mut n_rec) = 
watch::channel(vec![]); + let (dh_send, dh_rec) = watch::channel(Default::default()); + let req = Arc::new(DHRequestImpl { + endpoints: RwLock::new(vec![dh_rec]), + notifier, + key: "my_config".to_owned(), + handler_name: "mock_handler".to_string(), + details: "discovery details".to_string(), + properties: vec![DiscoveryProperty { + name: "property_1".to_string(), + value: Some("value_1".to_string()), + value_from: None, + }], + extra_device_properties: HashMap::from([( + "MY_EXTRA_KEY".to_owned(), + "value".to_owned(), + )]), + kube_client: Arc::new(MockDiscoveryManagerKubeInterface::new()), + termination_notifier: Arc::new(Notify::new()), + }); + let req_ref = req.clone(); + + let (new_dh_sen, rec) = broadcast::channel(1); + + let task = tokio::spawn(async move { req_ref.watch_devices(rec).await }); + assert!(n_rec.borrow_and_update().is_empty()); + + let new_device = Arc::new(DiscoveredDevice::SharedDevice(Device { + id: "my_shared_device".to_owned(), + properties: HashMap::from([("ENV_KEY".to_owned(), "env_value".to_owned())]), + mounts: vec![], + device_specs: vec![], + })); + dh_send.send(vec![new_device.clone()]).unwrap(); + + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!(n_rec.borrow_and_update().clone(), vec![new_device]); + + let mut new_dh = MockDiscoveryHandlerEndpoint::new(); + let new_dh_senders = Arc::new(std::sync::Mutex::new(vec![])); + let senders_vec = new_dh_senders.clone(); + new_dh + .expect_get_name() + .returning(|| "mock_handler".to_string()); + new_dh + .expect_query() + .with( + mockall::predicate::always(), + mockall::predicate::eq(DiscoverRequest { + discovery_details: "discovery details".to_string(), + discovery_properties: HashMap::from([( + "property_1".to_owned(), + ByteData { + vec: Some(b"value_1".to_vec()), + }, + )]), + }), + ) + .returning(move |s, _| { + senders_vec.lock().unwrap().push(s); + async { Ok(()) }.boxed() + }); + assert!(new_dh_sen.send(Arc::new(new_dh)).is_ok()); + 
tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!(req.endpoints.read().await.len(), 2); + new_dh_senders.lock().unwrap().pop(); + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!(req.endpoints.read().await.len(), 1); + drop(n_rec); + assert!(task.await.is_ok()) + } + + #[tokio::test] + async fn test_dh_reg_register_endpoint() { + let (cdi_notifier, _) = watch::channel(Default::default()); + let (configuration_notifier, _) = mpsc::channel(2); + let dh_reg = DHRegistryImpl::new( + Arc::new(MockDiscoveryManagerKubeInterface::new()), + cdi_notifier, + configuration_notifier, + ); + let mut endpoint = MockDiscoveryHandlerEndpoint::new(); + let (close_1, closed) = tokio::sync::oneshot::channel::<()>(); + endpoint.expect_get_name().return_const("mock_handler"); + endpoint.expect_get_uid().return_const("mock_handler_local"); + endpoint.expect_closed().return_once(|| { + Box::pin(async { + let _ = closed.await; + }) + }); + endpoint.expect_is_closed().return_const(true); + dh_reg.register_endpoint(Arc::new(endpoint)).await; + assert!(dh_reg + .handlers + .read() + .await + .get("mock_handler") + .unwrap() + .get("mock_handler_local") + .is_some()); + + let mut endpoint = MockDiscoveryHandlerEndpoint::new(); + let (close_2, closed) = tokio::sync::oneshot::channel::<()>(); + endpoint.expect_get_name().return_const("mock_handler"); + endpoint + .expect_get_uid() + .return_const("mock_handler_local_2"); + endpoint.expect_closed().return_once(|| { + Box::pin(async { + let _ = closed.await; + }) + }); + endpoint.expect_is_closed().once().return_const(false); + endpoint.expect_is_closed().once().return_const(true); + dh_reg.register_endpoint(Arc::new(endpoint)).await; + assert!(dh_reg + .handlers + .read() + .await + .get("mock_handler") + .unwrap() + .get("mock_handler_local_2") + .is_some()); + + close_1.send(()).unwrap(); + tokio::time::sleep(Duration::from_millis(500)).await; + assert!(dh_reg + .handlers + .read() + .await + 
.get("mock_handler") + .unwrap() + .get("mock_handler_local") + .is_none()); + assert!(dh_reg + .handlers + .read() + .await + .get("mock_handler") + .unwrap() + .get("mock_handler_local_2") + .is_some()); + + close_2.send(()).unwrap(); + tokio::time::sleep(Duration::from_millis(500)).await; + assert!(!dh_reg.handlers.read().await.contains_key("mock_handler")) + } + + #[tokio::test] + async fn test_dh_reg_get_terminate_request() { + let (cdi_notifier, _) = watch::channel(Default::default()); + let (configuration_notifier, _) = mpsc::channel(2); + let kube_client = Arc::new(MockDiscoveryManagerKubeInterface::new()); + let dh_reg = DHRegistryImpl::new(kube_client.clone(), cdi_notifier, configuration_notifier); + let (req_not, _) = watch::channel(Default::default()); + let request = Arc::new(DHRequestImpl { + endpoints: Default::default(), + notifier: req_not, + key: "my-config".to_owned(), + handler_name: Default::default(), + details: Default::default(), + properties: Default::default(), + extra_device_properties: Default::default(), + kube_client, + termination_notifier: Arc::new(Notify::new()), + }); + dh_reg + .requests + .write() + .await + .insert("my-config".to_string(), request.clone()); + + assert!(dh_reg.get_request("my-config").await.is_some()); + assert!(dh_reg.get_request("my-other-config").await.is_none()); + + assert!(tokio::time::timeout( + Duration::from_millis(500), + request.termination_notifier.notified() + ) + .await + .is_err()); + let notif = request.termination_notifier.notified(); + + dh_reg.terminate_request("my-config").await; + assert!(tokio::time::timeout(Duration::from_millis(500), notif) + .await + .is_ok()); + } + + #[tokio::test] + async fn test_dh_reg_new_request() { + let (cdi_notifier, mut cdi_rec) = watch::channel(Default::default()); + let (configuration_notifier, mut config_rec) = mpsc::channel(2); + let kube_client = Arc::new(MockDiscoveryManagerKubeInterface::new()); + let dh_reg = DHRegistryImpl::new(kube_client.clone(), 
cdi_notifier, configuration_notifier); + + assert!(dh_reg + .new_request( + "my-config", + "mock_handler", + "discovery details", + &[], + HashMap::from([]), + "namespace" + ) + .await + .is_err_and(|e| { + matches!(e, + DiscoveryError::NoHandler(s) if s == *"mock_handler" + ) + })); + + let dev_senders = Arc::new(std::sync::Mutex::new(vec![])); + + let mut endpoint = MockDiscoveryHandlerEndpoint::new(); + let (close_1, closed) = tokio::sync::oneshot::channel::<()>(); + let local_senders = dev_senders.clone(); + endpoint.expect_get_name().return_const("mock_handler"); + endpoint.expect_get_uid().return_const("mock_handler_local"); + endpoint.expect_closed().return_once(|| { + Box::pin(async { + let _ = closed.await; + }) + }); + endpoint.expect_is_closed().return_const(true); + endpoint.expect_query().returning(move |s, _| { + local_senders.lock().unwrap().push(s); + async { Ok(()) }.boxed() + }); + dh_reg.register_endpoint(Arc::new(endpoint)).await; + let mut endpoint = MockDiscoveryHandlerEndpoint::new(); + let (close_2, closed) = tokio::sync::oneshot::channel::<()>(); + let local_senders = dev_senders.clone(); + endpoint.expect_get_name().return_const("mock_handler"); + endpoint + .expect_get_uid() + .return_const("mock_handler_local_2"); + endpoint.expect_closed().return_once(|| { + Box::pin(async { + let _ = closed.await; + }) + }); + endpoint.expect_is_closed().return_const(true); + endpoint.expect_query().returning(move |s, _| { + local_senders.lock().unwrap().push(s); + async { Ok(()) }.boxed() + }); + dh_reg.register_endpoint(Arc::new(endpoint)).await; + + assert!(dh_reg + .new_request( + "my-config", + "mock_handler", + "discovery details", + &[], + HashMap::from([]), + "namespace" + ) + .await + .is_ok()); + + assert!(cdi_rec.borrow_and_update().is_empty()); + assert_eq!(config_rec.try_recv(), Err(mpsc::error::TryRecvError::Empty)); + + dev_senders + .lock() + .unwrap() + .first() + .unwrap() + .send(vec![Arc::new(DiscoveredDevice::SharedDevice(Device { 
+ id: "dev_1".to_owned(), + properties: Default::default(), + mounts: Default::default(), + device_specs: Default::default(), + }))]) + .unwrap(); + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + config_rec.try_recv(), + Ok(ObjectRef::new("my-config").within("namespace")) + ); + + dev_senders + .lock() + .unwrap() + .get(1) + .unwrap() + .send(vec![Arc::new(DiscoveredDevice::SharedDevice(Device { + id: "dev_2".to_owned(), + properties: Default::default(), + mounts: Default::default(), + device_specs: Default::default(), + }))]) + .unwrap(); + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + config_rec.try_recv(), + Ok(ObjectRef::new("my-config").within("namespace")) + ); + + assert_eq!( + cdi_rec.borrow_and_update().clone(), + HashMap::from([( + "akri.sh/my-config".to_owned(), + Kind { + kind: "akri.sh/my-config".to_owned(), + annotations: Default::default(), + container_edits: vec![ContainerEdit::default()], + devices: vec![ + crate::device_manager::cdi::Device { + name: "cb2ad7".to_owned(), + annotations: Default::default(), + container_edits: Default::default(), + }, + crate::device_manager::cdi::Device { + name: "7bbc11".to_owned(), + annotations: Default::default(), + container_edits: Default::default(), + }, + ] + } + )]) + ); + + dev_senders.lock().unwrap().pop(); + close_2.send(()).unwrap(); + tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + config_rec.try_recv(), + Ok(ObjectRef::new("my-config").within("namespace")) + ); + assert_eq!( + cdi_rec.borrow_and_update().clone(), + HashMap::from([( + "akri.sh/my-config".to_owned(), + Kind { + kind: "akri.sh/my-config".to_owned(), + annotations: Default::default(), + container_edits: vec![Default::default()], + devices: vec![crate::device_manager::cdi::Device { + name: "cb2ad7".to_owned(), + annotations: Default::default(), + container_edits: Default::default(), + },] + } + )]) + ); + + dev_senders.lock().unwrap().pop(); + close_1.send(()).unwrap(); 
+ tokio::time::sleep(Duration::from_millis(500)).await; + assert_eq!( + config_rec.try_recv(), + Ok(ObjectRef::new("my-config").within("namespace")) + ); + assert!(cdi_rec.borrow_and_update().clone().is_empty()); + } +} diff --git a/agent/src/discovery_handler_manager/discovery_property_solver.rs b/agent/src/discovery_handler_manager/discovery_property_solver.rs new file mode 100644 index 000000000..c382042be --- /dev/null +++ b/agent/src/discovery_handler_manager/discovery_property_solver.rs @@ -0,0 +1,727 @@ +use akri_discovery_utils::discovery::v0::ByteData; +use akri_shared::akri::configuration::{ + DiscoveryProperty, DiscoveryPropertyKeySelector, DiscoveryPropertySource, +}; +use async_trait::async_trait; +use k8s_openapi::api::core::v1::{ConfigMap, Secret}; +use std::sync::Arc; + +use super::{DiscoveryError, DiscoveryManagerKubeInterface}; + +#[async_trait] +pub(super) trait PropertySolver { + async fn solve( + &self, + client: Arc, + ) -> Result, DiscoveryError>; +} + +#[async_trait] +impl PropertySolver for DiscoveryProperty { + async fn solve( + &self, + client: Arc, + ) -> Result, DiscoveryError> { + let value = if let Some(value) = self.value.as_ref() { + Some(ByteData { + vec: Some(value.as_bytes().to_vec()), + }) + } else if let Some(value_from) = self.value_from.as_ref() { + match value_from { + DiscoveryPropertySource::ConfigMapKeyRef(val) => { + solve_value_from_config_map(val, client.as_ref()).await? + } + DiscoveryPropertySource::SecretKeyRef(val) => { + solve_value_from_secret(val, client.as_ref()).await? 
+ } + } + } else { + Some(ByteData { vec: None }) + }; + Ok(value.map(|v| (self.name.clone(), v))) + } +} + +async fn solve_value_from_config_map( + config_map_key_selector: &DiscoveryPropertyKeySelector, + client: &dyn DiscoveryManagerKubeInterface, +) -> Result, DiscoveryError> { + let optional = config_map_key_selector.optional.unwrap_or_default(); + let config_map_name = &config_map_key_selector.name; + let config_map_namespace = &config_map_key_selector.namespace; + let config_map_key = &config_map_key_selector.key; + + let config_map = client + .namespaced(config_map_namespace) + .get(config_map_name) + .await?; + + if config_map.is_none() { + if optional { + return Ok(None); + } else { + return Err(DiscoveryError::UnsolvableProperty("ConfigMap")); + } + } + let config_map: ConfigMap = config_map.unwrap(); + if let Some(data) = config_map.data { + if let Some(v) = data.get(config_map_key) { + return Ok(Some(ByteData { + vec: Some(v.as_bytes().to_vec()), + })); + } + } + if let Some(binary_data) = config_map.binary_data { + if let Some(v) = binary_data.get(config_map_key) { + return Ok(Some(ByteData { + vec: Some(v.0.clone()), + })); + } + } + + // config_map key/value not found + if optional { + Ok(None) + } else { + Err(DiscoveryError::UnsolvableProperty("ConfigMap")) + } +} + +async fn solve_value_from_secret( + secret_key_selector: &DiscoveryPropertyKeySelector, + client: &dyn DiscoveryManagerKubeInterface, +) -> Result, DiscoveryError> { + let optional = secret_key_selector.optional.unwrap_or_default(); + let secret_name = &secret_key_selector.name; + let secret_namespace = &secret_key_selector.namespace; + let secret_key = &secret_key_selector.key; + + let secret = client.namespaced(secret_namespace).get(secret_name).await?; + if secret.is_none() { + if optional { + return Ok(None); + } else { + return Err(DiscoveryError::UnsolvableProperty("Secret")); + } + } + let secret: Secret = secret.unwrap(); + // All key-value pairs in the stringData field are 
internally merged into the data field + // we don't need to check string_data. + if let Some(data) = secret.data { + if let Some(v) = data.get(secret_key) { + return Ok(Some(ByteData { + vec: Some(v.0.clone()), + })); + } + } + + // secret key/value not found + if optional { + Ok(None) + } else { + Err(DiscoveryError::UnsolvableProperty("Secret")) + } +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use akri_shared::k8s::crud::MockApi; + use k8s_openapi::ByteString; + + use crate::discovery_handler_manager::mock::MockDiscoveryManagerKubeInterface; + + use super::*; + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_no_secret_found() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_secret"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: secret_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| Ok(None)); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + // solve_value_from_secret should return error if secret not found + let result = solve_value_from_secret(&selector, &mock_kube_client).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_no_secret_found_optional() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_secret"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: secret_name.to_string(), + namespace: namespace_name.to_string(), 
+ optional: Some(true), + }; + + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| Ok(None)); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + // solve_value_from_secret for an optional key should return None if secret not found + let result = solve_value_from_secret(&selector, &mock_kube_client).await; + assert!(result.unwrap().is_none()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_no_key() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_secret"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: secret_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| Ok(Default::default())); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + // solve_value_from_secret should return error if key in secret not found + assert!(solve_value_from_secret(&selector, &mock_kube_client) + .await + .is_err()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_no_key_optional() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: secret_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(true), + }; 
+ + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| Ok(Default::default())); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + // solve_value_from_secret for an optional key should return None if key in secret not found + let result = solve_value_from_secret(&selector, &mock_kube_client).await; + assert!(result.unwrap().is_none()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_no_value() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_secret"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: secret_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| { + let secret = Secret { + data: Some(BTreeMap::new()), + ..Default::default() + }; + Ok(Some(secret)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + // solve_value_from_secret should return error if no value in secret + let result = solve_value_from_secret(&selector, &mock_kube_client).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_no_value_optional() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: 
secret_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(true), + }; + + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| { + let secret = Secret { + data: Some(BTreeMap::new()), + ..Default::default() + }; + Ok(Some(secret)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + // solve_value_from_secret for an optional key should return None if key in secret not found + let result = solve_value_from_secret(&selector, &mock_kube_client).await; + assert!(result.unwrap().is_none()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_secret_data_value() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let secret_name = "secret_1"; + let key_in_secret = "key_in_secret"; + let value_in_secret = "value_in_secret"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_secret.to_string(), + name: secret_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_secret_api = MockApi::new(); + mock_secret_api + .expect_get() + .times(1) + .withf(move |name| name == secret_name) + .returning(move |_| { + let data = BTreeMap::from([( + key_in_secret.to_string(), + ByteString(value_in_secret.into()), + )]); + let secret = Secret { + data: Some(data), + ..Default::default() + }; + Ok(Some(secret)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .secret + .expect_namespaced() + .return_once(|_| Box::new(mock_secret_api)); + + let expected_result = ByteData { + vec: Some(value_in_secret.into()), + }; + + // solve_value_from_secret should return correct value if data value in secret + let result = solve_value_from_secret(&selector, 
&mock_kube_client).await; + assert_eq!(result.unwrap().unwrap(), expected_result); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_no_config_map_found() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| Ok(None)); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + // solve_value_from_config_map should return error if configMap not found + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_no_config_map_found_optional() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(true), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| Ok(None)); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + // solve_value_from_config_map for an optional key should return None if 
configMap not found + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert!(result.unwrap().is_none()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_no_key() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| Ok(Default::default())); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + // solve_value_from_config_map should return error if key in configMap not found + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_no_key_optional() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(true), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| Ok(Default::default())); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + // 
solve_value_from_config_map for an optional key should return None if key in configMap not found + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert!(result.unwrap().is_none()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_no_value() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| { + let config_map = ConfigMap { + data: Some(BTreeMap::new()), + binary_data: Some(BTreeMap::new()), + ..Default::default() + }; + Ok(Some(config_map)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + // solve_value_from_config_map should return error if no value in configMap + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_no_value_optional() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(true), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move 
|_| { + let config_map = ConfigMap { + data: Some(BTreeMap::new()), + binary_data: Some(BTreeMap::new()), + ..Default::default() + }; + Ok(Some(config_map)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + // solve_value_from_config_map for an optional key should return None if key in configMap not found + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert!(result.unwrap().is_none()); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_data_value() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + let value_in_config_map = "value_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| { + let data = BTreeMap::from([( + key_in_config_map.to_string(), + value_in_config_map.to_string(), + )]); + let config_map = ConfigMap { + data: Some(data), + binary_data: Some(BTreeMap::new()), + ..Default::default() + }; + Ok(Some(config_map)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + let expected_result = ByteData { + vec: Some(value_in_config_map.into()), + }; + + // solve_value_from_config_map should return correct value if data value in configMap + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert_eq!(result.unwrap().unwrap(), expected_result); + } + + #[tokio::test] + async 
fn test_get_discovery_properties_value_from_config_map_binary_data_value() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + let value_in_config_map = "value_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| { + let binary_data = BTreeMap::from([( + key_in_config_map.to_string(), + ByteString(value_in_config_map.into()), + )]); + let config_map = ConfigMap { + data: Some(BTreeMap::new()), + binary_data: Some(binary_data), + ..Default::default() + }; + Ok(Some(config_map)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + let expected_result = ByteData { + vec: Some(value_in_config_map.into()), + }; + + // solve_value_from_config_map should return correct value if binary data value in configMap + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert_eq!(result.unwrap().unwrap(), expected_result); + } + + #[tokio::test] + async fn test_get_discovery_properties_value_from_config_map_data_and_binary_data_value() { + let _ = env_logger::builder().is_test(true).try_init(); + let namespace_name = "namespace_name"; + let config_map_name = "config_map_1"; + let key_in_config_map = "key_in_config_map"; + let value_in_config_map = "value_in_config_map"; + let binary_value_in_config_map = "binary_value_in_config_map"; + + let selector = DiscoveryPropertyKeySelector { + key: key_in_config_map.to_string(), + name: config_map_name.to_string(), + namespace: 
namespace_name.to_string(), + optional: Some(false), + }; + + let mut mock_cm_api = MockApi::new(); + mock_cm_api + .expect_get() + .times(1) + .withf(move |name| name == config_map_name) + .returning(move |_| { + let data = BTreeMap::from([( + key_in_config_map.to_string(), + value_in_config_map.to_string(), + )]); + let binary_data = BTreeMap::from([( + key_in_config_map.to_string(), + ByteString(binary_value_in_config_map.into()), + )]); + let config_map = ConfigMap { + data: Some(data), + binary_data: Some(binary_data), + ..Default::default() + }; + Ok(Some(config_map)) + }); + let mut mock_kube_client = MockDiscoveryManagerKubeInterface::new(); + mock_kube_client + .config + .expect_namespaced() + .return_once(|_| Box::new(mock_cm_api)); + + let expected_result = ByteData { + vec: Some(value_in_config_map.into()), + }; + + // solve_value_from_config_map should return value from data if both data and binary data value exist + let result = solve_value_from_config_map(&selector, &mock_kube_client).await; + assert_eq!(result.unwrap().unwrap(), expected_result); + } +} diff --git a/agent/src/discovery_handler_manager/embedded_handler.rs b/agent/src/discovery_handler_manager/embedded_handler.rs new file mode 100644 index 000000000..c82344320 --- /dev/null +++ b/agent/src/discovery_handler_manager/embedded_handler.rs @@ -0,0 +1,174 @@ +use std::sync::Arc; + +use akri_discovery_utils::discovery::{ + v0::{discovery_handler_server::DiscoveryHandler, DiscoverRequest, DiscoverResponse}, + DiscoverStream, +}; +use akri_shared::os::env_var::{ActualEnvVarQuery, EnvVarQuery}; +use async_trait::async_trait; +use tokio::{select, sync::watch}; +use tokio_stream::{wrappers::ReceiverStream, StreamExt}; +use tonic::IntoRequest; + +/// Label of environment variable that, when set, enables the embedded debug echo discovery handler +#[cfg(any(test, feature = "agent-full"))] +pub const ENABLE_DEBUG_ECHO_LABEL: &str = "ENABLE_DEBUG_ECHO"; + +use super::{ + discovery_handler_registry::{ 
+        DiscoveredDevice, DiscoveryHandlerEndpoint, DiscoveryHandlerRegistry,
+    },
+    DiscoveryError,
+};
+
+/// A discovery handler compiled into the agent and exposed to the registry as an endpoint.
+struct EmbeddedHandlerEndpoint {
+    /// Discovery handler name; also used to build the endpoint uid (`embedded-<name>`).
+    name: String,
+    /// When true, discovered devices are reported as `SharedDevice`, otherwise as
+    /// `LocalDevice` tagged with `node_name`.
+    shared: bool,
+    handler: Box>,
+    node_name: String,
+}
+
+impl EmbeddedHandlerEndpoint {
+    /// Forwards every message read from `stream` to `sender`, replacing the previously
+    /// published device list each time.
+    ///
+    /// Returns when the stream ends or yields an error, when `sender.closed()` resolves,
+    /// or when the process receives SIGTERM.
+    async fn handle_stream(
+        uid: String,
+        node_name: String,
+        shared: bool,
+        sender: watch::Sender>>,
+        mut stream: ReceiverStream>,
+    ) {
+        let mut signal =
+            tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap();
+        loop {
+            let msg = select! {
+                _ = sender.closed() => return,
+                _ = signal.recv() => return,
+                msg = stream.try_next() => match msg {
+                    Ok(Some(msg)) => msg,
+                    Ok(None) => {
+                        error!("Discovery Handler {} closed the stream unexpectedly", uid);
+                        return
+                    },
+                    Err(e) => {
+                        error!("Received error on gRPC stream for {}: {}", uid, e);
+                        return
+                    },
+                },
+            };
+            let devices = msg
+                .devices
+                .into_iter()
+                .map(|d| {
+                    Arc::new(match shared {
+                        true => DiscoveredDevice::SharedDevice(d),
+                        false => DiscoveredDevice::LocalDevice(d, node_name.clone()),
+                    })
+                })
+                .collect();
+            sender.send_replace(devices);
+        }
+    }
+}
+
+#[async_trait]
+impl DiscoveryHandlerEndpoint for EmbeddedHandlerEndpoint {
+    /// Starts a discovery on the embedded handler and spawns a task that streams its
+    /// results into `sender`.
+    ///
+    /// Returns `InvalidDiscoveryDetails` when the handler answers `InvalidArgument`, and
+    /// `UnavailableDiscoveryHandler` for any other error status.
+    async fn query(
+        &self,
+        sender: watch::Sender>>,
+        query_body: DiscoverRequest,
+    ) -> Result<(), DiscoveryError> {
+        let stream = match self.handler.discover(query_body.into_request()).await {
+            Ok(r) => r.into_inner(),
+            Err(e) => {
+                match e.code() {
+                    tonic::Code::InvalidArgument => {
+                        warn!("EmbeddedHandlerEndpoint::query - invalid arguments provided to DiscoveryHandler");
+                        return Err(DiscoveryError::InvalidDiscoveryDetails);
+                    }
+                    _ => {
+                        error!("EmbeddedHandlerEndpoint::query - could not connect to DiscoveryHandler at endpoint {} with error {}", self.get_uid(), e);
+                        // We do not consider the DH as unavailable here, as this can be a temporary error
+                        return Err(DiscoveryError::UnavailableDiscoveryHandler(self.get_uid()));
+                    }
+                }
+            }
+        };
+        tokio::spawn(Self::handle_stream(
+            self.get_uid(),
+            self.node_name.to_owned(),
+            self.shared.to_owned(),
+            sender,
+            stream,
+        ));
+        Ok(())
+    }
+
+    fn get_name(&self) -> String {
+        self.name.to_owned()
+    }
+    fn get_uid(&self) -> String {
+        format!("embedded-{}", self.name)
+    }
+
+    /// Never resolves: an embedded handler is never reported as closed.
+    async fn closed(&self) {
+        std::future::pending().await
+    }
+    fn is_closed(&self) -> bool {
+        false
+    }
+}
+
+/// Registers the embedded discovery handlers, reading settings from the process environment.
+pub(super) async fn register_handlers(reg: &dyn DiscoveryHandlerRegistry, node_name: String) {
+    let env_var_query = ActualEnvVarQuery {};
+    inner_register_discovery_handlers(reg, &env_var_query, node_name).await;
+}
+
+/// Registers the debug echo handler when `ENABLE_DEBUG_ECHO_LABEL` is set, plus every
+/// handler enabled through a cargo feature.
+///
+/// Panics if debug echo is enabled but `DEBUG_ECHO_INSTANCES_SHARED_LABEL` is unset or is
+/// not a boolean.
+async fn inner_register_discovery_handlers(
+    reg: &dyn DiscoveryHandlerRegistry,
+    env: &dyn EnvVarQuery,
+    node_name: String,
+) {
+    if env.get_env_var(ENABLE_DEBUG_ECHO_LABEL).is_ok() {
+        // Still a startup panic on misconfiguration, but one that names the culprit.
+        let shared: bool = env
+            .get_env_var(akri_debug_echo::DEBUG_ECHO_INSTANCES_SHARED_LABEL)
+            .expect("debug echo is enabled but its instances-shared environment variable is not set")
+            .parse()
+            .expect("debug echo instances-shared environment variable must be `true` or `false`");
+        reg.register_endpoint(Arc::new(EmbeddedHandlerEndpoint {
+            name: akri_debug_echo::DISCOVERY_HANDLER_NAME.to_string(),
+            shared,
+            handler: Box::new(akri_debug_echo::discovery_handler::DiscoveryHandlerImpl::new(None)),
+            node_name: node_name.clone(),
+        }))
+        .await;
+    }
+    #[cfg(feature = "onvif-feat")]
+    reg.register_endpoint(Arc::new(EmbeddedHandlerEndpoint {
+        name: akri_onvif::DISCOVERY_HANDLER_NAME.to_string(),
+        shared: akri_onvif::SHARED,
+        handler: Box::new(akri_onvif::discovery_handler::DiscoveryHandlerImpl::new(
+            None,
+        )),
+        node_name: node_name.clone(),
+    }))
+    .await;
+    #[cfg(feature = "udev-feat")]
+    reg.register_endpoint(Arc::new(EmbeddedHandlerEndpoint {
+        name: akri_udev::DISCOVERY_HANDLER_NAME.to_string(),
+        shared: akri_udev::SHARED,
+        handler: Box::new(akri_udev::discovery_handler::DiscoveryHandlerImpl::new(
+            None,
+        )),
+        node_name: node_name.clone(),
+    }))
+    .await;
+    #[cfg(feature = "opcua-feat")]
+    reg.register_endpoint(Arc::new(EmbeddedHandlerEndpoint {
+        name: akri_opcua::DISCOVERY_HANDLER_NAME.to_string(),
+        shared: akri_opcua::SHARED,
+        handler:
Box::new(akri_opcua::discovery_handler::DiscoveryHandlerImpl::new( + None, + )), + node_name: node_name.clone(), + })) + .await; +} diff --git a/agent/src/discovery_handler_manager/mod.rs b/agent/src/discovery_handler_manager/mod.rs new file mode 100644 index 000000000..183881539 --- /dev/null +++ b/agent/src/discovery_handler_manager/mod.rs @@ -0,0 +1,105 @@ +pub mod discovery_handler_registry; +mod discovery_property_solver; +#[cfg(any(test, feature = "agent-full"))] +mod embedded_handler; +mod registration_socket; + +use std::{collections::HashMap, sync::Arc}; + +use akri_shared::{akri::configuration::Configuration, k8s::crud::IntoApi}; +use k8s_openapi::api::core::v1::{ConfigMap, Secret}; + +use kube_runtime::reflector::ObjectRef; +use thiserror::Error; +use tokio::sync::{mpsc, watch}; + +use self::discovery_handler_registry::DHRegistryImpl; + +pub use registration_socket::run_registration_server; + +#[derive(Error, Debug)] +pub enum DiscoveryError { + #[error("Invalid discovery details provided to discovery handler")] + InvalidDiscoveryDetails, + + #[error("Discovery Handler {0} is unavailable")] + UnavailableDiscoveryHandler(String), + + #[error("discoveryProperties' referenced {0} not found")] + UnsolvableProperty(&'static str), + + #[error(transparent)] + KubeError(#[from] kube::Error), + + #[error("No registered handler for {0}")] + NoHandler(String), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +pub fn new_registry( + kube_client: Arc, +) -> ( + watch::Receiver>, + impl discovery_handler_registry::DiscoveryHandlerRegistry, + mpsc::Receiver>, +) { + let (sender, receiver) = watch::channel(Default::default()); + let (configuration_notifier, notifier) = mpsc::channel(10); + let registry = DHRegistryImpl::new(kube_client, sender, configuration_notifier); + (receiver, registry, notifier) +} + +pub trait DiscoveryManagerKubeInterface: IntoApi + IntoApi {} + +impl + IntoApi> DiscoveryManagerKubeInterface for T {} + +#[cfg(test)] +mod mock { + + 
use akri_shared::k8s::crud::{Api, IntoApi, MockIntoApi}; + use k8s_openapi::api::core::v1::{ConfigMap, Secret}; + #[derive(Default)] + pub struct MockDiscoveryManagerKubeInterface { + pub secret: MockIntoApi, + pub config: MockIntoApi, + } + + impl MockDiscoveryManagerKubeInterface { + pub fn new() -> Self { + Self { + secret: MockIntoApi::new(), + config: MockIntoApi::new(), + } + } + } + + impl IntoApi for MockDiscoveryManagerKubeInterface { + fn all(&self) -> Box> { + self.secret.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.secret.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.secret.default_namespaced() + } + } + + impl IntoApi for MockDiscoveryManagerKubeInterface { + fn all(&self) -> Box> { + self.config.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.config.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.config.default_namespaced() + } + } +} diff --git a/agent/src/discovery_handler_manager/registration_socket.rs b/agent/src/discovery_handler_manager/registration_socket.rs new file mode 100644 index 000000000..ffdefae34 --- /dev/null +++ b/agent/src/discovery_handler_manager/registration_socket.rs @@ -0,0 +1,345 @@ +use std::{convert::TryFrom, pin::Pin, sync::Arc}; + +use akri_discovery_utils::discovery::v0::{ + discovery_handler_client::DiscoveryHandlerClient, + register_discovery_handler_request::EndpointType, registration_server::Registration, + DiscoverRequest, DiscoverResponse, Empty, RegisterDiscoveryHandlerRequest, +}; +use akri_shared::uds::unix_stream; +use async_trait::async_trait; +use futures::{FutureExt, Stream, StreamExt, TryFutureExt}; +use tokio::{select, sync::watch}; +use tokio_stream::StreamExt as _; +use tonic::{transport::Channel, Request, Response, Status}; + +use crate::util::stopper::Stopper; + +use super::{ + discovery_handler_registry::{ + DiscoveredDevice, DiscoveryHandlerEndpoint, DiscoveryHandlerRegistry, + }, + DiscoveryError, 
+};
+
+/// A discovery handler that registered itself with the agent and is reached over gRPC,
+/// either through a Unix domain socket or a network address.
+struct NetworkEndpoint {
+    name: String,
+    endpoint: String,
+    endpoint_type: EndpointType,
+    /// Stopped once a connection attempt fails; ends every running query for this endpoint.
+    stopped: Stopper,
+    shared: bool,
+    node_name: String,
+}
+
+impl NetworkEndpoint {
+    /// Builds an endpoint from a registration request.
+    ///
+    /// Panics if `req.endpoint_type` is not a known `EndpointType`; callers are expected
+    /// to validate it first (see `register_discovery_handler`).
+    fn new(req: RegisterDiscoveryHandlerRequest, node_name: String) -> Self {
+        NetworkEndpoint {
+            name: req.name,
+            endpoint: req.endpoint,
+            stopped: Stopper::new(),
+            shared: req.shared,
+            endpoint_type: EndpointType::try_from(req.endpoint_type).unwrap(),
+            node_name,
+        }
+    }
+
+    /// Connects a gRPC client to the discovery handler according to its endpoint type.
+    async fn get_client(&self) -> Result, tonic::transport::Error> {
+        match self.endpoint_type {
+            EndpointType::Uds => {
+                let socket = self.endpoint.clone();
+                // The URI is a placeholder: the connector below ignores it and dials the
+                // Unix socket instead.
+                Ok(DiscoveryHandlerClient::new(
+                    tonic::transport::Endpoint::try_from("http://[::1]:50051")
+                        .unwrap()
+                        .connect_with_connector(tower::service_fn(move |_: hyper::Uri| {
+                            tokio::net::UnixStream::connect(socket.clone())
+                        }))
+                        .await?,
+                ))
+            }
+            EndpointType::Network => DiscoveryHandlerClient::connect(self.endpoint.clone()).await,
+        }
+    }
+
+    /// Forwards every message read from `stream` to `sender`, replacing the previously
+    /// published device list each time.
+    ///
+    /// Returns when `stopper` is stopped, when `sender.closed()` resolves, on SIGTERM, or
+    /// when the stream ends or yields an error.
+    async fn handle_stream(
+        stopper: Stopper,
+        uid: String,
+        node_name: String,
+        shared: bool,
+        sender: watch::Sender>>,
+        mut stream: Pin> + Send>>,
+    ) {
+        let mut signal =
+            tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap();
+        loop {
+            let msg = select! {
+                // This means all queries for this endpoint must end.
+                _ = stopper.stopped() => return,
+                // This means all receivers were dropped (i.e. no one cares about this query anymore)
+                _ = sender.closed() => return,
+                _ = signal.recv() => return,
+                msg = stream.try_next() => match msg {
+                    Ok(Some(msg)) => msg,
+                    Ok(None) => {
+                        error!("Discovery Handler {} closed the stream unexpectedly", uid);
+                        return
+                    },
+                    Err(e) => {
+                        error!("Received error on gRPC stream for {}: {}", uid, e);
+                        return
+                    },
+                },
+            };
+            trace!("Received new message from discovery handler: {:?}", msg);
+            let devices = msg
+                .devices
+                .into_iter()
+                .map(|d| {
+                    Arc::new(match shared {
+                        true => DiscoveredDevice::SharedDevice(d),
+                        false => DiscoveredDevice::LocalDevice(d, node_name.clone()),
+                    })
+                })
+                .collect();
+            sender.send_replace(devices);
+        }
+    }
+}
+
+#[async_trait]
+impl DiscoveryHandlerEndpoint for NetworkEndpoint {
+    /// Starts a discovery on the remote handler and spawns a task that streams its results
+    /// into `sender`.
+    ///
+    /// Returns `UnavailableDiscoveryHandler` if the endpoint is already stopped, if the
+    /// connection fails (which also stops the endpoint), or if `discover` fails with
+    /// anything but `InvalidArgument`; the latter maps to `InvalidDiscoveryDetails`.
+    async fn query(
+        &self,
+        sender: watch::Sender>>,
+        query_body: DiscoverRequest,
+    ) -> Result<(), DiscoveryError> {
+        if self.stopped.is_stopped() {
+            return Err(DiscoveryError::UnavailableDiscoveryHandler(self.get_uid()));
+        }
+        let stream = match self.get_client().await {
+            Ok(mut discovery_handler_client) => {
+                trace!(
+                    "NetworkEndpoint::query - connecting to external {} discovery handler over network",
+                    self.name
+                );
+                match discovery_handler_client.discover(query_body).await {
+                    Ok(device_update_receiver) => device_update_receiver.into_inner(),
+                    Err(e) => {
+                        match e.code() {
+                            tonic::Code::InvalidArgument => {
+                                warn!("NetworkEndpoint::query - invalid arguments provided to DiscoveryHandler");
+                                return Err(DiscoveryError::InvalidDiscoveryDetails);
+                            }
+                            _ => {
+                                error!("NetworkEndpoint::query - could not connect to DiscoveryHandler at endpoint {} with error {}", self.get_uid(), e);
+                                // We do not consider the DH as unavailable here, as this can be a temporary error
+                                return Err(DiscoveryError::UnavailableDiscoveryHandler(
+                                    self.get_uid(),
+                                ));
+                            }
+                        }
+                    }
+                }
+            }
+            Err(e) => {
+                error!("NetworkEndpoint::query - failed to connect to {} discovery handler over network with error {}", self.name, e);
+                // We failed to connect to Discovery Handler, consider it offline now
+                self.stopped.stop();
+                return Err(DiscoveryError::UnavailableDiscoveryHandler(self.get_uid()));
+            }
+        };
+        tokio::spawn(Self::handle_stream(
+            self.stopped.to_owned(),
+            self.get_uid(),
+            self.node_name.to_owned(),
+            self.shared.to_owned(),
+            sender,
+            stream.boxed(),
+        ));
+        Ok(())
+    }
+
+    fn get_name(&self) -> String {
+        self.name.to_owned()
+    }
+    fn get_uid(&self) -> String {
+        format!("{}@{}", self.name, self.endpoint)
+    }
+
+    async fn closed(&self) {
+        self.stopped.stopped().await
+    }
+    fn is_closed(&self) -> bool {
+        self.stopped.is_stopped()
+    }
+}
+
+/// gRPC registration service: turns each registration request into a `NetworkEndpoint`
+/// stored in the discovery handler registry.
+struct RegistrationEndpoint {
+    inner: Arc,
+    node_name: String,
+}
+#[async_trait]
+impl Registration for RegistrationEndpoint {
+    /// Registers the discovery handler described by `request`.
+    ///
+    /// Answers `INVALID_ARGUMENT` when the request carries an unknown endpoint type.
+    async fn register_discovery_handler(
+        &self,
+        request: Request,
+    ) -> Result, Status> {
+        let req = request.into_inner();
+        // Reject unknown endpoint types here: `NetworkEndpoint::new` unwraps the same
+        // conversion and would otherwise panic on a malformed request.
+        if EndpointType::try_from(req.endpoint_type).is_err() {
+            warn!(
+                "register_discovery_handler - rejecting {}: unknown endpoint type {}",
+                req.name, req.endpoint_type
+            );
+            return Err(Status::invalid_argument(format!(
+                "unknown endpoint type {}",
+                req.endpoint_type
+            )));
+        }
+        self.inner
+            .register_endpoint(Arc::new(NetworkEndpoint::new(req, self.node_name.clone())))
+            .await;
+        Ok(Response::new(Empty {}))
+    }
+}
+
+/// Serves the discovery handler registration API on the Unix socket at `socket_path`
+/// until SIGTERM is received, then removes the socket file.
+///
+/// Embedded handlers are registered first when built for tests or with `agent-full`.
+pub async fn run_registration_server(
+    dh_registry: Arc,
+    socket_path: &str,
+    node_name: String,
+) -> Result<(), Box> {
+    info!("internal_run_registration_server - entered");
+    trace!(
+        "internal_run_registration_server - registration server listening on socket {}",
+        socket_path
+    );
+
+    #[cfg(any(test, feature = "agent-full"))]
+    super::embedded_handler::register_handlers(dh_registry.as_ref(), node_name.clone()).await;
+    // Delete socket in case previously created/used
+    std::fs::remove_file(socket_path).unwrap_or(());
+    let incoming = {
+        let uds =
+            tokio::net::UnixListener::bind(socket_path).expect("Failed to bind to socket path");
+
+        async_stream::stream!
{ + loop { + let item = uds.accept().map_ok(|(st, _)| unix_stream::UnixStream(st)).await; + yield item; + } + } + }; + let mut signal = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); + tonic::transport::Server::builder() + .add_service( + akri_discovery_utils::discovery::v0::registration_server::RegistrationServer::new( + RegistrationEndpoint { + inner: dh_registry, + node_name, + }, + ), + ) + .serve_with_incoming_shutdown(incoming, signal.recv().map(|_| ())) + .await?; + trace!( + "internal_run_registration_server - gracefully shutdown ... deleting socket {}", + socket_path + ); + std::fs::remove_file(socket_path).unwrap_or(()); + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use akri_discovery_utils::discovery::v0::Device; + use tokio::sync::mpsc; + + use super::*; + + #[tokio::test] + async fn test_handle_stream_local() { + let stopper = Stopper::new(); + let uid = "foo".to_owned(); + let node_name = "node-a".to_owned(); + let shared = false; + let (sender, mut receiver) = watch::channel(Default::default()); + let (st_sender, st_rec) = mpsc::channel(1); + let stream = tokio_stream::wrappers::ReceiverStream::new(st_rec); + + let task = tokio::spawn(NetworkEndpoint::handle_stream( + stopper, + uid, + node_name.clone(), + shared, + sender, + stream.boxed(), + )); + assert!(st_sender + .send(Ok(DiscoverResponse { + devices: vec![Device { + id: "bar".to_string(), + ..Default::default() + }] + })) + .await + .is_ok()); + assert!( + tokio::time::timeout(Duration::from_millis(500), receiver.changed()) + .await + .is_ok() + ); + let val = receiver.borrow_and_update().clone(); + assert_eq!( + val, + vec![Arc::new(DiscoveredDevice::LocalDevice( + Device { + id: "bar".to_string(), + ..Default::default() + }, + node_name.to_owned() + ))] + ); + + drop(receiver); + assert!(tokio::time::timeout(Duration::from_millis(500), task) + .await + .is_ok()); + } + + #[tokio::test] + async fn test_handle_stream_shared() { + 
let stopper = Stopper::new(); + let uid = "foo".to_owned(); + let node_name = "node-a".to_owned(); + let shared = true; + let (sender, mut receiver) = watch::channel(Default::default()); + let (st_sender, st_rec) = mpsc::channel(1); + let stream = tokio_stream::wrappers::ReceiverStream::new(st_rec); + + let task = tokio::spawn(NetworkEndpoint::handle_stream( + stopper.clone(), + uid, + node_name.clone(), + shared, + sender, + stream.boxed(), + )); + assert!(st_sender + .send(Ok(DiscoverResponse { + devices: vec![Device { + id: "bar".to_string(), + ..Default::default() + }] + })) + .await + .is_ok()); + assert!( + tokio::time::timeout(Duration::from_millis(500), receiver.changed()) + .await + .is_ok() + ); + let val = receiver.borrow_and_update().clone(); + assert_eq!( + val, + vec![Arc::new(DiscoveredDevice::SharedDevice(Device { + id: "bar".to_string(), + ..Default::default() + }))] + ); + + stopper.stop(); + assert!(tokio::time::timeout(Duration::from_millis(500), task) + .await + .is_ok()); + } +} diff --git a/agent/src/main.rs b/agent/src/main.rs index 36522ba52..efe1f33f5 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -3,6 +3,9 @@ extern crate hyper; extern crate log; #[macro_use] extern crate serde_derive; +mod device_manager; +mod discovery_handler_manager; +mod plugin_manager; mod util; use akri_shared::akri::{metrics::run_metrics_server, API_NAMESPACE}; @@ -11,18 +14,6 @@ use std::{ collections::HashMap, env, sync::{Arc, Mutex}, - time::Duration, -}; -use tokio::sync::broadcast; -#[cfg(feature = "agent-full")] -use util::registration::register_embedded_discovery_handlers; -use util::{ - config_action, - constants::{ - NEW_DISCOVERY_HANDLER_CHANNEL_CAPACITY, SLOT_RECONCILIATION_SLOT_GRACE_PERIOD_SECS, - }, - registration::{run_registration_server, DiscoveryHandlerName}, - slot_reconciliation::periodic_slot_reconciliation, }; /// This is the entry point for the Akri Agent. 
@@ -45,44 +36,68 @@ async fn main() -> Result<(), Box let mut tasks = Vec::new(); let node_name = env::var("AGENT_NODE_NAME")?; - // Start server for Prometheus metrics - tasks.push(tokio::spawn(async move { - run_metrics_server().await.unwrap(); - })); - - let discovery_handler_map = Arc::new(Mutex::new(HashMap::new())); - let discovery_handler_map_clone = discovery_handler_map.clone(); - let (new_discovery_handler_sender, _): ( - broadcast::Sender, - broadcast::Receiver, - ) = broadcast::channel(NEW_DISCOVERY_HANDLER_CHANNEL_CAPACITY); - let new_discovery_handler_sender_clone = new_discovery_handler_sender.clone(); - #[cfg(feature = "agent-full")] - register_embedded_discovery_handlers(discovery_handler_map_clone.clone())?; - - // Start registration service for registering `DiscoveryHandlers` - tasks.push(tokio::spawn(async move { - run_registration_server(discovery_handler_map_clone, new_discovery_handler_sender) - .await - .unwrap(); - })); + { + let kube_client = Arc::new(akri_shared::k8s::KubeImpl::new().await?); + + // Start server for Prometheus metrics + tasks.push(tokio::spawn(async move { + run_metrics_server().await.unwrap(); + })); - tasks.push(tokio::spawn(async move { - let slot_grace_period = Duration::from_secs(SLOT_RECONCILIATION_SLOT_GRACE_PERIOD_SECS); - periodic_slot_reconciliation(slot_grace_period) + let (device_notifier, discovery_handler_registry, conf_notifier) = + discovery_handler_manager::new_registry(kube_client.clone()); + + let dh_registry = Arc::new(discovery_handler_registry); + let local_dh_reg = dh_registry.clone(); + let local_node_name = node_name.clone(); + + tasks.push(tokio::spawn(async { + discovery_handler_manager::run_registration_server( + local_dh_reg, + &akri_discovery_utils::get_registration_socket(), + local_node_name, + ) .await - .unwrap(); - })); - - tasks.push(tokio::spawn(async move { - config_action::do_config_watch( - discovery_handler_map, - new_discovery_handler_sender_clone, - node_name, - ) - .await - 
.unwrap() - })); + .unwrap() + })); + + let im_device_manager = Arc::new(device_manager::InMemoryManager::new(device_notifier)); + + let device_plugin_manager = Arc::new( + plugin_manager::device_plugin_instance_controller::DevicePluginManager::new( + node_name.clone(), + kube_client.clone(), + im_device_manager.clone(), + ), + ); + + let (instances_cache, task) = plugin_manager::device_plugin_instance_controller::start_dpm( + device_plugin_manager.clone(), + ); + tasks.push(task); + + tasks.push(tokio::spawn( + plugin_manager::device_plugin_slot_reclaimer::start_reclaimer(device_plugin_manager), + )); + + let config_controller_context = Arc::new( + util::discovery_configuration_controller::ControllerContext { + instances_cache, + dh_registry, + client: kube_client.clone(), + agent_instance_name: node_name.clone(), + error_backoffs: Mutex::new(HashMap::new()), + }, + ); + + tasks.push(tokio::spawn(async { + util::discovery_configuration_controller::start_controller( + config_controller_context, + conf_notifier, + ) + .await; + })); + } futures::future::try_join_all(tasks).await?; info!("{} Agent end", API_NAMESPACE); diff --git a/agent/src/plugin_manager/device_plugin_instance_controller.rs b/agent/src/plugin_manager/device_plugin_instance_controller.rs new file mode 100644 index 000000000..4ceaa24db --- /dev/null +++ b/agent/src/plugin_manager/device_plugin_instance_controller.rs @@ -0,0 +1,1517 @@ +use std::collections::HashSet; +use std::fmt::Display; +use std::str::FromStr; +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use akri_shared::{akri::instance::Instance, k8s::crud::IntoApi}; +use async_trait::async_trait; +use futures::StreamExt; +use itertools::Itertools; +use kube::api::{Patch, PatchParams}; +use kube::core::{NotUsed, Object, ObjectMeta, TypeMeta}; +use kube::ResourceExt; +use kube_runtime::controller::Action; +use kube_runtime::reflector::Store; +use kube_runtime::Controller; +use thiserror::Error; +use tokio::sync::{watch, Mutex, 
RwLock}; +use tokio::task::JoinHandle; +use tonic::Request; + +use crate::device_manager::{cdi, DeviceManager}; +use crate::plugin_manager::v1beta1::ContainerAllocateResponse; +use crate::util::stopper::Stopper; + +use super::device_plugin_runner::{ + serve_and_register_plugin, DeviceUsageStream, InternalDevicePlugin, +}; +use super::v1beta1::{AllocateRequest, AllocateResponse, ListAndWatchResponse}; + +#[derive(Error, Debug)] +pub enum DevicePluginError { + #[error("Slot already in use")] + SlotInUse, + + #[error("No slots left for device")] + NoSlot, + + #[error("Device usage parse error")] + UsageParseError, + + #[error("Unknown device: {0}")] + UnknownDevice(String), + + #[error(transparent)] + RunnerError(#[from] super::device_plugin_runner::RunnerError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +#[derive(Debug, Clone, PartialEq)] +enum DeviceUsage { + Unused, + Node(String), + Configuration { vdev: String, node: String }, +} + +impl DeviceUsage { + fn is_owned_by(&self, node: &str) -> bool { + match self { + Self::Node(n) if n == node => true, + Self::Configuration { node: n, .. } if n == node => true, + _ => false, + } + } +} + +impl Display for DeviceUsage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DeviceUsage::Unused => write!(f, ""), + DeviceUsage::Node(node) => write!(f, "{}", node), + DeviceUsage::Configuration { vdev, node } => write!(f, "C:{}:{}", vdev, node), + } + } +} + +impl FromStr for DeviceUsage { + type Err = DevicePluginError; + fn from_str(val: &str) -> Result { + if val.is_empty() { + Ok(Self::Unused) + } else { + match val.split(':').collect_vec()[..] 
{
+                ["C", vdev, node] => Ok(Self::Configuration {
+                    vdev: vdev.to_owned(),
+                    node: node.to_owned(),
+                }),
+                [node] => Ok(Self::Node(node.to_owned())),
+                _ => Err(DevicePluginError::UsageParseError),
+            }
+        }
+    }
+}
+
+/// Extracts the numeric slot index from a slot name: the part after the last `-`.
+///
+/// Returns `UsageParseError` when there is no `-` or the suffix is not an unsigned integer.
+fn parse_slot_id(st: &str) -> Result {
+    usize::from_str(
+        st.rsplit_once('-')
+            .ok_or(DevicePluginError::UsageParseError)?
+            .1,
+    )
+    .or(Err(DevicePluginError::UsageParseError))
+}
+
+/// Parses every `slot name -> usage string` entry into `(slot index, DeviceUsage)`.
+///
+/// Indices are NOT checked against any capacity here; callers must do so before indexing.
+fn construct_slots_map(
+    slots: &HashMap,
+) -> Result, DevicePluginError> {
+    slots
+        .iter()
+        .map(|(k, v)| Ok((parse_slot_id(k)?, DeviceUsage::from_str(v)?)))
+        .try_collect()
+}
+
+/// Builds the slot table of length `capacity`: listed slots take their parsed usage, all
+/// others are `Unused`.
+///
+/// Returns `UsageParseError` for an unparsable entry or a slot index `>= capacity`.
+fn construct_slots_vec(
+    slots: &HashMap,
+    capacity: usize,
+) -> Result, DevicePluginError> {
+    let mut out_vec = vec![DeviceUsage::Unused; capacity];
+    for (k, v) in slots.iter() {
+        let index = parse_slot_id(k)?;
+        if index >= capacity {
+            return Err(DevicePluginError::UsageParseError);
+        }
+        out_vec[index] = DeviceUsage::from_str(v)?;
+    }
+    Ok(out_vec)
+}
+
+/// Minimal Instance spec holding only `deviceUsage`, used as the body of apply patches.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+#[serde(rename_all = "camelCase")]
+struct PartialInstanceSlotUsage {
+    device_usage: HashMap,
+}
+
+/// Device plugin backing one Instance: tracks the usage of each of its slots and mirrors
+/// this node's claims to the Instance object.
+struct InstanceDevicePlugin {
+    device: cdi::Device,
+    /// Slot table, indexed by slot id. The mutex serializes local updates; the watch
+    /// channel notifies subscribers of changes.
+    slots_status: Mutex>>,
+    node_name: String,
+    instance_name: String,
+    instance_namespace: String,
+    kube_client: Arc>,
+    stopper: Stopper,
+}
+
+impl InstanceDevicePlugin {
+    /// Creates the plugin with a slot table of `capacity` entries initialised from `slots`.
+    ///
+    /// Fails with `UsageParseError` under the same conditions as `construct_slots_vec`.
+    fn new(
+        node_name: String,
+        plugin_name: String,
+        namespace: String,
+        device: cdi::Device,
+        slots: &HashMap,
+        capacity: usize,
+        client: Arc>,
+    ) -> Result {
+        let (slots_status, _) = watch::channel(construct_slots_vec(slots, capacity)?);
+        Ok(Self {
+            device,
+            slots_status: Mutex::new(slots_status),
+            node_name,
+            instance_name: plugin_name,
+            kube_client: client,
+            stopper: Stopper::new(),
+            instance_namespace: namespace,
+        })
+    }
+
+    /// Overwrites the listed slots with the given usage, notifying watchers only if at
+    /// least one slot actually changed. Slots absent from `slots` are left untouched.
+    ///
+    /// Returns `UsageParseError` for an unparsable entry or a slot index outside the
+    /// current slot table; in that case nothing is modified.
+    async fn update_slots(&self, slots: &HashMap) -> Result<(), DevicePluginError> {
+        let my_slots = self.slots_status.lock().await;
+        let new_slots = construct_slots_map(slots)?;
+        // Validate every index up front: indexing past the table inside the closure below
+        // would panic while the watch value is being mutated.
+        let capacity = my_slots.borrow().len();
+        if new_slots.iter().any(|(k, _)| *k >= capacity) {
+            return Err(DevicePluginError::UsageParseError);
+        }
+        my_slots.send_if_modified(|current| {
+            let mut modified = false;
+            for (k, v) in new_slots.iter() {
+                if current[*k] != *v {
+                    current[*k] = v.to_owned();
+                    modified = true;
+                }
+            }
+            modified
+        });
+        Ok(())
+    }
+
+    /// Claims slot `id`, or the first unused slot when `id` is `None`, for `wanted_state`,
+    /// then applies this node's owned slots to the Instance with field manager `dp-<node>`.
+    ///
+    /// Returns the claimed slot index. Errors:
+    /// - `UnknownDevice` if `id` is outside the slot table,
+    /// - `SlotInUse` if the slot holds a different usage, or the API answers 409,
+    /// - `NoSlot` if no slot is unused,
+    /// - `Other` for any other API error, or if `wanted_state` is `Unused`.
+    ///
+    /// NOTE(review): the local table is updated before the patch is sent and is not
+    /// rolled back when the patch fails — confirm a later `update_slots` reconciles it.
+    async fn claim_slot(
+        &self,
+        id: Option,
+        wanted_state: DeviceUsage,
+    ) -> Result {
+        if wanted_state == DeviceUsage::Unused {
+            return Err(anyhow::anyhow!("Should never happen").into());
+        }
+        let slots_status = self.slots_status.lock().await;
+        let id = match id {
+            // `get` instead of indexing: the id comes from a device ID sent by the kubelet
+            // and must not be able to panic the plugin.
+            Some(id) => match slots_status.borrow().get(id) {
+                Some(DeviceUsage::Unused) => id,
+                // The kubelet asks for the same slot, it knows best
+                Some(d) if *d == wanted_state => id,
+                Some(_) => {
+                    trace!("Trying to claim already used slot");
+                    return Err(DevicePluginError::SlotInUse);
+                }
+                None => {
+                    return Err(DevicePluginError::UnknownDevice(format!(
+                        "{}-{}",
+                        self.instance_name, id
+                    )));
+                }
+            },
+            None => slots_status
+                .borrow()
+                .iter()
+                .position(|v| *v == DeviceUsage::Unused)
+                .ok_or(DevicePluginError::NoSlot)?,
+        };
+        slots_status.send_modify(|slots| {
+            slots[id] = wanted_state;
+        });
+        // Only the slots owned by this node are part of the applied patch.
+        let device_usage = slots_status
+            .borrow()
+            .iter()
+            .enumerate()
+            .filter_map(|(i, v)| match v {
+                v if v.is_owned_by(&self.node_name) => {
+                    Some((format!("{}-{}", self.instance_name, i), v.to_string()))
+                }
+                _ => None,
+            })
+            .collect();
+        let api = self.kube_client.namespaced(&self.instance_namespace);
+        let patch = Patch::Apply(
+            serde_json::to_value(Object {
+                types: Some(TypeMeta {
+                    api_version: "akri.sh/v0".to_owned(),
+                    kind: "Instance".to_owned(),
+                }),
+                status: None::,
+                spec: PartialInstanceSlotUsage { device_usage },
+                metadata: ObjectMeta {
+                    name: Some(self.instance_name.to_owned()),
+                    ..Default::default()
+                },
+            })
+            .unwrap(),
+        );
+        api.raw_patch(
+            &self.instance_name,
+            &patch,
+            &PatchParams::apply(&format!("dp-{}", &self.node_name)),
+        )
+        .await
+        .map_err(|e| match e {
+            kube::Error::Api(ae) => match ae.code {
+                409 => {
+                    trace!("Conflict on apply {:?}", ae);
+                    DevicePluginError::SlotInUse
+                }
+                _ => DevicePluginError::Other(ae.into()),
+            },
+            e => DevicePluginError::Other(e.into()),
+        })?;
+        Ok(id)
+    }
+
+    /// Marks slot `id` as unused and applies this node's remaining owned slots to the
+    /// Instance with field manager `dp-<node>`.
+    ///
+    /// Watchers are notified only when the slot really changed. An out-of-range or already
+    /// unused `id` leaves the table untouched, but the patch is still applied.
+    async fn free_slot(&self, id: usize) -> Result<(), DevicePluginError> {
+        let slots_status = self.slots_status.lock().await;
+        slots_status.send_if_modified(|slots| match slots.get_mut(id) {
+            Some(slot) if *slot != DeviceUsage::Unused => {
+                *slot = DeviceUsage::Unused;
+                true
+            }
+            // Either the slot is already unused, or we are trying to free a slot that
+            // doesn't exist (probably already freed): nothing changed, do not notify.
+            _ => false,
+        });
+        let device_usage = slots_status
+            .borrow()
+            .iter()
+            .enumerate()
+            .filter_map(|(i, v)| match v {
+                v if v.is_owned_by(&self.node_name) => {
+                    Some((format!("{}-{}", self.instance_name, i), v.to_string()))
+                }
+                _ => None,
+            })
+            .collect();
+        let api = self.kube_client.namespaced(&self.instance_namespace);
+        let patch = Patch::Apply(
+            serde_json::to_value(Object {
+                types: Some(TypeMeta {
+                    api_version: "akri.sh/v0".to_owned(),
+                    kind: "Instance".to_owned(),
+                }),
+                status: None::,
+                spec: PartialInstanceSlotUsage { device_usage },
+                metadata: ObjectMeta {
+                    name: Some(self.instance_name.to_owned()),
+                    ..Default::default()
+                },
+            })
+            .unwrap(),
+        );
+        api.raw_patch(
+            &self.instance_name,
+            &patch,
+            &PatchParams::apply(&format!("dp-{}", &self.node_name)),
+        )
+        .await
+        .map_err(|e| match e {
+            kube::Error::Api(ae) => match ae.code {
+                409 => DevicePluginError::SlotInUse,
+                _ => DevicePluginError::Other(ae.into()),
+            },
+            e => DevicePluginError::Other(e.into()),
+        })?;
+        Ok(())
+    }
+}
+
+fn instance_device_usage_to_device(
+    device_name: &str,
+    node_name: &str,
+    devices: Vec,
+) -> Result {
+    let devices = devices
+        .into_iter()
+        .enumerate()
+        .map(|(id, dev)| super::v1beta1::Device {
+            id: format!("{}-{}", device_name, id),
+            health: match dev {
+                DeviceUsage::Unused => "Healthy",
+                DeviceUsage::Configuration { ..
} => "Unhealthy", + DeviceUsage::Node(n) => match n == node_name { + true => "Healthy", + false => "Unhealthy", + }, + } + .to_string(), + topology: None, + }) + .collect(); + trace!("Sending devices to kubelet: {:?}", devices); + Ok(ListAndWatchResponse { devices }) +} + +#[async_trait] +impl InternalDevicePlugin for InstanceDevicePlugin { + type DeviceStore = Vec; + + fn get_name(&self) -> String { + self.instance_name.clone() + } + + fn stop(&self) { + trace!("stopping device plugin"); + self.stopper.stop() + } + + async fn stopped(&self) { + self.stopper.stopped().await; + trace!("plugin {} stopped", self.instance_name); + } + + async fn list_and_watch( + &self, + ) -> Result>, tonic::Status> { + info!( + "list_and_watch - kubelet called list_and_watch for instance {}", + self.instance_name + ); + let device_name = self.instance_name.clone(); + let node_name = self.node_name.clone(); + let receiver = self.slots_status.lock().await.subscribe(); + let receiver_stream = tokio_stream::wrappers::WatchStream::new(receiver); + + Ok(tonic::Response::new(DeviceUsageStream { + f: instance_device_usage_to_device, + st: self.stopper.make_abortable(receiver_stream), + str_1: device_name, + str_2: node_name, + })) + } + + /// Kubelet calls allocate during pod creation. + /// This means kubelet is trying to reserve a usage slot (virtual Device) of the Instance for this node. + /// Returns error if cannot reserve that slot. 
+ async fn allocate( + &self, + requests: Request, + ) -> Result, tonic::Status> { + info!( + "allocate - kubelet called allocate for Instance {}", + self.instance_name + ); + let mut container_responses: Vec = Vec::new(); + let reqs = requests.into_inner().container_requests; + for allocate_request in reqs { + let devices = allocate_request.devices_i_ds; + for device in devices { + let (_, id) = device + .rsplit_once('-') + .ok_or(tonic::Status::unknown("Invalid device id"))?; + let id = id + .parse::() + .or(Err(tonic::Status::unknown("Invalid device id")))?; + self.claim_slot(Some(id), DeviceUsage::Node(self.node_name.to_owned())) + .await + .map_err(|e| { + error!("Unable to claim slot: {:?}", e); + tonic::Status::unknown("Unable to claim slot") + })?; + } + container_responses.push(cdi_device_to_car(&self.instance_name, &self.device)); + } + Ok(tonic::Response::new(AllocateResponse { + container_responses, + })) + } +} + +fn cdi_device_to_car(instance_name: &str, device: &cdi::Device) -> ContainerAllocateResponse { + ContainerAllocateResponse { + envs: device + .container_edits + .env + .iter() + .map(|e| match e.split_once('=') { + Some((k, v)) => ( + format!("{}_{}", k, instance_name.to_uppercase()), + v.to_string(), + ), + None => ( + format!("{}_{}", e, instance_name.to_uppercase()), + "".to_string(), + ), + }) + .collect(), + mounts: device + .container_edits + .mounts + .iter() + .map(|m| super::v1beta1::Mount { + container_path: m.container_path.clone(), + host_path: m.host_path.clone(), + read_only: m.options.contains(&"ro".to_string()), + }) + .collect(), + devices: device + .container_edits + .device_nodes + .iter() + .map(|d| super::v1beta1::DeviceSpec { + container_path: d.path.clone(), + host_path: d.host_path.clone().unwrap_or(d.path.clone()), + permissions: d.permissions.clone().unwrap_or_default(), + }) + .collect(), + annotations: device.annotations.clone(), + } +} + +#[derive(Clone, Debug, PartialEq)] +enum ConfigurationSlot { + 
DeviceFree(String), + DeviceUsed { device: String, slot_id: usize }, +} + +struct ConfigurationDevicePlugin { + instances: RwLock>>, + slots: Arc>>>, + config_name: String, + node_name: String, + stopper: Stopper, +} + +impl ConfigurationDevicePlugin { + fn new(config_name: String, node_name: String) -> Self { + let (slots, _) = watch::channel(Default::default()); + Self { + instances: Default::default(), + slots: Arc::new(RwLock::new(slots)), + config_name, + node_name, + stopper: Stopper::new(), + } + } + async fn add_plugin(&self, name: String, plugin: Arc) { + self.instances + .write() + .await + .insert(name.to_owned(), plugin.clone()); + let node_name = self.node_name.clone(); + let slots_ref = self.slots.clone(); + let config_name = self.config_name.clone(); + let instance_name = plugin.instance_name.clone(); + let mut receiver = plugin.slots_status.lock().await.subscribe(); + tokio::spawn(async move { + let mut signal = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); + loop { + { + let (has_free, used_config_slots) = { + let values = receiver.borrow_and_update(); + let has_free = values.contains(&DeviceUsage::Unused); + let used_config_slots: HashMap = values + .iter() + .enumerate() + .filter_map(|(slot, du)| match du { + DeviceUsage::Configuration { vdev, node } if *node == node_name => { + Some(( + vdev.clone(), + ConfigurationSlot::DeviceUsed { + device: instance_name.clone(), + slot_id: slot, + }, + )) + } + _ => None, + }) + .collect(); + (has_free, used_config_slots) + }; + slots_ref.write().await.send_if_modified(|slots| { + let mut modified = false; + let mut free_slot_available = has_free; + // Check for slots to remove + for (slot, usage) in slots.clone().iter() { + let to_remove = match usage { + ConfigurationSlot::DeviceFree(d) if *d == instance_name => { + if !free_slot_available { + true + } else { + free_slot_available = false; + false + } + } + ConfigurationSlot::DeviceUsed { device, .. 
} + if *device == instance_name => + { + used_config_slots.get(slot) != Some(usage) + } + _ => false, + }; + if to_remove { + modified = true; + slots.remove(slot); + } + } + let cur_length = slots.len(); + slots.extend(used_config_slots); + if free_slot_available { + let mut used_slots_ids = slots + .keys() + .map(|k| { + let (_, id) = k.rsplit_once('-').unwrap(); + id.parse::().unwrap() + }) + .sorted() + .rev() + .collect_vec(); + let mut possible_slot = 0usize; + while used_slots_ids.pop() == Some(possible_slot) { + possible_slot += 1; + } + slots.insert( + format!("{}-{}", config_name, possible_slot), + ConfigurationSlot::DeviceFree(instance_name.clone()), + ); + } + modified || cur_length != slots.len() + }); + } + tokio::select! { + a = receiver.changed() => { + if a.is_err() { + break; + } + }, + _ = signal.recv() => {break} + } + } + slots_ref.write().await.send_modify(|slots| { + // Only keep slots that are unrelated to the current plugin + slots.retain(|_, v| match v { + ConfigurationSlot::DeviceFree(p) if *p == instance_name => false, + ConfigurationSlot::DeviceUsed { device, .. 
} if *device == instance_name => { + false + } + _ => true, + }) + }); + }); + } + async fn remove_plugin(&self, name: &str) -> bool { + let mut instances = self.instances.write().await; + instances.remove(name); + instances.is_empty() + } + + async fn free_slot(&self, id: usize) -> Result<(), DevicePluginError> { + let slot_id = format!("{}-{}", self.config_name, id); + let slot = self.slots.read().await.borrow().get(&slot_id).cloned(); + if let Some(ConfigurationSlot::DeviceUsed { device, slot_id }) = slot { + if let Some(dp) = self.instances.read().await.get(&device) { + dp.free_slot(slot_id).await?; + } else { + error!("Tried to free used slot for gone instance device plugin"); + } + } + Ok(()) + } +} + +#[async_trait] +impl InternalDevicePlugin for ConfigurationDevicePlugin { + type DeviceStore = HashMap; + + fn get_name(&self) -> String { + self.config_name.clone() + } + + async fn stopped(&self) { + self.stopper.stopped().await; + trace!("plugin {} stopped", self.config_name); + } + + fn stop(&self) { + self.stopper.stop() + } + + async fn list_and_watch( + &self, + ) -> Result>, tonic::Status> { + info!( + "list_and_watch - kubelet called list_and_watch for Configuration {}", + self.config_name + ); + let device_name = self.config_name.clone(); + let node_name = self.node_name.clone(); + let receiver = self.slots.read().await.subscribe(); + let receiver_stream = tokio_stream::wrappers::WatchStream::new(receiver); + + Ok(tonic::Response::new(DeviceUsageStream { + f: config_device_usage_to_device, + st: self.stopper.make_abortable(receiver_stream), + str_1: device_name, + str_2: node_name, + })) + } + + /// Kubelet calls allocate during pod creation. + /// This means kubelet is trying to reserve a usage slot (virtual Device) of the Instance for this node. + /// Returns error if cannot reserve that slot. 
+ async fn allocate( + &self, + requests: Request, + ) -> Result, tonic::Status> { + info!( + "allocate - kubelet called allocate for Configuration {}", + self.config_name + ); + let mut container_responses: Vec = Vec::new(); + let reqs = requests.into_inner().container_requests; + for allocate_request in reqs { + let devices = allocate_request.devices_i_ds; + for device in devices { + let dev = self + .slots + .read() + .await + .borrow() + .get(&device) + .ok_or(tonic::Status::unknown("Unable to claim slot"))? + .clone(); + if let ConfigurationSlot::DeviceFree(dev) = dev { + let dp = self + .instances + .read() + .await + .get(&dev) + .ok_or(tonic::Status::unknown("Invalid slot"))? + .clone(); + container_responses.push(cdi_device_to_car(&dp.instance_name, &dp.device)); + dp.claim_slot( + None, + DeviceUsage::Configuration { + vdev: device.clone(), + node: self.node_name.clone(), + }, + ) + .await + .or(Err(tonic::Status::unknown("Unavailable slot")))?; + } else { + return Err(tonic::Status::unknown("Unable to claim slot")); + } + } + } + Ok(tonic::Response::new(AllocateResponse { + container_responses, + })) + } +} + +fn config_device_usage_to_device( + _device_name: &str, + _node_name: &str, + devices: HashMap, +) -> Result { + Ok(ListAndWatchResponse { + devices: devices + .into_keys() + .map(|id| super::v1beta1::Device { + id, + health: "Healthy".to_string(), + topology: None, + }) + .collect(), + }) +} + +/// This module implements a controller for Instance resources that will ensure device plugins are correctly created with the correct health status + +pub struct DevicePluginManager { + instance_plugins: Mutex>>, + configuration_plugins: Mutex>>, + node_name: String, + kube_client: Arc>, + device_manager: Arc, +} + +impl DevicePluginManager { + pub fn new( + node_name: String, + kube_client: Arc>, + device_manager: Arc, + ) -> Self { + Self { + instance_plugins: Mutex::new(HashMap::default()), + configuration_plugins: Mutex::new(HashMap::default()), + 
node_name, + kube_client, + device_manager, + } + } + + pub async fn free_slot(&self, device_id: String) -> Result<(), DevicePluginError> { + let (plugin_name, slot_id) = device_id + .rsplit_once('-') + .ok_or(DevicePluginError::UsageParseError)?; + let slot_id = slot_id + .parse::() + .map_err(|_| DevicePluginError::UsageParseError)?; + { + let plugin = self.instance_plugins.lock().await.get(plugin_name).cloned(); + if let Some(plugin) = plugin { + return plugin.free_slot(slot_id).await; + } + } + { + let plugin = self + .configuration_plugins + .lock() + .await + .get(plugin_name) + .cloned(); + if let Some(plugin) = plugin { + return plugin.free_slot(slot_id).await; + } + } + Err(DevicePluginError::NoSlot) + } + + pub async fn get_used_slots(&self) -> HashSet { + let mut slots: HashSet = Default::default(); + for (instance, plugin) in self.instance_plugins.lock().await.iter() { + slots.extend( + plugin + .slots_status + .lock() + .await + .borrow() + .iter() + .enumerate() + .filter_map(|(i, u)| match u { + DeviceUsage::Node(n) if *n == self.node_name => { + Some(format!("akri.sh/{}-{}", instance, i)) + } + DeviceUsage::Configuration { vdev, node } if *node == self.node_name => { + Some(vdev.to_string()) + } + _ => None, + }), + ); + } + slots + } +} + +pub fn start_dpm(dpm: Arc) -> (Store, JoinHandle<()>) { + let api = dpm.kube_client.all().as_inner(); + let controller = Controller::new(api, Default::default()); + let store = controller.store(); + let task = tokio::spawn(async { + controller + .graceful_shutdown_on(async { + let mut signal = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) + .unwrap(); + signal.recv().await; + }) + .run(reconcile, error_policy, dpm) + .for_each(|_| futures::future::ready(())) + .await + }); + (store, task) +} + +pub async fn reconcile( + instance: Arc, + ctx: Arc, +) -> Result { + trace!("Plugin Manager: Reconciling {}", instance.name_any()); + let api = 
ctx.kube_client.namespaced(&instance.namespace().unwrap()); + if !instance.spec.nodes.contains(&ctx.node_name) + || instance.metadata.deletion_timestamp.is_some() + { + let mut cps = ctx.configuration_plugins.lock().await; + if let Some(cp) = cps.get(&instance.spec.configuration_name) { + if cp.remove_plugin(&instance.name_any()).await { + cp.stop(); + cps.remove(&instance.spec.configuration_name); + } + } + if let Some(plugin) = ctx + .instance_plugins + .lock() + .await + .remove(&instance.name_any()) + { + plugin.stop(); + } + api.remove_finalizer(&instance, &ctx.node_name) + .await + .map_err(|e| DevicePluginError::Other(e.into()))?; + } else { + let device = ctx.device_manager.get(&instance.spec.cdi_name).ok_or( + DevicePluginError::UnknownDevice(instance.spec.cdi_name.to_owned()), + )?; + api.add_finalizer(&instance, &ctx.node_name) + .await + .map_err(|e| DevicePluginError::Other(e.into()))?; + + let instance_plugin = { + let mut instance_plugins = ctx.instance_plugins.lock().await; + match instance_plugins.get(&instance.name_any()) { + None => { + let plugin = Arc::new(InstanceDevicePlugin::new( + ctx.node_name.to_owned(), + instance.name_any(), + instance.namespace().unwrap_or("default".to_string()), + device, + &instance.spec.device_usage, + instance.spec.capacity, + ctx.kube_client.clone(), + )?); + serve_and_register_plugin(plugin.clone()).await?; + instance_plugins.insert(instance.name_any(), plugin.clone()); + plugin + } + Some(plugin) => { + plugin.update_slots(&instance.spec.device_usage).await?; + plugin.clone() + } + } + }; + let configuration_plugin = { + let mut configuration_plugins = ctx.configuration_plugins.lock().await; + match configuration_plugins.get(&instance.spec.configuration_name) { + None => { + let plugin = Arc::new(ConfigurationDevicePlugin::new( + instance.spec.configuration_name.to_owned(), + ctx.node_name.to_owned(), + )); + serve_and_register_plugin(plugin.clone()).await?; + configuration_plugins + 
.insert(instance.spec.configuration_name.to_owned(), plugin.clone()); + plugin + } + Some(plugin) => plugin.clone(), + } + }; + configuration_plugin + .add_plugin(instance.name_any(), instance_plugin) + .await; + } + Ok(Action::requeue(Duration::from_secs(300))) +} + +pub fn error_policy( + dc: Arc, + error: &DevicePluginError, + _ctx: Arc, +) -> Action { + error!( + "Error during reconciliation of Instance {}: {:?}", + dc.name_any(), + error + ); + Action::requeue(Duration::from_secs(60)) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use akri_shared::{ + akri::instance::InstanceSpec, + k8s::crud::{MockApi, MockIntoApi}, + }; + use tokio_stream::StreamExt; + + use crate::plugin_manager::v1beta1::ContainerAllocateRequest; + + use self::cdi::{ContainerEdit, Device}; + + use super::*; + + #[test] + fn test_device_usage() -> Result<(), DevicePluginError> { + assert_eq!( + DeviceUsage::from_str("node-a")?, + DeviceUsage::Node("node-a".to_string()) + ); + assert_eq!( + DeviceUsage::from_str("C:vdev1:node-a")?, + DeviceUsage::Configuration { + vdev: "vdev1".to_string(), + node: "node-a".to_string() + }, + ); + assert_eq!(DeviceUsage::from_str("")?, DeviceUsage::Unused,); + assert!(DeviceUsage::from_str("C:node-a").is_err()); + + Ok(()) + } + + #[test] + fn test_parse_slot_id() -> Result<(), DevicePluginError> { + assert_eq!(parse_slot_id("slot-1")?, 1); + assert_eq!(parse_slot_id("my-other-slot-2")?, 2); + assert!(parse_slot_id("not-a-slot-id").is_err()); + Ok(()) + } + + #[test] + fn test_construct_slots_map() -> Result<(), DevicePluginError> { + let slots = HashMap::from([ + ("slot-1".to_owned(), "node-a".to_owned()), + ("slot-3".to_owned(), "C:vdev1:node-a".to_owned()), + ]); + assert_eq!( + construct_slots_map(&slots)?, + HashMap::from([ + (1, DeviceUsage::Node("node-a".to_owned())), + ( + 3, + DeviceUsage::Configuration { + vdev: "vdev1".to_owned(), + node: "node-a".to_owned() + } + ) + ]) + ); + Ok(()) + } + + #[test] + fn test_construct_slots_vec() -> 
Result<(), DevicePluginError> { + let slots = HashMap::from([ + ("slot-1".to_owned(), "node-a".to_owned()), + ("slot-3".to_owned(), "C:vdev1:node-a".to_owned()), + ]); + assert_eq!( + construct_slots_vec(&slots, 4)?, + vec![ + DeviceUsage::Unused, + DeviceUsage::Node("node-a".to_string()), + DeviceUsage::Unused, + DeviceUsage::Configuration { + vdev: "vdev1".to_owned(), + node: "node-a".to_owned() + } + ] + ); + assert!(construct_slots_vec(&slots, 1).is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_instance_plugin_update_slots() { + let plugin = InstanceDevicePlugin::new( + "node-a".to_owned(), + "my-device".to_owned(), + "namespace-a".to_owned(), + Device { + name: "my-device".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + ..Default::default() + }, + }, + &HashMap::new(), + 3, + Arc::new(MockIntoApi::new()), + ) + .unwrap(); + + assert!(plugin + .update_slots(&HashMap::from([("slot-1".to_owned(), "node-a".to_owned())])) + .await + .is_ok(),); + + assert_eq!( + plugin.slots_status.lock().await.borrow()[1], + DeviceUsage::Node("node-a".to_owned()) + ); + } + + #[tokio::test] + async fn test_free_slot() { + let dm = crate::device_manager::MockDeviceManager::new(); + let mut kube_client = MockIntoApi::new(); + kube_client.expect_namespaced().returning(|_| { + let mut api = MockApi::new(); + api.expect_raw_patch() + .with( + mockall::predicate::eq("instance-a"), + mockall::predicate::function(|a: &Patch| match a { + Patch::Apply(v) => { + let su: Object = + serde_json::from_value(v.clone()).unwrap(); + error!("{:?}", su.spec.device_usage); + su.spec.device_usage.is_empty() + } + _ => false, + }), + mockall::predicate::always(), + ) + .returning(|_, _, _| { + Ok(Instance { + metadata: Default::default(), + spec: InstanceSpec { + configuration_name: "config-a".to_owned(), + cdi_name: Default::default(), + capacity: 1, + broker_properties: Default::default(), + shared: false, + nodes: Default::default(), + device_usage: 
Default::default(), + }, + }) + }); + Box::new(api) + }); + let kube_client = Arc::new(kube_client); + let dpm = DevicePluginManager::new("node-a".to_owned(), kube_client.clone(), Arc::new(dm)); + + let stopper = Stopper::new(); + + let (s, _) = watch::channel(vec![ + DeviceUsage::Configuration { + vdev: "config-a-1".to_owned(), + node: "node-a".to_owned(), + }, + DeviceUsage::Node("node-b".to_owned()), + ]); + + let instance_plugin = Arc::new(InstanceDevicePlugin { + device: Device { + name: "my-device".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + ..Default::default() + }, + }, + slots_status: Mutex::new(s), + node_name: "node-a".to_owned(), + instance_name: "instance-a".to_owned(), + instance_namespace: "namespace-a".to_owned(), + kube_client, + stopper: stopper.clone(), + }); + dpm.instance_plugins + .lock() + .await + .insert("instance-a".to_owned(), instance_plugin.clone()); + + let (s, _) = watch::channel(HashMap::from([( + "config-a-1".to_owned(), + ConfigurationSlot::DeviceUsed { + device: "instance-a".to_owned(), + slot_id: 0, + }, + )])); + + dpm.configuration_plugins.lock().await.insert( + "config-a".to_owned(), + Arc::new(ConfigurationDevicePlugin { + instances: RwLock::new(HashMap::from([("instance-a".to_owned(), instance_plugin)])), + slots: Arc::new(RwLock::new(s)), + config_name: "config-a".to_owned(), + node_name: "node-a".to_string(), + stopper, + }), + ); + + assert!(dpm.free_slot("config-b-2".to_owned()).await.is_err()); + assert!(dpm.free_slot("config-a-1".to_owned()).await.is_ok()); + } + + #[tokio::test] + async fn test_get_used_slots() { + let dm = crate::device_manager::MockDeviceManager::new(); + let kube_client = Arc::new(MockIntoApi::new()); + let stopper = Stopper::new(); + let dpm = DevicePluginManager::new("node-a".to_owned(), kube_client.clone(), Arc::new(dm)); + + assert!(dpm.get_used_slots().await.is_empty()); + + let (s, _) = watch::channel(vec![ + DeviceUsage::Configuration { + vdev: 
"akri.sh/config-a-1".to_owned(), + node: "node-a".to_owned(), + }, + DeviceUsage::Node("node-a".to_owned()), + DeviceUsage::Node("node-b".to_owned()), + DeviceUsage::Unused, + ]); + let instance_plugin = Arc::new(InstanceDevicePlugin { + device: Device { + name: "my-device".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + ..Default::default() + }, + }, + slots_status: Mutex::new(s), + node_name: "node-a".to_owned(), + instance_name: "instance-a".to_owned(), + instance_namespace: "namespace-a".to_owned(), + kube_client, + stopper: stopper.clone(), + }); + dpm.instance_plugins + .lock() + .await + .insert("instance-a".to_owned(), instance_plugin); + assert_eq!( + dpm.get_used_slots().await, + HashSet::from([ + "akri.sh/config-a-1".to_owned(), + "akri.sh/instance-a-1".to_owned() + ]) + ); + } + + #[tokio::test] + async fn test_config_plugin_add_remove_plugin() { + let kube_client = Arc::new(MockIntoApi::new()); + let stopper = Stopper::new(); + let (s, mut r) = watch::channel(vec![ + DeviceUsage::Configuration { + vdev: "akri.sh/config-a-1".to_owned(), + node: "node-a".to_owned(), + }, + DeviceUsage::Node("node-a".to_owned()), + DeviceUsage::Node("node-b".to_owned()), + DeviceUsage::Unused, + ]); + let instance_plugin = Arc::new(InstanceDevicePlugin { + device: Device { + name: "my-device".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + ..Default::default() + }, + }, + slots_status: Mutex::new(s), + node_name: "node-a".to_owned(), + instance_name: "instance-a".to_owned(), + instance_namespace: "namespace-a".to_owned(), + kube_client, + stopper: stopper.clone(), + }); + + let config_plugin = + ConfigurationDevicePlugin::new("config-a".to_owned(), "node-a".to_owned()); + config_plugin + .add_plugin("instance-a".to_owned(), instance_plugin.clone()) + .await; + + assert_eq!(config_plugin.instances.read().await.len(), 1); + + tokio::time::sleep(Duration::from_millis(500)).await; + + assert_eq!( + 
config_plugin.slots.read().await.borrow().clone(), + HashMap::from([ + ( + "akri.sh/config-a-1".to_owned(), + ConfigurationSlot::DeviceUsed { + device: "instance-a".to_owned(), + slot_id: 0 + } + ), + ( + "config-a-0".to_owned(), + ConfigurationSlot::DeviceFree("instance-a".to_owned()) + ) + ]) + ); + + instance_plugin + .slots_status + .lock() + .await + .send_modify(|slots| slots[3] = DeviceUsage::Node("node-a".to_string())); + drop(instance_plugin); + + tokio::time::sleep(Duration::from_millis(500)).await; + r.borrow_and_update(); + + assert_eq!( + config_plugin.slots.read().await.borrow().clone(), + HashMap::from([( + "akri.sh/config-a-1".to_owned(), + ConfigurationSlot::DeviceUsed { + device: "instance-a".to_owned(), + slot_id: 0 + } + ),]) + ); + config_plugin.remove_plugin("instance-a").await; + tokio::time::sleep(Duration::from_millis(500)).await; + + assert!(config_plugin.instances.read().await.is_empty()); + assert!( + tokio::time::timeout(Duration::from_millis(500), r.changed()) + .await + .unwrap() + .is_err() + ); + assert!(config_plugin.slots.read().await.borrow().is_empty()); + } + + #[tokio::test] + async fn test_config_plugin_allocate() { + let mut kube_client = MockIntoApi::new(); + kube_client.expect_namespaced().returning(|_| { + let mut api = MockApi::new(); + api.expect_raw_patch().returning(|_, _, _| { + Ok(Instance { + metadata: Default::default(), + spec: InstanceSpec { + configuration_name: "config-a".to_owned(), + cdi_name: Default::default(), + capacity: 1, + broker_properties: Default::default(), + shared: false, + nodes: Default::default(), + device_usage: Default::default(), + }, + }) + }); + Box::new(api) + }); + let kube_client = Arc::new(kube_client); + let stopper = Stopper::new(); + let (s, _) = watch::channel(vec![ + DeviceUsage::Configuration { + vdev: "akri.sh/config-a-1".to_owned(), + node: "node-a".to_owned(), + }, + DeviceUsage::Node("node-a".to_owned()), + DeviceUsage::Node("node-b".to_owned()), + DeviceUsage::Unused, + 
]); + let instance_plugin = Arc::new(InstanceDevicePlugin { + device: Device { + name: "my-device".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + ..Default::default() + }, + }, + slots_status: Mutex::new(s), + node_name: "node-a".to_owned(), + instance_name: "instance-a".to_owned(), + instance_namespace: "namespace-a".to_owned(), + kube_client, + stopper: stopper.clone(), + }); + + let config_plugin = + ConfigurationDevicePlugin::new("config-a".to_owned(), "node-a".to_owned()); + config_plugin + .add_plugin("instance-a".to_owned(), instance_plugin) + .await; + + tokio::time::sleep(Duration::from_millis(500)).await; + + assert_eq!( + config_plugin + .allocate(Request::new(AllocateRequest { + container_requests: vec![ContainerAllocateRequest { + devices_i_ds: vec!["config-a-0".to_owned()], + }], + })) + .await + .unwrap() + .into_inner() + .container_responses + .len(), + 1 + ); + } + + #[tokio::test] + async fn test_instance_plugin_allocate() { + let mut kube_client = MockIntoApi::new(); + kube_client.expect_namespaced().returning(|_| { + let mut api = MockApi::new(); + api.expect_raw_patch() + .with( + mockall::predicate::eq("instance-a"), + mockall::predicate::function(|a: &Patch| match a { + Patch::Apply(v) => { + let su: Object = + serde_json::from_value(v.clone()).unwrap(); + error!("{:?}", su.spec.device_usage); + su.spec.device_usage + == HashMap::from([ + ( + "instance-a-0".to_string(), + "C:akri.sh/config-a-1:node-a".to_owned(), + ), + ("instance-a-1".to_owned(), "node-a".to_owned()), + ("instance-a-3".to_owned(), "node-a".to_owned()), + ]) + } + _ => false, + }), + mockall::predicate::always(), + ) + .returning(|_, _, _| { + Ok(Instance { + metadata: Default::default(), + spec: InstanceSpec { + configuration_name: "config-a".to_owned(), + cdi_name: Default::default(), + capacity: 1, + broker_properties: Default::default(), + shared: false, + nodes: Default::default(), + device_usage: Default::default(), + }, + }) + }); 
+ Box::new(api) + }); + let kube_client = Arc::new(kube_client); + let stopper = Stopper::new(); + let (s, _) = watch::channel(vec![ + DeviceUsage::Configuration { + vdev: "akri.sh/config-a-1".to_owned(), + node: "node-a".to_owned(), + }, + DeviceUsage::Node("node-a".to_owned()), + DeviceUsage::Node("node-b".to_owned()), + DeviceUsage::Unused, + ]); + let instance_plugin = Arc::new(InstanceDevicePlugin { + device: Device { + name: "my-device".to_owned(), + annotations: Default::default(), + container_edits: ContainerEdit { + ..Default::default() + }, + }, + slots_status: Mutex::new(s), + node_name: "node-a".to_owned(), + instance_name: "instance-a".to_owned(), + instance_namespace: "namespace-a".to_owned(), + kube_client, + stopper: stopper.clone(), + }); + + assert!(instance_plugin + .allocate(Request::new(AllocateRequest { + container_requests: vec![ContainerAllocateRequest { + devices_i_ds: vec!["instance-a-0".to_owned()], + }] + })) + .await + .is_err()); + assert!(instance_plugin + .allocate(Request::new(AllocateRequest { + container_requests: vec![ContainerAllocateRequest { + devices_i_ds: vec!["instance-a-3".to_owned()], + }] + })) + .await + .is_ok()); + } + + #[tokio::test] + async fn test_list_and_watch() { + let kube_client = Arc::new(MockIntoApi::new()); + let instance_plugin = Arc::new( + InstanceDevicePlugin::new( + "node-a".to_owned(), + "instance-a".to_owned(), + "namespace-a".to_owned(), + Device { + name: "my-device".to_string(), + annotations: Default::default(), + container_edits: Default::default(), + }, + &HashMap::from([("instance-a-1".to_owned(), "node-b".to_owned())]), + 3, + kube_client, + ) + .unwrap(), + ); + let config_plugin = + ConfigurationDevicePlugin::new("config-a".to_owned(), "node-a".to_owned()); + config_plugin + .add_plugin("instance-a".to_owned(), instance_plugin.clone()) + .await; + + let mut instance_stream = instance_plugin.list_and_watch().await.unwrap().into_inner(); + let mut config_stream = 
config_plugin.list_and_watch().await.unwrap().into_inner(); + + assert_eq!( + instance_stream.next().await.unwrap().unwrap(), + ListAndWatchResponse { + devices: vec![ + crate::plugin_manager::v1beta1::Device { + id: "instance-a-0".to_owned(), + health: "Healthy".to_owned(), + topology: None, + }, + crate::plugin_manager::v1beta1::Device { + id: "instance-a-1".to_owned(), + health: "Unhealthy".to_owned(), + topology: None, + }, + crate::plugin_manager::v1beta1::Device { + id: "instance-a-2".to_owned(), + health: "Healthy".to_owned(), + topology: None, + }, + ] + } + ); + // First message is sent before adding plugin + assert_eq!( + config_stream.next().await.unwrap().unwrap(), + ListAndWatchResponse { devices: vec![] } + ); + assert_eq!( + config_stream.next().await.unwrap().unwrap(), + ListAndWatchResponse { + devices: vec![crate::plugin_manager::v1beta1::Device { + id: "config-a-0".to_owned(), + health: "Healthy".to_owned(), + topology: None, + }] + } + ); + + instance_plugin + .update_slots(&HashMap::from([ + ("instance-a-0".to_owned(), "C:config-a-0:node-a".to_owned()), + ("instance-a-1".to_owned(), "node-b".to_owned()), + ("instance-a-2".to_owned(), "node-a".to_owned()), + ])) + .await + .unwrap(); + + assert_eq!( + instance_stream.next().await.unwrap().unwrap(), + ListAndWatchResponse { + devices: vec![ + crate::plugin_manager::v1beta1::Device { + id: "instance-a-0".to_owned(), + health: "Unhealthy".to_owned(), + topology: None, + }, + crate::plugin_manager::v1beta1::Device { + id: "instance-a-1".to_owned(), + health: "Unhealthy".to_owned(), + topology: None, + }, + crate::plugin_manager::v1beta1::Device { + id: "instance-a-2".to_owned(), + health: "Healthy".to_owned(), + topology: None, + }, + ] + } + ); + + assert_eq!( + config_stream.next().await.unwrap().unwrap(), + ListAndWatchResponse { + devices: vec![crate::plugin_manager::v1beta1::Device { + id: "config-a-0".to_owned(), + health: "Healthy".to_owned(), + topology: None, + }] + } + ); + } +} diff --git 
a/agent/src/plugin_manager/device_plugin_runner.rs b/agent/src/plugin_manager/device_plugin_runner.rs new file mode 100644 index 000000000..6c090a3f3 --- /dev/null +++ b/agent/src/plugin_manager/device_plugin_runner.rs @@ -0,0 +1,248 @@ +use std::{convert::TryFrom, path::Path, sync::Arc, time::SystemTime}; + +use akri_shared::uds::unix_stream; +use async_trait::async_trait; +use futures::{StreamExt, TryFutureExt}; +use thiserror::Error; +use tokio::net::{UnixListener, UnixStream}; +use tokio_stream::wrappers::WatchStream; +use tonic::{ + transport::{Endpoint, Server, Uri}, + Request, +}; +use tower::service_fn; + +/// Current version of the API supported by kubelet. +pub const K8S_DEVICE_PLUGIN_VERSION: &str = "v1beta1"; + +/// DevicePluginPath is the folder the kubelet expects to find Device-Plugin sockets. +pub const DEVICE_PLUGIN_PATH: &str = "/var/lib/kubelet/device-plugins"; + +/// Path of the Kubelet registry socket +pub const KUBELET_SOCKET: &str = "/var/lib/kubelet/device-plugins/kubelet.sock"; + +use super::v1beta1::{ + device_plugin_server::{DevicePlugin, DevicePluginServer}, + registration_client, AllocateRequest, AllocateResponse, DevicePluginOptions, Empty, + ListAndWatchResponse, RegisterRequest, +}; + +#[async_trait] +pub(super) trait InternalDevicePlugin: Sync + Send { + type DeviceStore: Clone + Send + Sync + 'static; + async fn list_and_watch( + &self, + ) -> Result>, tonic::Status>; + async fn allocate( + &self, + requests: Request, + ) -> Result, tonic::Status>; + + fn get_name(&self) -> String; + + async fn stopped(&self); + fn stop(&self); +} + +pub(super) struct DeviceUsageStream { + pub f: fn(&str, &str, T) -> Result, + pub st: futures::stream::Abortable>, + pub str_1: String, + pub str_2: String, +} + +impl futures::Stream for DeviceUsageStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + match self.st.poll_next_unpin(cx) { + 
std::task::Poll::Ready(Some(i)) => { + std::task::Poll::Ready(Some((self.f)(&self.str_1, &self.str_2, i))) + } + std::task::Poll::Ready(None) => { + trace!("Stream Stopped"); + std::task::Poll::Ready(None) + } + std::task::Poll::Pending => std::task::Poll::Pending, + } + } +} + +struct DevicePluginImpl { + inner: Arc>, +} + +#[async_trait] +impl DevicePlugin for DevicePluginImpl { + async fn get_device_plugin_options( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> { + Ok(tonic::Response::new(DevicePluginOptions { + pre_start_required: false, + get_preferred_allocation_available: false, + })) + } + + type ListAndWatchStream = DeviceUsageStream; + + async fn list_and_watch( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner.list_and_watch().await + } + + async fn allocate( + &self, + requests: Request, + ) -> Result, tonic::Status> { + trace!("kubelet called allocate {:?}", requests); + self.inner.allocate(requests).await + } + + async fn pre_start_container( + &self, + _request: Request, + ) -> Result, tonic::Status> { + error!("pre_start_container - kubelet called pre_start_container !",); + Ok(tonic::Response::new( + super::v1beta1::PreStartContainerResponse {}, + )) + } + + async fn get_preferred_allocation( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> { + error!("get_preferred_allocation - kubelet called get_prefered_allocation",); + Err(tonic::Status::unimplemented( + "Get preferred allocation is not implemented for this plugin", + )) + } +} + +#[derive(Error, Debug)] +pub enum RunnerError { + #[error("Unable to get current time")] + TimeError, + + #[error("Unable to register plugin to kubelet")] + RegistrationError, +} + +pub(super) async fn serve_and_register_plugin( + plugin: Arc>, +) -> Result<(), RunnerError> { + let device_plugin_name = plugin.get_name(); + let plugin_impl = DevicePluginImpl { + inner: plugin.clone(), + }; + + let unique_time = SystemTime::now() + 
.duration_since(SystemTime::UNIX_EPOCH) + .map_err(|_| RunnerError::TimeError)?; + let device_endpoint: String = format!("{}-{}.sock", device_plugin_name, unique_time.as_secs()); + let socket_path: String = Path::new(DEVICE_PLUGIN_PATH) + .join(device_endpoint.clone()) + .to_str() + .unwrap() + .to_string(); + + info!( + "serve - creating a device plugin server that will listen at: {}", + socket_path + ); + tokio::fs::create_dir_all(Path::new(&socket_path[..]).parent().unwrap()) + .await + .expect("Failed to create dir at socket path"); + let service = DevicePluginServer::new(plugin_impl); + let task_socket_path = socket_path.clone(); + let task_plugin = plugin.clone(); + tokio::task::spawn(async move { + let socket_to_delete = task_socket_path.clone(); + let incoming = { + let uds = UnixListener::bind(task_socket_path).expect("Failed to bind to socket path"); + + async_stream::stream! { + loop { + let item = uds.accept().map_ok(|(st, _)| unix_stream::UnixStream(st)).await; + yield item; + } + } + }; + Server::builder() + .add_service(service) + .serve_with_incoming_shutdown(incoming, task_plugin.stopped()) + .await + .unwrap(); + trace!( + "serve - gracefully shutdown ... 
deleting socket {}", + socket_to_delete + ); + // Socket may already be deleted in the case of the kubelet restart + std::fs::remove_file(socket_to_delete).unwrap_or(()); + }); + + if let Err(e) = register_plugin(device_plugin_name, device_endpoint, socket_path).await { + plugin.stop(); + return Err(e); + } + Ok(()) +} + +async fn register_plugin( + device_plugin_name: String, + device_endpoint: String, + socket_path: String, +) -> Result<(), RunnerError> { + let capability_id: String = format!("akri.sh/{}", device_plugin_name); + + akri_shared::uds::unix_stream::try_connect(&socket_path) + .await + .map_err(|_| RunnerError::RegistrationError)?; + + info!( + "register - entered for Instance {} and socket_name: {}", + capability_id, device_endpoint + ); + let op = DevicePluginOptions { + pre_start_required: false, + get_preferred_allocation_available: false, + }; + + // We will ignore this dummy uri because UDS does not use it. + // Some servers will check the uri content so the uri needs to + // be in valid format even it's not used, the scheme part is used + // to specific what scheme to use, such as http or https + let kubelet_socket_closure = KUBELET_SOCKET.to_string(); + let channel = Endpoint::try_from("http://[::1]:50051") + .unwrap() + .connect_with_connector(service_fn(move |_: Uri| { + UnixStream::connect(kubelet_socket_closure.clone()) + })) + .await + .map_err(|_| RunnerError::RegistrationError)?; + let mut registration_client = registration_client::RegistrationClient::new(channel); + + let register_request = tonic::Request::new(RegisterRequest { + version: K8S_DEVICE_PLUGIN_VERSION.into(), + endpoint: device_endpoint.to_string(), + resource_name: capability_id.to_string(), + options: Some(op), + }); + trace!( + "register - before call to register with the kubelet at socket {}", + KUBELET_SOCKET + ); + + // If fail to register with the kubelet, terminate device plugin + registration_client + .register(register_request) + .await + .map_err(|_| 
RunnerError::RegistrationError)?;
+    Ok(())
+}
diff --git a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs
new file mode 100644
index 000000000..e5f491817
--- /dev/null
+++ b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs
@@ -0,0 +1,101 @@
+use std::{
+    collections::{HashMap, HashSet},
+    convert::TryFrom,
+    sync::Arc,
+    time::{Duration, Instant},
+};
+
+use tokio::net::UnixStream;
+use tonic::transport::{Endpoint, Uri};
+use tower::service_fn;
+
+use crate::plugin_manager::v1::ListPodResourcesRequest;
+
+use super::{
+    device_plugin_instance_controller::DevicePluginManager,
+    v1::pod_resources_lister_client as podresources,
+};
+
+/// Path of the Kubelet pod resources socket
+pub const KUBELET_SOCKET: &str = "/var/lib/kubelet/pod-resources/kubelet.sock";
+
+/// Ask the kubelet pod resources API for the device ids currently assigned to
+/// containers, keeping only those whose resource name starts with `akri.sh/`.
+async fn get_used_slots() -> Result<HashSet<String>, anyhow::Error> {
+    // This dummy uri is ignored because UDS does not use it.
+    // Some servers check the uri content, so it must be in a valid
+    // format even if it's not used; the scheme part specifies which
+    // scheme to use, such as http or https.
+    let kubelet_socket_closure = KUBELET_SOCKET.to_string();
+    let channel = Endpoint::try_from("http://[::1]:50051")
+        .unwrap()
+        .connect_with_connector(service_fn(move |_: Uri| {
+            UnixStream::connect(kubelet_socket_closure.clone())
+        }))
+        .await?;
+    let mut podresources_client = podresources::PodResourcesListerClient::new(channel);
+
+    let list_request = tonic::Request::new(ListPodResourcesRequest {});
+    trace!(
+        "get_used_slots - before call to list pod resources from the kubelet at socket {}",
+        KUBELET_SOCKET
+    );
+
+    // Get the list of allocated device ids from kubelet
+    let resources = podresources_client
+        .list(list_request)
+        .await?
+        .into_inner()
+        .pod_resources
+        .into_iter()
+        .flat_map(|pod| pod.containers)
+        .flat_map(|container| container.devices)
+        .filter(|devices| devices.resource_name.starts_with("akri.sh/"))
+        .flat_map(|devices| devices.device_ids)
+        .collect();
+
+    Ok(resources)
+}
+
+/// Loop that frees slots `dp_manager` reports as used but the kubelet no longer lists.
+/// A slot is freed once it has been missing from the kubelet's list for at least 20 seconds;
+/// the loop sleeps 10 seconds between passes and returns when SIGTERM is received.
+pub async fn start_reclaimer(dp_manager: Arc<DevicePluginManager>) {
+    let mut stalled_slots: HashMap<String, Instant> = HashMap::new();
+    let mut signal =
+        tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap();
+    loop {
+        trace!("reclaiming unused slots - start");
+        match get_used_slots().await {
+            Ok(used_slots) => {
+                trace!("used slots: {:?}", used_slots);
+                let theoretical_slots = dp_manager.get_used_slots().await;
+                trace!("theoretical slots: {:?}", theoretical_slots);
+                let mut new_stalled_slots: HashMap<String, Instant> = HashMap::new();
+                let now = Instant::now();
+                for slot in theoretical_slots.difference(&used_slots) {
+                    if let Some(at) = stalled_slots.get(slot) {
+                        if now.saturating_duration_since(*at) >= Duration::from_secs(20) {
+                            trace!("freeing slot: {}", slot);
+                            // Freeing failed: keep the first-seen time so the next pass retries
+                            if dp_manager.free_slot(slot.to_string()).await.is_err() {
+                                new_stalled_slots.insert(slot.to_string(), at.to_owned());
+                            };
+                        } else {
+                            new_stalled_slots.insert(slot.to_string(), at.to_owned());
+                        }
+                    } else {
+                        // First pass where the slot is missing: start timing it from now
+                        new_stalled_slots.insert(slot.to_string(), now);
+                    }
+                }
+                stalled_slots = new_stalled_slots;
+            }
+            Err(e) => error!("reclaiming unused slots - unable to list pod resources: {}", e),
+        }
+        tokio::select!
{ + _ = tokio::time::sleep(Duration::from_secs(10)) => {}, + _ = signal.recv() => return, + }; + } +} diff --git a/agent/src/plugin_manager/mod.rs b/agent/src/plugin_manager/mod.rs new file mode 100644 index 000000000..5300520f1 --- /dev/null +++ b/agent/src/plugin_manager/mod.rs @@ -0,0 +1,7 @@ +pub mod pluginregistration; // Prost generated pluginregistration module +pub mod v1; // Prost generated podresources module +pub mod v1beta1; // Prost generated pluginapi module + +pub mod device_plugin_instance_controller; +mod device_plugin_runner; +pub mod device_plugin_slot_reclaimer; diff --git a/agent/src/plugin_manager/pluginregistration.rs b/agent/src/plugin_manager/pluginregistration.rs new file mode 100644 index 000000000..8fdd546e4 --- /dev/null +++ b/agent/src/plugin_manager/pluginregistration.rs @@ -0,0 +1,260 @@ +/// PluginInfo is the message sent from a plugin to the Kubelet pluginwatcher for plugin registration +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PluginInfo { + /// Type of the Plugin. CSIPlugin or DevicePlugin + #[prost(string, tag = "1")] + pub r#type: ::prost::alloc::string::String, + /// Plugin name that uniquely identifies the plugin for the given plugin type. + /// For DevicePlugin, this is the resource name that the plugin manages and + /// should follow the extended resource name convention. + /// For CSI, this is the CSI driver registrar name. + #[prost(string, tag = "2")] + pub name: ::prost::alloc::string::String, + /// Optional endpoint location. If found set by Kubelet component, + /// Kubelet component will use this endpoint for specific requests. + /// This allows the plugin to register using one endpoint and possibly use + /// a different socket for control operations. CSI uses this model to delegate + /// its registration external from the plugin. 
+ #[prost(string, tag = "3")] + pub endpoint: ::prost::alloc::string::String, + /// Plugin service API versions the plugin supports. + /// For DevicePlugin, this maps to the deviceplugin API versions the + /// plugin supports at the given socket. + /// The Kubelet component communicating with the plugin should be able + /// to choose any preferred version from this list, or returns an error + /// if none of the listed versions is supported. + #[prost(string, repeated, tag = "4")] + pub supported_versions: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, +} +/// RegistrationStatus is the message sent from Kubelet pluginwatcher to the plugin for notification on registration status +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RegistrationStatus { + /// True if plugin gets registered successfully at Kubelet + #[prost(bool, tag = "1")] + pub plugin_registered: bool, + /// Error message in case plugin fails to register, empty string otherwise + #[prost(string, tag = "2")] + pub error: ::prost::alloc::string::String, +} +/// RegistrationStatusResponse is sent by plugin to kubelet in response to RegistrationStatus RPC +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RegistrationStatusResponse {} +/// InfoRequest is the empty request message from Kubelet +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct InfoRequest {} +/// Generated server implementations. +pub mod registration_server { + #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with RegistrationServer. 
+ #[async_trait] + pub trait Registration: Send + Sync + 'static { + async fn get_info( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + async fn notify_registration_status( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + } + /// Registration is the service advertised by the Plugins. + #[derive(Debug)] + pub struct RegistrationServer { + inner: _Inner, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + struct _Inner(Arc); + impl RegistrationServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + let inner = _Inner(inner); + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor(inner: T, interceptor: F) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. 
+ /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for RegistrationServer + where + T: Registration, + B: Body + Send + 'static, + B::Error: Into + Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + let inner = self.inner.clone(); + match req.uri().path() { + "/pluginregistration.Registration/GetInfo" => { + #[allow(non_camel_case_types)] + struct GetInfoSvc(pub Arc); + impl tonic::server::UnaryService for GetInfoSvc { + type Response = super::PluginInfo; + type Future = BoxFuture, tonic::Status>; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = + async move { ::get_info(&inner, request).await }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let inner = inner.0; + let method = GetInfoSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/pluginregistration.Registration/NotifyRegistrationStatus" => { + #[allow(non_camel_case_types)] + struct NotifyRegistrationStatusSvc(pub Arc); + impl 
tonic::server::UnaryService + for NotifyRegistrationStatusSvc + { + type Response = super::RegistrationStatusResponse; + type Future = BoxFuture, tonic::Status>; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::notify_registration_status(&inner, request) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let inner = inner.0; + let method = NotifyRegistrationStatusSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => Box::pin(async move { + Ok(http::Response::builder() + .status(200) + .header("grpc-status", "12") + .header("content-type", "application/grpc") + .body(empty_body()) + .unwrap()) + }), + } + } + } + impl Clone for RegistrationServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + impl Clone for _Inner { + fn clone(&self) -> Self { + Self(Arc::clone(&self.0)) + } + } + impl std::fmt::Debug for _Inner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0) + } + } + impl 
tonic::server::NamedService for RegistrationServer { + const NAME: &'static str = "pluginregistration.Registration"; + } +} diff --git a/agent/src/plugin_manager/v1.rs b/agent/src/plugin_manager/v1.rs new file mode 100644 index 000000000..9ebaa8c1d --- /dev/null +++ b/agent/src/plugin_manager/v1.rs @@ -0,0 +1,432 @@ +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct AllocatableResourcesRequest {} +/// AllocatableResourcesResponses contains informations about all the devices known by the kubelet +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct AllocatableResourcesResponse { + #[prost(message, repeated, tag = "1")] + pub devices: ::prost::alloc::vec::Vec, + #[prost(int64, repeated, tag = "2")] + pub cpu_ids: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "3")] + pub memory: ::prost::alloc::vec::Vec, +} +/// ListPodResourcesRequest is the request made to the PodResourcesLister service +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ListPodResourcesRequest {} +/// ListPodResourcesResponse is the response returned by List function +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ListPodResourcesResponse { + #[prost(message, repeated, tag = "1")] + pub pod_resources: ::prost::alloc::vec::Vec, +} +/// PodResources contains information about the node resources assigned to a pod +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PodResources { + #[prost(string, tag = "1")] + pub name: ::prost::alloc::string::String, + #[prost(string, tag = "2")] + pub namespace: ::prost::alloc::string::String, + #[prost(message, repeated, tag = "3")] + pub containers: ::prost::alloc::vec::Vec, +} +/// ContainerResources contains information about the resources assigned to a container 
+#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ContainerResources { + #[prost(string, tag = "1")] + pub name: ::prost::alloc::string::String, + #[prost(message, repeated, tag = "2")] + pub devices: ::prost::alloc::vec::Vec, + #[prost(int64, repeated, tag = "3")] + pub cpu_ids: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "4")] + pub memory: ::prost::alloc::vec::Vec, +} +/// ContainerMemory contains information about memory and hugepages assigned to a container +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ContainerMemory { + #[prost(string, tag = "1")] + pub memory_type: ::prost::alloc::string::String, + #[prost(uint64, tag = "2")] + pub size: u64, + #[prost(message, optional, tag = "3")] + pub topology: ::core::option::Option, +} +/// ContainerDevices contains information about the devices assigned to a container +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ContainerDevices { + #[prost(string, tag = "1")] + pub resource_name: ::prost::alloc::string::String, + #[prost(string, repeated, tag = "2")] + pub device_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + #[prost(message, optional, tag = "3")] + pub topology: ::core::option::Option, +} +/// Topology describes hardware topology of the resource +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct TopologyInfo { + #[prost(message, repeated, tag = "1")] + pub nodes: ::prost::alloc::vec::Vec, +} +/// NUMA representation of NUMA node +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct NumaNode { + #[prost(int64, tag = "1")] + pub id: i64, +} +/// Generated client implementations. 
+pub mod pod_resources_lister_client { + #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] + use tonic::codegen::http::Uri; + use tonic::codegen::*; + /// PodResourcesLister is a service provided by the kubelet that provides information about the + /// node resources consumed by pods and containers on the node + #[derive(Debug, Clone)] + pub struct PodResourcesListerClient { + inner: tonic::client::Grpc, + } + impl PodResourcesListerClient { + /// Attempt to create a new client by connecting to a given endpoint. + pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl PodResourcesListerClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + Send + 'static, + ::Error: Into + Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> PodResourcesListerClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + >>::Error: + Into + Send + Sync, + { + PodResourcesListerClient::new(InterceptedService::new(inner, interceptor)) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. 
+ #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + pub async fn list( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> + { + self.inner.ready().await.map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/v1.PodResourcesLister/List"); + let mut req = request.into_request(); + req.extensions_mut() + .insert(GrpcMethod::new("v1.PodResourcesLister", "List")); + self.inner.unary(req, path, codec).await + } + pub async fn get_allocatable_resources( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> + { + self.inner.ready().await.map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/v1.PodResourcesLister/GetAllocatableResources", + ); + let mut req = request.into_request(); + req.extensions_mut().insert(GrpcMethod::new( + "v1.PodResourcesLister", + "GetAllocatableResources", + )); + self.inner.unary(req, path, codec).await + } + } +} +/// Generated server implementations. 
+pub mod pod_resources_lister_server { + #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with PodResourcesListerServer. + #[async_trait] + pub trait PodResourcesLister: Send + Sync + 'static { + async fn list( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + async fn get_allocatable_resources( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; + } + /// PodResourcesLister is a service provided by the kubelet that provides information about the + /// node resources consumed by pods and containers on the node + #[derive(Debug)] + pub struct PodResourcesListerServer { + inner: _Inner, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + struct _Inner(Arc); + impl PodResourcesListerServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + let inner = _Inner(inner); + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor(inner: T, interceptor: F) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. 
+ #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> for PodResourcesListerServer + where + T: PodResourcesLister, + B: Body + Send + 'static, + B::Error: Into + Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + let inner = self.inner.clone(); + match req.uri().path() { + "/v1.PodResourcesLister/List" => { + #[allow(non_camel_case_types)] + struct ListSvc(pub Arc); + impl + tonic::server::UnaryService for ListSvc + { + type Response = super::ListPodResourcesResponse; + type Future = BoxFuture, tonic::Status>; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::list(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let inner = inner.0; + let method = ListSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = 
tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + "/v1.PodResourcesLister/GetAllocatableResources" => { + #[allow(non_camel_case_types)] + struct GetAllocatableResourcesSvc(pub Arc); + impl + tonic::server::UnaryService + for GetAllocatableResourcesSvc + { + type Response = super::AllocatableResourcesResponse; + type Future = BoxFuture, tonic::Status>; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_allocatable_resources( + &inner, request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let inner = inner.0; + let method = GetAllocatableResourcesSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => Box::pin(async move { + Ok(http::Response::builder() + .status(200) + .header("grpc-status", "12") + .header("content-type", "application/grpc") + .body(empty_body()) + .unwrap()) + }), + } + } + } + impl Clone for PodResourcesListerServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: 
self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + impl Clone for _Inner { + fn clone(&self) -> Self { + Self(Arc::clone(&self.0)) + } + } + impl std::fmt::Debug for _Inner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.0) + } + } + impl tonic::server::NamedService for PodResourcesListerServer { + const NAME: &'static str = "v1.PodResourcesLister"; + } +} diff --git a/agent/src/util/v1beta1.rs b/agent/src/plugin_manager/v1beta1.rs similarity index 84% rename from agent/src/util/v1beta1.rs rename to agent/src/plugin_manager/v1beta1.rs index c84581fee..59d60f9ea 100644 --- a/agent/src/util/v1beta1.rs +++ b/agent/src/plugin_manager/v1beta1.rs @@ -4,6 +4,9 @@ pub struct DevicePluginOptions { /// Indicates if PreStartContainer call is required before each container start #[prost(bool, tag = "1")] pub pre_start_required: bool, + /// Indicates if GetPreferredAllocation is implemented and available for calling + #[prost(bool, tag = "2")] + pub get_preferred_allocation_available: bool, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -26,7 +29,7 @@ pub struct RegisterRequest { #[derive(Clone, PartialEq, ::prost::Message)] pub struct Empty {} /// ListAndWatch returns a stream of List of Devices -/// Whenever a Device state change or a Device disapears, ListAndWatch +/// Whenever a Device state change or a Device disappears, ListAndWatch /// returns the new list #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -34,10 +37,25 @@ pub struct ListAndWatchResponse { #[prost(message, repeated, tag = "1")] pub devices: ::prost::alloc::vec::Vec, } +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct 
TopologyInfo { + #[prost(message, repeated, tag = "1")] + pub nodes: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct NumaNode { + #[prost(int64, tag = "1")] + pub id: i64, +} /// E.g: /// struct Device { /// ID: "GPU-fef8089b-4820-abfc-e83e-94318197576e", -/// State: "Healthy", +/// Health: "Healthy", +/// Topology: +/// Node: +/// ID: 1 /// } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -50,6 +68,9 @@ pub struct Device { /// Health of the device, can be healthy or unhealthy, see constants.go #[prost(string, tag = "2")] pub health: ::prost::alloc::string::String, + /// Topology for device + #[prost(message, optional, tag = "3")] + pub topology: ::core::option::Option, } /// - PreStartContainer is expected to be called before each container start if indicated by plugin during registration phase. /// - PreStartContainer allows kubelet to pass reinitialized devices to containers. @@ -65,6 +86,44 @@ pub struct PreStartContainerRequest { #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct PreStartContainerResponse {} +/// PreferredAllocationRequest is passed via a call to GetPreferredAllocation() +/// at pod admission time. The device plugin should take the list of +/// `available_deviceIDs` and calculate a preferred allocation of size +/// 'allocation_size' from them, making sure to include the set of devices +/// listed in 'must_include_deviceIDs'. 
+#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PreferredAllocationRequest { + #[prost(message, repeated, tag = "1")] + pub container_requests: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ContainerPreferredAllocationRequest { + /// List of available deviceIDs from which to choose a preferred allocation + #[prost(string, repeated, tag = "1")] + pub available_device_i_ds: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// List of deviceIDs that must be included in the preferred allocation + #[prost(string, repeated, tag = "2")] + pub must_include_device_i_ds: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// Number of devices to include in the preferred allocation + #[prost(int32, tag = "3")] + pub allocation_size: i32, +} +/// PreferredAllocationResponse returns a preferred allocation, +/// resulting from a PreferredAllocationRequest. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct PreferredAllocationResponse { + #[prost(message, repeated, tag = "1")] + pub container_responses: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ContainerPreferredAllocationResponse { + #[prost(string, repeated, tag = "1")] + pub device_i_ds: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, +} /// - Allocate is expected to be called during pod creation since allocation /// failures for any container would result in pod startup failure. 
/// - Allocate allows kubelet to exposes additional artifacts in a pod's @@ -366,7 +425,7 @@ pub mod device_plugin_client { self.inner.unary(req, path, codec).await } /// ListAndWatch returns a stream of List of Devices - /// Whenever a Device state change or a Device disapears, ListAndWatch + /// Whenever a Device state change or a Device disappears, ListAndWatch /// returns the new list pub async fn list_and_watch( &mut self, @@ -388,6 +447,33 @@ pub mod device_plugin_client { .insert(GrpcMethod::new("v1beta1.DevicePlugin", "ListAndWatch")); self.inner.server_streaming(req, path, codec).await } + /// GetPreferredAllocation returns a preferred set of devices to allocate + /// from a list of available ones. The resulting preferred allocation is not + /// guaranteed to be the allocation ultimately performed by the + /// devicemanager. It is only designed to help the devicemanager make a more + /// informed allocation decision when possible. + pub async fn get_preferred_allocation( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result, tonic::Status> + { + self.inner.ready().await.map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/v1beta1.DevicePlugin/GetPreferredAllocation", + ); + let mut req = request.into_request(); + req.extensions_mut().insert(GrpcMethod::new( + "v1beta1.DevicePlugin", + "GetPreferredAllocation", + )); + self.inner.unary(req, path, codec).await + } /// Allocate is called during container creation so that the Device /// Plugin can run device specific operations and instruct Kubelet /// of the steps to make the Device available in the container @@ -410,7 +496,7 @@ pub mod device_plugin_client { } /// PreStartContainer is called, if indicated by Device Plugin during registeration phase, /// before each container start. 
Device plugin can run device specific operations - /// such as reseting the device before making devices available to the container + /// such as resetting the device before making devices available to the container pub async fn pre_start_container( &mut self, request: impl tonic::IntoRequest, @@ -621,12 +707,21 @@ pub mod device_plugin_server { > + Send + 'static; /// ListAndWatch returns a stream of List of Devices - /// Whenever a Device state change or a Device disapears, ListAndWatch + /// Whenever a Device state change or a Device disappears, ListAndWatch /// returns the new list async fn list_and_watch( &self, request: tonic::Request, ) -> std::result::Result, tonic::Status>; + /// GetPreferredAllocation returns a preferred set of devices to allocate + /// from a list of available ones. The resulting preferred allocation is not + /// guaranteed to be the allocation ultimately performed by the + /// devicemanager. It is only designed to help the devicemanager make a more + /// informed allocation decision when possible. + async fn get_preferred_allocation( + &self, + request: tonic::Request, + ) -> std::result::Result, tonic::Status>; /// Allocate is called during container creation so that the Device /// Plugin can run device specific operations and instruct Kubelet /// of the steps to make the Device available in the container @@ -636,7 +731,7 @@ pub mod device_plugin_server { ) -> std::result::Result, tonic::Status>; /// PreStartContainer is called, if indicated by Device Plugin during registeration phase, /// before each container start. 
Device plugin can run device specific operations - /// such as reseting the device before making devices available to the container + /// such as resetting the device before making devices available to the container async fn pre_start_container( &self, request: tonic::Request, @@ -796,6 +891,49 @@ pub mod device_plugin_server { }; Box::pin(fut) } + "/v1beta1.DevicePlugin/GetPreferredAllocation" => { + #[allow(non_camel_case_types)] + struct GetPreferredAllocationSvc(pub Arc); + impl + tonic::server::UnaryService + for GetPreferredAllocationSvc + { + type Response = super::PreferredAllocationResponse; + type Future = BoxFuture, tonic::Status>; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::get_preferred_allocation(&inner, request).await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let inner = inner.0; + let method = GetPreferredAllocationSvc(inner); + let codec = tonic::codec::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/v1beta1.DevicePlugin/Allocate" => { #[allow(non_camel_case_types)] struct AllocateSvc(pub Arc); diff --git a/agent/src/util/config_action.rs b/agent/src/util/config_action.rs deleted file mode 100644 index f97d542b0..000000000 --- a/agent/src/util/config_action.rs +++ /dev/null @@ -1,684 +0,0 @@ -use super::{ - constants::{ - 
DISCOVERY_OPERATOR_FINISHED_DISCOVERY_CHANNEL_CAPACITY, - DISCOVERY_OPERATOR_STOP_DISCOVERY_CHANNEL_CAPACITY, - }, - device_plugin_service, - device_plugin_service::DevicePluginContext, - discovery_operator::start_discovery::{start_discovery, DiscoveryOperator}, - registration::RegisteredDiscoveryHandlerMap, -}; -use akri_shared::{ - akri::configuration::Configuration, - k8s, - k8s::{try_delete_instance, KubeInterface}, -}; -use futures::{StreamExt, TryStreamExt}; -use kube::api::{Api, ListParams}; -use kube_runtime::watcher::{default_backoff, watcher, Event}; -use kube_runtime::WatchStreamExt; -use log::{error, info, trace}; -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, -}; -use tokio::sync::{broadcast, mpsc, RwLock}; - -pub type ConfigId = (String, String); -type ConfigMap = Arc>>; - -/// Information for managing a Configuration, such as all applied Instances of that Configuration -/// and senders for ceasing to discover instances upon Configuration deletion. -#[derive(Debug)] -pub struct ConfigInfo { - /// Map of all of a Configuration's Instances - device_plugin_context: Arc>, - /// Sends notification to a `DiscoveryOperator` that it should stop all discovery for its Configuration. - /// This signals it to tell each of its subtasks to stop discovery. - /// A broadcast channel is used so both the sending and receiving ends can be cloned. - stop_discovery_sender: broadcast::Sender<()>, - /// Receives notification that all `DiscoveryOperators` threads have completed and a Configuration's Instances - /// can be safely deleted and the associated `DevicePluginServices` terminated. - finished_discovery_receiver: mpsc::Receiver<()>, - /// Tracks the last generation of the `Configuration` resource (i.e. `.metadata.generation`). - /// This is used to determine if the `Configuration` actually changed, or if only the metadata changed. - /// The `.metadata.generation` value is incremented for all changes, except for changes to `.metadata` or `.status`. 
- last_generation: Option, -} - -/// This handles pre-existing Configurations and invokes an internal method that watches for Configuration events. -pub async fn do_config_watch( - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, - node_name: String, -) -> Result<(), Box> { - info!("do_config_watch - enter"); - let config_map: ConfigMap = Arc::new(RwLock::new(HashMap::new())); - let kube_interface = Arc::new(k8s::KubeImpl::new().await?); - let mut tasks = Vec::new(); - - // Handle pre-existing configs - let pre_existing_configs = kube_interface.get_configurations().await?; - for config in pre_existing_configs { - let config_map = config_map.clone(); - let discovery_handler_map = discovery_handler_map.clone(); - let new_discovery_handler_sender = new_discovery_handler_sender.clone(); - let new_kube_interface = kube_interface.clone(); - let new_node_name = node_name.clone(); - tasks.push(tokio::spawn(async move { - handle_config_add( - new_kube_interface, - &config, - config_map, - discovery_handler_map, - new_discovery_handler_sender, - new_node_name, - ) - .await - .unwrap(); - })); - } - - // Watch for new configs and changes - tasks.push(tokio::spawn(async move { - watch_for_config_changes( - kube_interface, - config_map, - discovery_handler_map, - new_discovery_handler_sender, - node_name, - ) - .await - .unwrap(); - })); - - futures::future::try_join_all(tasks).await?; - info!("do_config_watch - end"); - Ok(()) -} - -/// This watches for Configuration events -async fn watch_for_config_changes( - kube_interface: Arc, - config_map: ConfigMap, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, - node_name: String, -) -> Result<(), Box> { - trace!("watch_for_config_changes - start"); - let resource = Api::::all(kube_interface.get_kube_client()); - let watcher = watcher(resource, ListParams::default()).backoff(default_backoff()); - let mut informer = 
watcher.boxed(); - // Currently, this does not handle None except to break the loop. - loop { - let event = match informer.try_next().await { - Err(e) => { - error!("Error during watch: {}", e); - continue; - } - Ok(None) => break, - Ok(Some(event)) => event, - }; - let new_discovery_handler_sender = new_discovery_handler_sender.clone(); - handle_config( - kube_interface.clone(), - event, - config_map.clone(), - discovery_handler_map.clone(), - new_discovery_handler_sender, - node_name.clone(), - ) - .await? - } - Ok(()) -} - -/// This takes an event off the Configuration stream and delegates it to the -/// correct function based on the event type. -async fn handle_config( - kube_interface: Arc, - event: Event, - config_map: ConfigMap, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, - node_name: String, -) -> anyhow::Result<()> { - trace!("handle_config - something happened to a configuration"); - match event { - Event::Applied(config) => { - info!( - "handle_config - added or modified Configuration {:?}", - config.metadata.name.as_ref().unwrap(), - ); - handle_config_apply( - kube_interface, - config, - config_map, - discovery_handler_map, - new_discovery_handler_sender, - node_name, - ) - .await?; - } - Event::Deleted(config) => { - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - info!("handle_config - deleted Configuration {:?}", config_id,); - handle_config_delete(kube_interface.as_ref(), config_id, config_map).await?; - } - Event::Restarted(configs) => { - let new_configs: HashSet = configs - .iter() - .map(|config| { - ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ) - }) - .collect(); - let old_configs: HashSet = config_map.read().await.keys().cloned().collect(); - for config_id in old_configs.difference(&new_configs) { - handle_config_delete( - kube_interface.as_ref(), - 
config_id.clone(), - config_map.clone(), - ) - .await?; - } - for config in configs { - handle_config_apply( - kube_interface.clone(), - config, - config_map.clone(), - discovery_handler_map.clone(), - new_discovery_handler_sender.clone(), - node_name.clone(), - ) - .await?; - } - } - } - Ok(()) -} - -async fn handle_config_apply( - kube_interface: Arc, - config: Configuration, - config_map: ConfigMap, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, - node_name: String, -) -> anyhow::Result<()> { - // Applied events can either be newly added Configurations or modified Configurations. - // If modified delete all associated instances and device plugins and then recreate them to reflect updated config - // TODO: more gracefully handle modified Configurations by determining what changed rather than delete/re-add - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - if config_map.read().await.contains_key(&config_id) { - let do_recreate = should_recreate_config(&config, config_map.clone()).await?; - if !do_recreate { - trace!( - "handle_config - config {:?} has not changed. ignoring config modified event.", - config.metadata.name, - ); - return Ok(()); - } - info!( - "handle_config - modified Configuration {:?}", - config.metadata.name, - ); - handle_config_delete(kube_interface.as_ref(), config_id, config_map.clone()).await?; - } - - tokio::spawn(async move { - handle_config_add( - kube_interface, - &config, - config_map, - discovery_handler_map, - new_discovery_handler_sender, - node_name, - ) - .await - .unwrap(); - }); - Ok(()) -} - -/// This handles added Configuration by creating a new ConfigInfo for it and adding it to the ConfigMap. -/// Then calls a function to continually observe the availability of instances associated with the Configuration. 
-async fn handle_config_add( - kube_interface: Arc, - config: &Configuration, - config_map: ConfigMap, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, - node_name: String, -) -> Result<(), Box> { - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - // Create a new instance map for this config and add it to the config map - let device_plugin_context = Arc::new(RwLock::new(DevicePluginContext::default())); - let (stop_discovery_sender, _): (broadcast::Sender<()>, broadcast::Receiver<()>) = - broadcast::channel(DISCOVERY_OPERATOR_STOP_DISCOVERY_CHANNEL_CAPACITY); - let (mut finished_discovery_sender, finished_discovery_receiver) = - mpsc::channel(DISCOVERY_OPERATOR_FINISHED_DISCOVERY_CHANNEL_CAPACITY); - let config_info = ConfigInfo { - device_plugin_context: device_plugin_context.clone(), - stop_discovery_sender: stop_discovery_sender.clone(), - finished_discovery_receiver, - last_generation: config.metadata.generation, - }; - config_map.write().await.insert(config_id, config_info); - - let config = config.clone(); - // Keep discovering instances until the config is deleted, signaled by a message from handle_config_delete - tokio::spawn(async move { - let discovery_operator = - DiscoveryOperator::new(discovery_handler_map, config, device_plugin_context); - start_discovery( - discovery_operator, - new_discovery_handler_sender, - stop_discovery_sender, - &mut finished_discovery_sender, - kube_interface, - node_name, - ) - .await - .unwrap(); - }) - .await?; - Ok(()) -} - -/// This handles a deleted Configuration. First, it ceases to discover instances associated with the Configuration. -/// Then, for each of the Configuration's Instances, it signals the DevicePluginService to shutdown, -/// and deletes the Instance CRD. 
-async fn handle_config_delete( - kube_interface: &dyn KubeInterface, - config_id: ConfigId, - config_map: ConfigMap, -) -> anyhow::Result<()> { - trace!( - "handle_config_delete - for config {:?} telling do_periodic_discovery to end", - config_id - ); - // Send message to stop observing instances' availability and waits until response is received - if config_map - .read() - .await - .get(&config_id) - .unwrap() - .stop_discovery_sender - .clone() - .send(()) - .is_ok() - { - if config_map - .write() - .await - .get_mut(&config_id) - .unwrap() - .finished_discovery_receiver - .recv() - .await - .is_some() - { - trace!( - "handle_config_delete - for config {:?} received message that do_periodic_discovery ended", - config_id - ); - } else { - trace!( - "handle_config_delete - for config {:?} do_periodic_discovery sender has been dropped", - config_id - ); - } - } else { - trace!( - "handle_config_delete - for config {:?} do_periodic_discovery receiver has been dropped", - config_id - ); - } - - // Get map of instances for the Configuration and then remove Configuration from ConfigMap - let device_plugin_context; - { - let mut config_map_locked = config_map.write().await; - device_plugin_context = config_map_locked - .get(&config_id) - .unwrap() - .device_plugin_context - .clone(); - config_map_locked.remove(&config_id); - } - delete_all_instances_in_device_plugin_context( - kube_interface, - device_plugin_context.clone(), - config_id, - ) - .await?; - if let Some(sender) = &device_plugin_context - .read() - .await - .usage_update_message_sender - { - sender.send(device_plugin_service::ListAndWatchMessageKind::End)?; - } - Ok(()) -} - -/// Checks to see if the configuration needs to be recreated. -/// At present, this just checks to see if the `.metadata.generation` has changed. -/// The `.metadata.generation` value is incremented for all changes, except for changes to `.metadata` or `.status`. 
-async fn should_recreate_config( - config: &Configuration, - config_map: ConfigMap, -) -> Result { - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - let last_generation = config_map - .read() - .await - .get(&config_id) - .ok_or_else(|| anyhow::anyhow!("Configuration {:?} not found in ConfigMap", &config_id))? - .last_generation; - - if config.metadata.generation <= last_generation { - return Ok(false); - } - - Ok(true) -} - -/// This shuts down all a Configuration's Instances and terminates the associated Device Plugins -pub async fn delete_all_instances_in_device_plugin_context( - kube_interface: &dyn k8s::KubeInterface, - device_plugin_context: Arc>, - (namespace, name): ConfigId, -) -> anyhow::Result<()> { - let mut device_plugin_context_locked = device_plugin_context.write().await; - let instances_to_delete_map = device_plugin_context_locked.clone().instances; - for (instance_name, instance_info) in instances_to_delete_map { - trace!( - "handle_config_delete - found Instance {} associated with deleted config {:?} ... 
sending message to end list_and_watch", - instance_name, - (namespace.clone(), name.clone()), - ); - instance_info - .list_and_watch_message_sender - .send(device_plugin_service::ListAndWatchMessageKind::End) - .unwrap(); - device_plugin_context_locked - .instances - .remove(&instance_name); - try_delete_instance(kube_interface, &instance_name, namespace.as_str()).await?; - } - Ok(()) -} - -#[cfg(test)] -mod config_action_tests { - use super::super::{ - device_plugin_service, device_plugin_service::InstanceConnectivityStatus, - discovery_operator::tests::build_device_plugin_context, - }; - use super::*; - use akri_shared::{akri::configuration::Configuration, k8s::MockKubeInterface}; - use std::{collections::HashMap, fs, sync::Arc, vec}; - use tokio::sync::{broadcast, RwLock}; - - // Test that watcher restarts correctly add/remove Configurations - #[tokio::test] - async fn test_handle_watcher_restart() { - let _ = env_logger::builder().is_test(true).try_init(); - - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - let kube_interface = Arc::new(MockKubeInterface::new()); - - let config_map = Arc::new(RwLock::new(HashMap::new())); - let dh_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let (tx, mut _rx1) = broadcast::channel(1); - assert!(handle_config( - kube_interface.clone(), - Event::Restarted(vec![config]), - config_map.clone(), - dh_map.clone(), - tx.clone(), - "node-a".to_string(), - ) - .await - .is_ok()); - - // Wait for `handle_config_add` to effectively add the config to the map - // FIXME: This looks a lot like a race window - tokio::time::sleep(tokio::time::Duration::from_micros(100)).await; - assert!(config_map.read().await.contains_key(&config_id)); - - 
assert!(handle_config( - kube_interface, - Event::Restarted(Vec::new()), - config_map.clone(), - dh_map, - tx, - "node-a".to_string(), - ) - .await - .is_ok()); - - assert!(!config_map.read().await.contains_key(&config_id)); - } - - #[tokio::test] - async fn test_handle_config_delete() { - let _ = env_logger::builder().is_test(true).try_init(); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - let mut list_and_watch_message_receivers = Vec::new(); - let mut visible_discovery_results = Vec::new(); - let mut mock = MockKubeInterface::new(); - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Online, - ) - .await; - let (stop_discovery_sender, mut stop_discovery_receiver) = broadcast::channel(2); - let (finished_discovery_sender, finished_discovery_receiver) = mpsc::channel(2); - let mut map: HashMap = HashMap::new(); - map.insert( - config_id.clone(), - ConfigInfo { - stop_discovery_sender, - device_plugin_context: device_plugin_context.clone(), - finished_discovery_receiver, - last_generation: config.metadata.generation, - }, - ); - let config_map: ConfigMap = Arc::new(RwLock::new(map)); - - mock.expect_delete_instance() - .times(2) - .returning(move |_, _| Ok(())); - tokio::spawn(async move { - handle_config_delete(&mock, config_id.clone(), config_map.clone()) - .await - .unwrap(); - // Assert that config is removed from map after it has been deleted - assert!(!config_map.read().await.contains_key(&config_id)); - }); - - // Assert that handle_config_delete tells start_discovery to end - assert!(stop_discovery_receiver.recv().await.is_ok()); - // Mimic 
do_periodic_discovery's response - finished_discovery_sender.send(()).await.unwrap(); - - // Assert list_and_watch is signaled to end for every instance associated with a config - let mut tasks = Vec::new(); - for mut receiver in list_and_watch_message_receivers { - tasks.push(tokio::spawn(async move { - assert_eq!( - receiver.recv().await.unwrap(), - device_plugin_service::ListAndWatchMessageKind::End - ); - })); - } - futures::future::join_all(tasks).await; - - // Assert that all instances have been removed from the instance map - assert_eq!(device_plugin_context.read().await.instances.len(), 0); - } - - #[tokio::test] - async fn test_handle_config_delete_already_dropped() { - let _ = env_logger::builder().is_test(true).try_init(); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - let mut list_and_watch_message_receivers = Vec::new(); - let mut visible_discovery_results = Vec::new(); - let mut mock = MockKubeInterface::new(); - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Online, - ) - .await; - let (stop_discovery_sender, mut stop_discovery_receiver) = broadcast::channel(2); - let (_, finished_discovery_receiver) = mpsc::channel(2); - let mut map: HashMap = HashMap::new(); - map.insert( - config_id.clone(), - ConfigInfo { - stop_discovery_sender, - device_plugin_context: device_plugin_context.clone(), - finished_discovery_receiver, - last_generation: config.metadata.generation, - }, - ); - let config_map: ConfigMap = Arc::new(RwLock::new(map)); - - mock.expect_delete_instance() - .times(2) - .returning(move |_, _| Ok(())); - - let handle_delete = 
tokio::spawn(async move { - handle_config_delete(&mock, config_id.clone(), config_map.clone()) - .await - .unwrap(); - // Assert that config is removed from map after it has been deleted - assert!(!config_map.read().await.contains_key(&config_id)); - }); - - // Assert that handle_config_delete tells start_discovery to end - assert!(stop_discovery_receiver.recv().await.is_ok()); - handle_delete.await.unwrap(); - // Assert list_and_watch is signaled to end for every instance associated with a config - let mut tasks = Vec::new(); - for mut receiver in list_and_watch_message_receivers { - tasks.push(tokio::spawn(async move { - assert_eq!( - receiver.recv().await.unwrap(), - device_plugin_service::ListAndWatchMessageKind::End - ); - })); - } - futures::future::join_all(tasks).await; - - // Assert that all instances have been removed from the instance map - assert_eq!(device_plugin_context.read().await.instances.len(), 0); - } - - // Tests that when a Configuration is updated, - // if generation has changed, should return true - #[tokio::test] - async fn test_should_recreate_config_new_generation() { - let (mut config, config_map) = get_should_recreate_config_data().await; - - // using different generation as what is already in config_map - config.metadata.generation = Some(2); - let do_recreate = should_recreate_config(&config, config_map.clone()) - .await - .unwrap(); - - assert!(do_recreate) - } - - // Tests that when a Configuration is updated, - // if generation has NOT changed, should return false - #[tokio::test] - async fn test_should_recreate_config_same_generation() { - let (mut config, config_map) = get_should_recreate_config_data().await; - - // using same generation as what is already in config_map - config.metadata.generation = Some(1); - let do_recreate = should_recreate_config(&config, config_map.clone()) - .await - .unwrap(); - - assert!(!do_recreate) - } - - // Tests that when a Configuration is updated, - // if generation is older, should return false 
- #[tokio::test] - async fn test_should_recreate_config_older_generation() { - let (mut config, config_map) = get_should_recreate_config_data().await; - - // using older generation than what is already in config_map - config.metadata.generation = Some(0); - let do_recreate = should_recreate_config(&config, config_map.clone()) - .await - .unwrap(); - - assert!(!do_recreate) - } - - async fn get_should_recreate_config_data() -> (Configuration, ConfigMap) { - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - - let (stop_discovery_sender, _) = broadcast::channel(2); - let (_, finished_discovery_receiver) = mpsc::channel(2); - - let config_info = ConfigInfo { - device_plugin_context: Arc::new(RwLock::new(DevicePluginContext::default())), - stop_discovery_sender: stop_discovery_sender.clone(), - finished_discovery_receiver, - last_generation: Some(1), - }; - let config_id: ConfigId = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.name.clone().unwrap(), - ); - let config_map: ConfigMap = Arc::new(RwLock::new(HashMap::new())); - config_map.write().await.insert(config_id, config_info); - (config, config_map) - } -} diff --git a/agent/src/util/constants.rs b/agent/src/util/constants.rs deleted file mode 100644 index e3c0cdfaa..000000000 --- a/agent/src/util/constants.rs +++ /dev/null @@ -1,76 +0,0 @@ -/// For unshared devices, Healthy means the device is discoverable. For shared devices, Healthy means the device is -/// either unused or used by this node. -pub const HEALTHY: &str = "Healthy"; - -/// For unshared devices, Unhealthy means the device is not discoverable. For shared devices, Unhealthy means that the -/// device shared and used already by another node. -pub const UNHEALTHY: &str = "Unhealthy"; - -/// Current version of the API supported by kubelet. 
-pub const K8S_DEVICE_PLUGIN_VERSION: &str = "v1beta1"; - -/// DevicePluginPath is the folder the kubelet expects to find Device-Plugin sockets. -pub const DEVICE_PLUGIN_PATH: &str = "/var/lib/kubelet/device-plugins"; - -/// Path of the Kubelet registry socket -pub const KUBELET_SOCKET: &str = "/var/lib/kubelet/device-plugins/kubelet.sock"; - -/// Maximum length of time `list_and_watch` will sleep before sending kubelet another list of virtual devices -pub const LIST_AND_WATCH_SLEEP_SECS: u64 = 60; - -/// Length of time a shared instance can be offline before it's `DevicePluginService` is shutdown. -pub const SHARED_INSTANCE_OFFLINE_GRACE_PERIOD_SECS: u64 = 300; - -/// Length of time to sleep between slot reconciliation checks -pub const SLOT_RECONCILIATION_CHECK_DELAY_SECS: u64 = 10; - -/// Length of time a slot can be unused before slot reconciliation reclaims it -pub const SLOT_RECONCILIATION_SLOT_GRACE_PERIOD_SECS: u64 = 300; - -/// Label of environment variable that, when set, enables the embedded debug echo discovery handler -#[cfg(any(test, feature = "agent-full"))] -pub const ENABLE_DEBUG_ECHO_LABEL: &str = "ENABLE_DEBUG_ECHO"; - -/// Capacity of channel over which `DevicePluginService::list_and_watch` sends updates to kubelet about "virtual" device -/// health of an instance. The kubelet Device Plugin manager should receive each message instantly; however, providing -/// some buffer in case. -pub const KUBELET_UPDATE_CHANNEL_CAPACITY: usize = 4; - -/// Capacity of channel over which the Agent Registration updates `DiscoveryOperators` when new `DiscoveryHandlers` -/// register. Tokio does not provide an unbounded broadcast channel in order to prevent the channel from growing -/// infinitely due to a "slow receiver". 
It is hard to determine an appropriate channel size, since the number of -/// `DiscoveryOperator` receivers (equivalent to number of applied Akri Configurations) and the frequency of sends -/// (equivalent to the number of registering `DiscoveryHandlers`) are unpredictable. Therefore, a large size is chosen -/// out of caution. -pub const NEW_DISCOVERY_HANDLER_CHANNEL_CAPACITY: usize = 15; - -/// Capacity of channel over which the `DevicePluginService::list_and_watch` receives messages to -/// `ListAndWatchMessageKind::Continue` (prematurely send updates to kubelet) or `ListAndWatchMessageKind::End` -/// (terminate itself). `list_and_watch` receives messages asynchronously from `DevicePluginService.allocate`, -/// `DiscoveryOperator.update_connectivity_status`, and `handle_config_delete`. Messages are sent as a response to a -/// variety of events, such as an Instance going offline/online, a Configuration being deleted, or a slot being -/// requested via allocate that is already taken, making it hard to determine the appropriate size of the channel. If a -/// new message is put in the channel after capacity is already met, the oldest message is dropped, dropping a -/// `ListAndWatchMessageKind::End` would likely be unrecoverable. Tokio does not provide an unbounded broadcast channel -/// in order to prevent the channel from growing infinitely due to a "slow receiver", so a large channel size is chosen -/// out of caution. -pub const LIST_AND_WATCH_MESSAGE_CHANNEL_CAPACITY: usize = 15; - -/// Capacity of channel over which a `DevicePluginService` receives a shutdown signal. This is either sent by -/// `DevicePluginBuilder::register` or `DevicePluginService::list_and_watch`. Capacity is set to meet worst case -/// scenario in which they both send messages at the same time. -pub const DEVICE_PLUGIN_SERVER_ENDER_CHANNEL_CAPACITY: usize = 2; - -/// Capacity of channel over which a `DiscoveryOperator` is notified to stop discovery for its Configuration. 
This -/// signals it to tell each of its subtasks to stop discovery. Message is only sent once, upon Configuration deletion. -pub const DISCOVERY_OPERATOR_STOP_DISCOVERY_CHANNEL_CAPACITY: usize = 1; - -/// Capacity of channel over which a DiscoveryOperator signals that it has stopped discovery and a Configuration's -/// Instances and associated `DevicePluginServices` can safely be deleted/terminated. There is only one sender -/// (`DiscoveryOperator`) who only sends a message once. -pub const DISCOVERY_OPERATOR_FINISHED_DISCOVERY_CHANNEL_CAPACITY: usize = 1; - -/// Capacity of channel over which `DiscoveryOperator` is notified to stop (trying to make) a connection with a -/// `DiscoveryHandler`. Sent once by the Agent Registration service when a `DiscoveryHandler` re-registers with a different -/// registration request (edge case). -pub const CLOSE_DISCOVERY_HANDLER_CONNECTION_CHANNEL_CAPACITY: usize = 1; diff --git a/agent/src/util/crictl_containers.rs b/agent/src/util/crictl_containers.rs deleted file mode 100644 index 241690f44..000000000 --- a/agent/src/util/crictl_containers.rs +++ /dev/null @@ -1,160 +0,0 @@ -use akri_shared::akri::{instance::device_usage::NodeUsage, AKRI_SLOT_ANNOTATION_NAME_PREFIX}; -use std::collections::HashMap; -use std::str::FromStr; - -/// Output from crictl query -#[derive(Serialize, Deserialize, Clone, Debug)] -#[serde(rename_all = "camelCase")] -struct CriCtlOutput { - containers: Vec, -} - -/// Container from crictl query -#[derive(Serialize, Deserialize, Clone, Debug)] -#[serde(rename_all = "camelCase")] -struct CriCtlContainer { - annotations: HashMap, -} - -/// This gets the usage slots for an instance by getting the annotations that were stored at id `AKRI_SLOT_ANNOTATION_NAME_PREFIX` during allocate. 
-pub fn get_container_slot_usage(crictl_output: &str) -> HashMap { - match serde_json::from_str::(crictl_output) { - Ok(crictl_output_parsed) => crictl_output_parsed - .containers - .iter() - .flat_map(|container| &container.annotations) - .filter_map(|(key, value)| { - if key.starts_with(AKRI_SLOT_ANNOTATION_NAME_PREFIX) { - let slot_id = key - .strip_prefix(AKRI_SLOT_ANNOTATION_NAME_PREFIX) - .unwrap_or_default(); - match NodeUsage::from_str(value) { - Ok(node_usage) => Some((slot_id.to_string(), node_usage)), - Err(_) => None, - } - } else { - None - } - }) - .collect(), - Err(e) => { - trace!( - "handle_crictl_output - failed to parse crictl output: {:?} => [{:?}]", - e, - &crictl_output - ); - HashMap::default() - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use akri_shared::akri::instance::device_usage::DeviceUsageKind; - - fn get_container_str(annotation: &str) -> String { - format!("{{ \ - \"id\": \"46afc04a13ac21d73ff93843efd39590d66927d9b5d743d239542cf2f6de703e\", \ - \"podSandboxId\": \"9094d7341170ecbc6fb0a6a72ba449c8ea98d3267c60e06d815d03102ca7a3e6\", \ - \"metadata\": {{ \ - \"name\": \"akri-agent\", \ - \"attempt\": 0 \ - }}, \ - \"image\": {{ \ - \"image\": \"akri.sh/agent@sha256:86bb6234353129bcae170cfc7db5ad5f282cfc3495555a39aa88042948491850\" \ - }}, \ - \"imageRef\": \"sha256:1305fb97b2db8e9aa715af6a6cd0711986da7935bcbb98f6363aaa5b86163072\", \ - \"state\": \"CONTAINER_RUNNING\", \ - \"createdAt\": \"1587749289000000000\", \ - \"labels\": {{ \ - \"io.kubernetes.container.name\": \"akri-agent\", \ - \"io.kubernetes.pod.name\": \"akri-agent-daemonset-lt2gc\", \ - \"io.kubernetes.pod.namespace\": \"default\", \ - \"io.kubernetes.pod.uid\": \"1ed0098d-8d6f-4001-8192-f690f9b8ae98\" \ - }}, \ - \"annotations\": {{ \ - {} \ - \"io.kubernetes.container.hash\": \"34d65174\", \ - \"io.kubernetes.container.restartCount\": \"0\", \ - \"io.kubernetes.container.terminationMessagePath\": \"/dev/termination-log\", \ - 
\"io.kubernetes.container.terminationMessagePolicy\": \"File\", \ - \"io.kubernetes.pod.terminationGracePeriod\": \"30\" \ - }} \ - }}", - annotation) - } - - #[test] - fn test_get_container_slot_usage() { - let _ = env_logger::builder().is_test(true).try_init(); - - // Empty output - assert_eq!( - HashMap::::new(), - get_container_slot_usage(r#""#) - ); - // Empty json output - assert_eq!( - HashMap::::new(), - get_container_slot_usage(r#"{}"#) - ); - // Expected output with no containers - assert_eq!( - HashMap::::new(), - get_container_slot_usage(r#"{\"containers\": []}"#) - ); - // Output with syntax error - assert_eq!( - HashMap::::new(), - get_container_slot_usage(r#"{ddd}"#) - ); // syntax error - // Expected output with no slot - assert_eq!( - HashMap::::new(), - get_container_slot_usage(&format!( - "{{ \"containers\": [ {} ] }}", - &get_container_str("") - )) - ); - // Expected output with slot (including unexpected property) - let mut expected = HashMap::new(); - expected.insert( - "foo".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - assert_eq!( - expected, - get_container_slot_usage(&format!( - "{{ \"ddd\": \"\", \"containers\": [ {} ] }}", - &get_container_str("\"akri.agent.slot-foo\": \"node-a\",") - )) - ); - // Expected output with slot - assert_eq!( - expected, - get_container_slot_usage(&format!( - "{{ \"containers\": [ {} ] }}", - &get_container_str("\"akri.agent.slot-foo\": \"node-a\",") - )) - ); - // Expected output with multiple containers - let mut expected_2 = HashMap::new(); - expected_2.insert( - "foo1".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - expected_2.insert( - "foo2".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-b").unwrap(), - ); - assert_eq!( - expected_2, - get_container_slot_usage(&format!( - "{{ \"containers\": [ {}, {} ] }}", - &get_container_str("\"akri.agent.slot-foo1\": \"node-a\","), - 
&get_container_str("\"akri.agent.slot-foo2\": \"node-b\","), - )) - ); - } -} diff --git a/agent/src/util/device_plugin_builder.rs b/agent/src/util/device_plugin_builder.rs deleted file mode 100644 index d20f161c9..000000000 --- a/agent/src/util/device_plugin_builder.rs +++ /dev/null @@ -1,437 +0,0 @@ -use super::{ - constants::{ - DEVICE_PLUGIN_PATH, DEVICE_PLUGIN_SERVER_ENDER_CHANNEL_CAPACITY, K8S_DEVICE_PLUGIN_VERSION, - KUBELET_SOCKET, LIST_AND_WATCH_MESSAGE_CHANNEL_CAPACITY, - }, - device_plugin_service::{ - get_device_instance_name, ConfigurationDevicePlugin, DevicePluginBehavior, - DevicePluginContext, DevicePluginService, InstanceDevicePlugin, ListAndWatchMessageKind, - }, - v1beta1, - v1beta1::{ - device_plugin_server::{DevicePlugin, DevicePluginServer}, - registration_client, DevicePluginOptions, - }, -}; -use akri_discovery_utils::discovery::v0::Device; -use akri_shared::{ - akri::{configuration::Configuration, AKRI_PREFIX}, - uds::unix_stream, -}; -use async_trait::async_trait; -use futures::TryFutureExt; -use log::{info, trace}; -#[cfg(test)] -use mockall::{automock, predicate::*}; -use std::sync::Arc; -use std::{convert::TryFrom, path::Path, time::SystemTime}; -use tokio::{ - net::UnixListener, - net::UnixStream, - sync::{broadcast, mpsc, RwLock}, - task, -}; -use tonic::transport::{Endpoint, Server, Uri}; -use tower::service_fn; - -#[cfg_attr(test, automock)] -#[async_trait] -pub trait DevicePluginBuilderInterface: Send + Sync { - async fn build_device_plugin( - &self, - instance_id: String, - config: &Configuration, - shared: bool, - device_plugin_context: Arc>, - device: Device, - node_name: String, - ) -> Result<(), Box>; - - async fn build_configuration_device_plugin( - &self, - device_plugin_name: String, - config: &Configuration, - device_plugin_context: Arc>, - node_name: String, - ) -> Result< - broadcast::Sender, - Box, - >; -} - -/// For each Instance, builds a Device Plugin, registers it with the kubelet, and serves it over UDS. 
-pub struct DevicePluginBuilder {} - -#[async_trait] -impl DevicePluginBuilderInterface for DevicePluginBuilder { - /// This creates a new DevicePluginService for an instance and registers it with the kubelet - async fn build_device_plugin( - &self, - instance_id: String, - config: &Configuration, - shared: bool, - device_plugin_context: Arc>, - device: Device, - node_name: String, - ) -> Result<(), Box> { - let instance_name = - get_device_instance_name(&instance_id, config.metadata.name.as_ref().unwrap()); - info!("build_device_plugin - entered for device {}", instance_name); - let device_plugin_behavior = DevicePluginBehavior::Instance(InstanceDevicePlugin { - instance_id: instance_id.clone(), - shared, - device: device.clone(), - }); - let (list_and_watch_message_sender, _) = - broadcast::channel(LIST_AND_WATCH_MESSAGE_CHANNEL_CAPACITY); - self.build_device_plugin_service( - &instance_name, - config, - device_plugin_context, - device_plugin_behavior, - list_and_watch_message_sender, - node_name, - ) - .await - } - - /// This creates a new ConfigurationDevicePluginService for a Configuration and registers it with the kubelet - async fn build_configuration_device_plugin( - &self, - device_plugin_name: String, - config: &Configuration, - device_plugin_context: Arc>, - node_name: String, - ) -> Result< - broadcast::Sender, - Box, - > { - info!( - "build_configuration_device_plugin - entered for device {}", - device_plugin_name - ); - let device_plugin_behavior = - DevicePluginBehavior::Configuration(ConfigurationDevicePlugin::default()); - let (list_and_watch_message_sender, _) = - broadcast::channel(LIST_AND_WATCH_MESSAGE_CHANNEL_CAPACITY); - self.build_device_plugin_service( - &device_plugin_name, - config, - device_plugin_context, - device_plugin_behavior, - list_and_watch_message_sender.clone(), - node_name, - ) - .await?; - Ok(list_and_watch_message_sender) - } -} - -impl DevicePluginBuilder { - async fn build_device_plugin_service( - &self, - 
device_plugin_name: &str, - config: &Configuration, - device_plugin_context: Arc>, - device_plugin_behavior: DevicePluginBehavior, - list_and_watch_message_sender: broadcast::Sender, - node_name: String, - ) -> Result<(), Box> { - let capability_id: String = format!("{}/{}", AKRI_PREFIX, device_plugin_name); - let unique_time = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH)?; - let device_endpoint: String = - format!("{}-{}.sock", device_plugin_name, unique_time.as_secs()); - let socket_path: String = Path::new(DEVICE_PLUGIN_PATH) - .join(device_endpoint.clone()) - .to_str() - .unwrap() - .to_string(); - let (server_ender_sender, server_ender_receiver) = - mpsc::channel(DEVICE_PLUGIN_SERVER_ENDER_CHANNEL_CAPACITY); - let device_plugin_service = DevicePluginService { - instance_name: device_plugin_name.to_string(), - config: config.spec.clone(), - config_name: config.metadata.name.clone().unwrap(), - config_uid: config.metadata.uid.as_ref().unwrap().clone(), - config_namespace: config.metadata.namespace.as_ref().unwrap().clone(), - node_name, - device_plugin_context, - list_and_watch_message_sender, - server_ender_sender: server_ender_sender.clone(), - device_plugin_behavior, - }; - - self.serve( - device_plugin_service, - socket_path.clone(), - server_ender_receiver, - ) - .await?; - - self.register( - &capability_id, - &device_endpoint, - device_plugin_name, - server_ender_sender, - KUBELET_SOCKET, - ) - .await?; - - Ok(()) - } - - // This starts a DevicePluginServer - async fn serve( - &self, - device_plugin_service: T, - socket_path: String, - server_ender_receiver: mpsc::Receiver<()>, - ) -> Result<(), Box> { - info!( - "serve - creating a device plugin server that will listen at: {}", - socket_path - ); - tokio::fs::create_dir_all(Path::new(&socket_path[..]).parent().unwrap()) - .await - .expect("Failed to create dir at socket path"); - let service = DevicePluginServer::new(device_plugin_service); - let task_socket_path = socket_path.clone(); - 
task::spawn(async move { - let socket_to_delete = task_socket_path.clone(); - let incoming = { - let uds = - UnixListener::bind(task_socket_path).expect("Failed to bind to socket path"); - - async_stream::stream! { - loop { - let item = uds.accept().map_ok(|(st, _)| unix_stream::UnixStream(st)).await; - yield item; - } - } - }; - Server::builder() - .add_service(service) - .serve_with_incoming_shutdown(incoming, shutdown_signal(server_ender_receiver)) - .await - .unwrap(); - trace!( - "serve - gracefully shutdown ... deleting socket {}", - socket_to_delete - ); - // Socket may already be deleted in the case of the kubelet restart - std::fs::remove_file(socket_to_delete).unwrap_or(()); - }); - - akri_shared::uds::unix_stream::try_connect(&socket_path).await?; - Ok(()) - } - - /// This registers DevicePlugin with the kubelet. - /// During registration, the device plugin must send - /// (1) name of unix socket, - /// (2) Device-Plugin API it was built against (v1beta1), - /// (3) resource name akri.sh/device_id. - /// If registration request to the kubelet fails, terminates DevicePluginService. - async fn register( - &self, - capability_id: &str, - socket_name: &str, - instance_name: &str, - server_ender_sender: mpsc::Sender<()>, - kubelet_socket: &str, - ) -> Result<(), Box> { - info!( - "register - entered for Instance {} and socket_name: {}", - capability_id, socket_name - ); - let op = DevicePluginOptions { - pre_start_required: false, - }; - - // We will ignore this dummy uri because UDS does not use it. - // Some servers will check the uri content so the uri needs to - // be in valid format even it's not used, the scheme part is used - // to specific what scheme to use, such as http or https - let kubelet_socket_closure = kubelet_socket.to_string(); - let channel = Endpoint::try_from("http://[::1]:50051")? 
- .connect_with_connector(service_fn(move |_: Uri| { - UnixStream::connect(kubelet_socket_closure.clone()) - })) - .await?; - let mut registration_client = registration_client::RegistrationClient::new(channel); - - let register_request = tonic::Request::new(v1beta1::RegisterRequest { - version: K8S_DEVICE_PLUGIN_VERSION.into(), - endpoint: socket_name.to_string(), - resource_name: capability_id.to_string(), - options: Some(op), - }); - trace!( - "register - before call to register with the kubelet at socket {}", - kubelet_socket - ); - - // If fail to register with the kubelet, terminate device plugin - if registration_client - .register(register_request) - .await - .is_err() - { - trace!( - "register - failed to register Instance {} with the kubelet ... terminating device plugin", - instance_name - ); - server_ender_sender.send(()).await?; - } - Ok(()) - } -} - -/// This acts as a signal future to gracefully shutdown DevicePluginServer upon its completion. -/// Ends when it receives message from `list_and_watch`. -async fn shutdown_signal(mut server_ender_receiver: mpsc::Receiver<()>) { - match server_ender_receiver.recv().await { - Some(_) => trace!( - "shutdown_signal - received signal ... device plugin service gracefully shutting down" - ), - None => trace!("shutdown_signal - connection to server_ender_sender closed ... error"), - } -} - -#[cfg(test)] -pub mod tests { - use super::super::v1beta1::{ - registration_server::{Registration, RegistrationServer}, - Empty, RegisterRequest, - }; - use super::*; - use tempfile::Builder; - - struct MockRegistration { - pub return_error: bool, - } - - // Mock implementation of kubelet's registration service for tests. - // Can be configured with its `return_error` field to return an error. 
- #[async_trait] - impl Registration for MockRegistration { - async fn register( - &self, - _request: tonic::Request, - ) -> Result, tonic::Status> { - if self.return_error { - Err(tonic::Status::invalid_argument( - "mock discovery handler error", - )) - } else { - Ok(tonic::Response::new(Empty {})) - } - } - } - - async fn serve_for_test>( - service: RegistrationServer, - socket: P, - ) { - let incoming = { - let uds = UnixListener::bind(socket).expect("Failed to bind to socket path"); - - async_stream::stream! { - loop { - let item = uds.accept().map_ok(|(st, _)| unix_stream::UnixStream(st)).await; - yield item; - } - } - }; - - Server::builder() - .add_service(service) - .serve_with_incoming(incoming) - .await - .unwrap(); - } - - #[tokio::test] - async fn test_register() { - let device_plugins_dirs = Builder::new().prefix("device-plugins").tempdir().unwrap(); - let kubelet_socket = device_plugins_dirs.path().join("kubelet.sock"); - let kubelet_socket_clone = kubelet_socket.clone(); - let kubelet_socket_str = kubelet_socket_clone.to_str().unwrap(); - - // Start kubelet registration server - let registration = MockRegistration { - return_error: false, - }; - let service = RegistrationServer::new(registration); - task::spawn(async move { - serve_for_test(service, kubelet_socket).await; - }); - - // Make sure registration server has started - akri_shared::uds::unix_stream::try_connect(kubelet_socket_str) - .await - .unwrap(); - - let device_plugin_builder = DevicePluginBuilder {}; - let (server_ender_sender, _) = mpsc::channel(1); - // Test successful registration - assert!(device_plugin_builder - .register( - "random_instance_id", - "socket.sock", - "random_instance", - server_ender_sender, - kubelet_socket_str - ) - .await - .is_ok()); - } - - #[tokio::test] - async fn test_register_error() { - let device_plugin_builder = DevicePluginBuilder {}; - let (server_ender_sender, mut server_ender_receiver) = mpsc::channel(1); - let device_plugins_dirs = 
Builder::new().prefix("device-plugins").tempdir().unwrap(); - let kubelet_socket = device_plugins_dirs.path().join("kubelet.sock"); - let kubelet_socket_clone = kubelet_socket.clone(); - let kubelet_socket_str = kubelet_socket_clone.to_str().unwrap(); - - // Try to register when no registration service exists - assert!(device_plugin_builder - .register( - "random_instance_id", - "socket.sock", - "random_instance", - server_ender_sender.clone(), - kubelet_socket_str - ) - .await - .is_err()); - - // Start kubelet registration server - let registration = MockRegistration { return_error: true }; - let service = RegistrationServer::new(registration); - task::spawn(async move { - serve_for_test(service, kubelet_socket).await; - }); - - // Make sure registration server has started - akri_shared::uds::unix_stream::try_connect(kubelet_socket_str) - .await - .unwrap(); - - // Test that when registration fails, no error is thrown but the DevicePluginService is signaled to shutdown - assert!(device_plugin_builder - .register( - "random_instance_id", - "socket.sock", - "random_instance", - server_ender_sender, - kubelet_socket_str - ) - .await - .is_ok()); - // Make sure DevicePluginService is signaled to shutdown - server_ender_receiver.recv().await.unwrap(); - } -} diff --git a/agent/src/util/device_plugin_service.rs b/agent/src/util/device_plugin_service.rs deleted file mode 100644 index 3a1a6ada1..000000000 --- a/agent/src/util/device_plugin_service.rs +++ /dev/null @@ -1,3129 +0,0 @@ -use super::constants::{ - HEALTHY, KUBELET_UPDATE_CHANNEL_CAPACITY, LIST_AND_WATCH_SLEEP_SECS, UNHEALTHY, -}; -use super::v1beta1; -use super::v1beta1::{ - device_plugin_server::DevicePlugin, AllocateRequest, AllocateResponse, DevicePluginOptions, - DeviceSpec, Empty, ListAndWatchResponse, Mount, PreStartContainerRequest, - PreStartContainerResponse, -}; -use akri_discovery_utils::discovery::v0::Device; -use akri_shared::{ - akri::{ - configuration::ConfigurationSpec, - 
instance::device_usage::{DeviceUsageKind, NodeUsage}, - instance::InstanceSpec, - retry::{random_delay, MAX_INSTANCE_UPDATE_TRIES}, - AKRI_SLOT_ANNOTATION_NAME_PREFIX, - }, - k8s, - k8s::KubeInterface, -}; -use log::{error, info, trace}; -#[cfg(test)] -use mock_instant::Instant; -#[cfg(not(test))] -use std::time::Instant; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; -use tokio::{ - sync::{broadcast, mpsc, RwLock}, - time::timeout, -}; -use tokio_stream::wrappers::ReceiverStream; -use tonic::{Code, Request, Response, Status}; - -/// Message sent in channel to `list_and_watch`. -/// Dictates what action `list_and_watch` should take upon being awoken. -#[derive(PartialEq, Clone, Debug)] -pub enum ListAndWatchMessageKind { - /// Prematurely continue looping - Continue, - /// Stop looping - End, -} - -/// Describes whether an instance was discovered or the time at which it was no longer discovered. -#[derive(PartialEq, Debug, Clone)] -pub enum InstanceConnectivityStatus { - /// Was discovered - Online, - /// Could not be discovered. Instant contains time at which it was no longer discovered. - Offline(Instant), -} - -/// Contains an Instance's state -#[derive(Clone, Debug)] -pub struct InstanceInfo { - /// Sender to tell `list_and_watch` to either prematurely continue looping or end - pub list_and_watch_message_sender: broadcast::Sender, - /// Instance's `InstanceConnectivityStatus` - pub connectivity_status: InstanceConnectivityStatus, - /// Instance's hash id - pub instance_id: String, - /// Device that the instance represents. - /// Contains information about environment variables and volumes that should be mounted - /// into requesting Pods. 
- pub device: Device, -} - -#[derive(Clone, Debug, Default)] -pub struct DevicePluginContext { - /// Sender to tell Configuration device plugin `list_and_watch` to either prematurely continue looping or end - pub usage_update_message_sender: Option>, - /// Map of all Instances from the same Configuration - pub instances: HashMap, -} - -#[derive(Clone)] -pub enum DevicePluginBehavior { - Configuration(ConfigurationDevicePlugin), - Instance(InstanceDevicePlugin), -} - -#[derive(PartialEq, Clone, Debug)] -pub enum DeviceUsageStatus { - /// Free - Free, - /// Reserved by Configuration Device Plugin on current node - ReservedByConfiguration(String), - /// Reserved by Instance Device Plugin on current node - ReservedByInstance, - /// Reserved by other nodes - ReservedByOtherNode, - /// Unknown, insufficient information to determine the status, - /// mostly due to the device usage slot is not found from the instance map - Unknown, -} - -/// Kubernetes Device-Plugin for an Instance. -/// -/// `DevicePluginService` implements Kubernetes Device-Plugin v1beta1 API specification -/// defined in a public proto file (imported here at agent/proto/pluginapi.proto). -/// The code generated from pluginapi.proto can be found in `agent/src/util/v1beta1.rs`. -/// Each `DevicePluginService` has an associated Instance and Configuration. -/// Serves a unix domain socket, sending and receiving messages to/from kubelet. -/// Kubelet is its client, calling each of its methods. 
-#[derive(Clone)] -pub struct DevicePluginService { - /// Instance CRD name - pub instance_name: String, - /// Instance's Configuration - pub config: ConfigurationSpec, - /// Name of Instance's Configuration CRD - pub config_name: String, - /// UID of Instance's Configuration CRD - pub config_uid: String, - /// Namespace of Instance's Configuration CRD - pub config_namespace: String, - /// Hostname of node this Device Plugin is running on - pub node_name: String, - /// Map of all Instances that have the same Configuration CRD as this one - pub device_plugin_context: Arc>, - /// Receiver for list_and_watch continue or end messages - /// Note: since the tonic grpc generated list_and_watch definition takes in &self, - /// using broadcast sender instead of mpsc receiver - /// Can clone broadcast sender and subscribe receiver to use in spawned thread in list_and_watch - pub list_and_watch_message_sender: broadcast::Sender, - /// Upon send, terminates function that acts as the shutdown signal for this service - pub server_ender_sender: mpsc::Sender<()>, - /// Enum object that defines the behavior of the device plugin - pub device_plugin_behavior: DevicePluginBehavior, -} - -#[tonic::async_trait] -impl DevicePlugin for DevicePluginService { - /// Returns options to be communicated with kubelet Device Manager - async fn get_device_plugin_options( - &self, - _request: Request, - ) -> Result, Status> { - trace!("get_device_plugin_options - kubelet called get_device_plugin_options"); - let resp = DevicePluginOptions { - pre_start_required: false, - }; - Ok(Response::new(resp)) - } - - type ListAndWatchStream = ReceiverStream>; - - /// Called by Kubelet right after the DevicePluginService registers with Kubelet. - /// Returns a stream of List of "virtual" Devices over a channel. - /// Since Kubernetes designed Device-Plugin so that multiple consumers can use a Device, - /// "virtual" Devices are reservation slots for using the Device or Instance in akri terms. 
- /// The number of "virtual" Devices (length of `ListAndWatchResponse`) is determined by Instance.capacity. - /// Whenever Instance state changes or an Instance disapears, `list_and_watch` returns the new list. - /// Runs until receives message to end due to Instance disappearing or Configuration being deleted. - async fn list_and_watch( - &self, - _request: Request, - ) -> Result, Status> { - info!( - "list_and_watch - kubelet called list_and_watch for instance {}", - self.instance_name - ); - let kube_interface = Arc::new(k8s::KubeImpl::new().await.unwrap()); - self.internal_list_and_watch(kube_interface).await - } - - /// Kubelet calls allocate during pod creation. - /// This means kubelet is trying to reserve a usage slot (virtual Device) of the Instance for this node. - /// Returns error if cannot reserve that slot. - async fn allocate( - &self, - requests: Request, - ) -> Result, Status> { - info!( - "allocate - kubelet called allocate for Instance {}", - self.instance_name - ); - let kube_interface = Arc::new(k8s::KubeImpl::new().await.unwrap()); - self.internal_allocate(requests, kube_interface).await - } - - /// Should never be called, as indicated by DevicePluginService during registration. 
- async fn pre_start_container( - &self, - _request: Request, - ) -> Result, Status> { - error!( - "pre_start_container - kubelet called pre_start_container for Instance {}", - self.instance_name - ); - Ok(Response::new(v1beta1::PreStartContainerResponse {})) - } -} - -impl DevicePluginService { - async fn internal_list_and_watch<'a>( - &'a self, - kube_interface: Arc, - ) -> Result::ListAndWatchStream>, Status> { - let dps = Arc::new(self.clone()); - // Create a channel that list_and_watch can periodically send updates to kubelet on - let (kubelet_update_sender, kubelet_update_receiver) = - mpsc::channel(KUBELET_UPDATE_CHANNEL_CAPACITY); - // Spawn thread so can send kubelet the receiving end of the channel to listen on - tokio::spawn(async move { - match &dps.device_plugin_behavior { - DevicePluginBehavior::Configuration(dp) => { - dp.list_and_watch( - dps.clone(), - kube_interface, - kubelet_update_sender, - LIST_AND_WATCH_SLEEP_SECS, - ) - .await - } - DevicePluginBehavior::Instance(dp) => { - dp.list_and_watch( - dps.clone(), - kube_interface, - kubelet_update_sender, - LIST_AND_WATCH_SLEEP_SECS, - ) - .await - } - } - }); - Ok(Response::new(ReceiverStream::new(kubelet_update_receiver))) - } - - /// Called when kubelet is trying to reserve for this node a usage slot (or virtual device) of the Instance. - /// Tries to update Instance CRD to reserve the requested slot. If cannot reserve that slot, forces `list_and_watch` to continue - /// (sending kubelet the latest list of slots) and returns error, so kubelet will not schedule the pod to this node. 
- async fn internal_allocate( - &self, - requests: Request, - kube_interface: Arc, - ) -> Result, Status> { - let dps = Arc::new(self.clone()); - match &dps.device_plugin_behavior { - DevicePluginBehavior::Configuration(dp) => { - dp.allocate(dps.clone(), requests, kube_interface).await - } - DevicePluginBehavior::Instance(dp) => { - dp.allocate(dps.clone(), requests, kube_interface).await - } - } - } -} - -#[derive(Clone)] -pub struct InstanceDevicePlugin { - /// Instance hash id - pub instance_id: String, - /// Instance is \[not\]shared - pub shared: bool, - /// Device that the instance represents. - /// Contains information about environment variables and volumes that should be mounted - /// into requesting Pods. - pub device: Device, -} - -impl InstanceDevicePlugin { - async fn list_and_watch( - &self, - dps: Arc, - kube_interface: Arc, - kubelet_update_sender: mpsc::Sender>, - polling_interval_secs: u64, - ) { - let mut list_and_watch_message_receiver = dps.list_and_watch_message_sender.subscribe(); - let mut keep_looping = true; - // Try to create an Instance CRD for this plugin and add it to the global InstanceMap else shutdown - if let Err(e) = try_create_instance(dps.clone(), self, kube_interface.clone()).await { - error!( - "InstanceDevicePlugin::list_and_watch - ending service because could not create instance {} with error {}", - dps.instance_name, - e - ); - dps.server_ender_sender.clone().send(()).await.unwrap(); - keep_looping = false; - } - - let mut prev_virtual_devices: Vec = Vec::new(); - while keep_looping { - trace!( - "InstanceDevicePlugin::list_and_watch - loop iteration for Instance {}", - dps.instance_name - ); - - let device_usage_states = get_instance_device_usage_states( - &dps.node_name, - &dps.instance_name, - &dps.config_namespace, - &dps.config.capacity, - kube_interface.clone(), - ) - .await; - - // Generate virtual devices, for Instance Device Plugin virtual devices, - // the health state is healthy if the slot is free or - // is 
reserved by Instance Device Plugin itself previously - let virtual_devices = device_usage_states - .into_iter() - .map(|(slot, state)| v1beta1::Device { - id: slot, - health: match state { - DeviceUsageStatus::Free | DeviceUsageStatus::ReservedByInstance => { - HEALTHY.to_string() - } - _ => UNHEALTHY.to_string(), - }, - }) - .collect::>(); - // Only send the virtual devices if the list has changed - if !(prev_virtual_devices - .iter() - .all(|item| virtual_devices.contains(item)) - && virtual_devices.len() == prev_virtual_devices.len()) - { - prev_virtual_devices = virtual_devices.clone(); - let resp = v1beta1::ListAndWatchResponse { - devices: virtual_devices, - }; - info!( - "InstanceDevicePlugin::list_and_watch - for device plugin {}, response = {:?}", - dps.instance_name, resp - ); - // Send virtual devices list back to kubelet - if let Err(e) = kubelet_update_sender.send(Ok(resp)).await { - trace!( - "InstanceDevicePlugin::list_and_watch - for Instance {} kubelet no longer receiving with error {}", - dps.instance_name, - e - ); - // This means kubelet is down/has been restarted. Remove instance from instance map so - // do_periodic_discovery will create a new device plugin service for this instance. 
- dps.device_plugin_context - .write() - .await - .instances - .remove(&dps.instance_name); - dps.server_ender_sender.clone().send(()).await.unwrap(); - keep_looping = false; - } else { - // Notify device usage had been changed - if let Some(sender) = &dps - .device_plugin_context - .read() - .await - .usage_update_message_sender - { - if let Err(e) = sender.send(ListAndWatchMessageKind::Continue) { - error!("InstanceDevicePlugin::list_and_watch - fails to notify device usage, error {}", e); - } - } - } - } - - // Sleep for polling_interval_secs unless receive message to shutdown the server - // or continue (and send another list of devices) - match timeout( - Duration::from_secs(polling_interval_secs), - list_and_watch_message_receiver.recv(), - ) - .await - { - Ok(message) => { - // If receive message to end list_and_watch, send list of unhealthy devices - // and shutdown the server by sending message on server_ender_sender channel - if message == Ok(ListAndWatchMessageKind::End) { - trace!( - "InstanceDevicePlugin::list_and_watch - for Instance {} received message to end", - dps.instance_name - ); - let devices = prev_virtual_devices - .iter() - .map(|d| v1beta1::Device { - id: d.id.clone(), - health: UNHEALTHY.into() - }) - .collect::>(); - if !devices.is_empty() { - let resp = v1beta1::ListAndWatchResponse { devices }; - info!( - "InstanceDevicePlugin::list_and_watch - for device plugin {}, end response = {:?}", - dps.instance_name, resp - ); - kubelet_update_sender.send(Ok(resp)) - .await - .unwrap(); - } - dps.server_ender_sender.clone().send(()).await.unwrap(); - keep_looping = false; - } - } - Err(_) => trace!( - "InstanceDevicePlugin::list_and_watch - for Instance {} did not receive a message for {} seconds ... 
continuing", dps.instance_name, polling_interval_secs - ), - } - } - trace!( - "InstanceDevicePlugin::list_and_watch - for Instance {} ending", - dps.instance_name - ); - // Notify device usage for this instance is gone - // Best effort, channel may be closed in the case of the Configuration delete - if let Some(sender) = &dps - .device_plugin_context - .read() - .await - .usage_update_message_sender - { - if let Err(e) = sender.send(ListAndWatchMessageKind::Continue) { - trace!( - "InstanceDevicePlugin::list_and_watch - fails to notify device usage on ending, error {}", - e - ); - } - } - } - - /// Called when kubelet is trying to reserve for this node a usage slot (or virtual device) of the Instance. - /// Tries to update Instance CRD to reserve the requested slot. If cannot reserve that slot, forces `list_and_watch` to continue - /// (sending kubelet the latest list of slots) and returns error, so kubelet will not schedule the pod to this node. - async fn allocate( - &self, - dps: Arc, - requests: Request, - kube_interface: Arc, - ) -> Result, Status> { - let mut container_responses: Vec = Vec::new(); - // Suffix to add to each device property - let device_property_suffix = self.instance_id.to_uppercase(); - - for request in requests.into_inner().container_requests { - trace!( - "InstanceDevicePlugin::allocate - for Instance {} handling request {:?}", - &dps.instance_name, - request, - ); - let mut akri_annotations = HashMap::new(); - let mut akri_device_properties = HashMap::new(); - let mut akri_devices = HashMap::::new(); - for device_usage_id in request.devices_i_ds { - trace!( - "InstanceDevicePlugin::allocate - for Instance {} processing request for device usage slot id {}", - &dps.instance_name, - device_usage_id - ); - - if let Err(e) = try_update_instance_device_usage( - &device_usage_id, - &dps.node_name, - &dps.instance_name, - &dps.config_namespace, - DeviceUsageKind::Instance, - kube_interface.clone(), - ) - .await - { - 
trace!("InstanceDevicePlugin::allocate - could not assign {} slot to {} node ... forcing list_and_watch to continue", device_usage_id, &dps.node_name); - dps.list_and_watch_message_sender - .send(ListAndWatchMessageKind::Continue) - .unwrap(); - return Err(e); - } - - let node_usage = - NodeUsage::create(&DeviceUsageKind::Instance, &dps.node_name).unwrap(); - akri_annotations.insert( - format!("{}{}", AKRI_SLOT_ANNOTATION_NAME_PREFIX, &device_usage_id), - node_usage.to_string(), - ); - - // Add suffix _ to each device property - let converted_properties = self - .device - .properties - .iter() - .map(|(key, value)| { - ( - format!("{}_{}", key, &device_property_suffix), - value.to_string(), - ) - }) - .collect::>(); - akri_device_properties.extend(converted_properties); - akri_devices.insert(dps.instance_name.clone(), self.device.clone()); - - trace!( - "InstanceDevicePlugin::allocate - finished processing device_usage_id {}", - device_usage_id - ); - } - // Successfully reserved device_usage_slot[s] for this node. 
- // Add response to list of responses - let broker_properties = - get_all_broker_properties(&dps.config.broker_properties, &akri_device_properties); - let response = build_container_allocate_response( - broker_properties, - akri_annotations, - &akri_devices.into_values().collect(), - ); - container_responses.push(response); - } - - // Notify device usage had been changed - if let Some(sender) = &dps - .device_plugin_context - .read() - .await - .usage_update_message_sender - { - if let Err(e) = sender.send(ListAndWatchMessageKind::Continue) { - error!( - "InstanceDevicePlugin::allocate - fails to notify device usage, error {}", - e - ); - } - } - trace!( - "InstanceDevicePlugin::allocate - for Instance {} returning responses", - &dps.instance_name - ); - Ok(Response::new(v1beta1::AllocateResponse { - container_responses, - })) - } -} - -#[derive(Clone, Default)] -pub struct ConfigurationDevicePlugin {} - -impl ConfigurationDevicePlugin { - async fn list_and_watch( - &self, - dps: Arc, - kube_interface: Arc, - kubelet_update_sender: mpsc::Sender>, - polling_interval_secs: u64, - ) { - let mut list_and_watch_message_receiver = dps.list_and_watch_message_sender.subscribe(); - let mut keep_looping = true; - let mut prev_virtual_devices = HashMap::new(); - while keep_looping { - trace!( - "ConfigurationDevicePlugin::list_and_watch - loop iteration for device plugin {}", - dps.instance_name - ); - let available_virtual_devices = get_available_virtual_devices( - &dps.device_plugin_context.read().await.clone().instances, - &dps.node_name, - &dps.config_namespace, - &dps.config.capacity, - kube_interface.clone(), - ) - .await; - info!( - "ConfigurationDevicePlugin::list_and_watch - available_virtual_devices = {:?}", - available_virtual_devices - ); - - let current_virtual_devices = available_virtual_devices - .into_iter() - .map(|id| (id, HEALTHY.to_string())) - .collect::>(); - - // Only send the virtual devices if the list has changed - if current_virtual_devices != 
prev_virtual_devices { - // Find devices that no longer exist, set health state to UNHEALTHY for all devices that are gone - let mut devices_to_report = prev_virtual_devices - .keys() - .filter(|key| !current_virtual_devices.contains_key(&(*key).clone())) - .map(|key| (key.clone(), UNHEALTHY.to_string())) - .collect::>(); - devices_to_report.extend(current_virtual_devices.clone()); - - prev_virtual_devices = current_virtual_devices; - - let resp = v1beta1::ListAndWatchResponse { - devices: devices_to_report - .iter() - .map(|(id, health)| v1beta1::Device { - id: id.clone(), - health: health.clone(), - }) - .collect(), - }; - info!( - "ConfigurationDevicePlugin::list_and_watch - for device plugin {}, response = {:?}", - dps.instance_name, resp - ); - // Send virtual devices list back to kubelet - if let Err(e) = kubelet_update_sender.send(Ok(resp)).await { - trace!( - "ConfigurationDevicePlugin::list_and_watch - for device plugin {} kubelet no longer receiving with error {}", - dps.instance_name, - e - ); - // This means kubelet is down/has been restarted. 
- dps.server_ender_sender.clone().send(()).await.unwrap(); - keep_looping = false; - } - } - - // Sleep for polling_interval_secs unless receive message to shutdown the server - // or continue (and send another list of devices) - match timeout( - Duration::from_secs(polling_interval_secs), - list_and_watch_message_receiver.recv(), - ) - .await - { - Ok(message) => { - // If receive message to end list_and_watch, send list of unhealthy devices - // and shutdown the server by sending message on server_ender_sender channel - if message == Ok(ListAndWatchMessageKind::End) { - trace!( - "ConfigurationDevicePlugin::list_and_watch - for device plugin {} received message to end", - dps.instance_name - ); - if !prev_virtual_devices.is_empty() { - let resp = v1beta1::ListAndWatchResponse { - devices: prev_virtual_devices.keys() - .map(|id| v1beta1::Device { - id: id.clone(), - health: UNHEALTHY.to_string(), - }) - .collect(), - }; - info!( - "ConfigurationDevicePlugin::list_and_watch - for device plugin {}, end response = {:?}", - dps.instance_name, resp - ); - kubelet_update_sender.send(Ok(resp)) - .await - .unwrap(); - } - dps.server_ender_sender.clone().send(()).await.unwrap(); - keep_looping = false; - } - }, - Err(_) => trace!( - "ConfigurationDevicePlugin::list_and_watch - for device plugin {} did not receive a message for {} seconds ... continuing", - dps.instance_name, polling_interval_secs - ), - } - } - trace!( - "ConfigurationDevicePlugin::list_and_watch - for device plugin {} ending", - dps.instance_name - ); - } - - /// Called when kubelet is trying to reserve for this node a usage slot (or virtual device) of the Instance. - /// Tries to update Instance CRD to reserve the requested slot. If cannot reserve that slot, forces `list_and_watch` to continue - /// (sending kubelet the latest list of slots) and returns error, so kubelet will not schedule the pod to this node. 
- async fn allocate( - &self, - dps: Arc, - requests: Request, - kube_interface: Arc, - ) -> Result, Status> { - let mut container_responses = Vec::new(); - let mut allocated_instances = HashMap::::new(); - for request in requests.into_inner().container_requests { - trace!( - "ConfigurationDevicePlugin::allocate - for device plugin {} handling request {:?}", - &dps.instance_name, - request, - ); - - let resources_to_allocate = match get_virtual_device_resources( - request.devices_i_ds.clone(), - &dps.device_plugin_context.read().await.clone().instances, - &dps.node_name, - &dps.config_namespace, - &dps.config.capacity, - kube_interface.clone(), - ) - .await - { - Ok(resources) => { - info!( - "ConfigurationDevicePlugin::allocate - resource to allocate = {:?}", - resources - ); - resources - } - Err(e) => { - dps.list_and_watch_message_sender - .send(ListAndWatchMessageKind::Continue) - .unwrap(); - return Err(e); - } - }; - - let mut akri_annotations = HashMap::new(); - let mut akri_device_properties = HashMap::new(); - let mut akri_devices = HashMap::::new(); - for (vdev_id, (instance_name, device_usage_id)) in resources_to_allocate { - info!( - "ConfigurationDevicePlugin::allocate - for device plugin {} processing request for vdevice id {}", - &dps.instance_name, - vdev_id - ); - // Find device from instance_map - let (device, instance_id) = match dps - .device_plugin_context - .read() - .await - .instances - .get(&instance_name) - .ok_or_else(|| { - Status::new( - Code::Unknown, - format!("instance {} not found in instance map", instance_name), - ) - }) { - Ok(instance_info) => ( - instance_info.device.clone(), - instance_info.instance_id.clone(), - ), - Err(e) => { - dps.list_and_watch_message_sender - .send(ListAndWatchMessageKind::Continue) - .unwrap(); - return Err(e); - } - }; - - if let Err(e) = try_update_instance_device_usage( - &device_usage_id, - &dps.node_name, - &instance_name, - &dps.config_namespace, - 
DeviceUsageKind::Configuration(vdev_id.clone()), - kube_interface.clone(), - ) - .await - { - trace!("ConfigurationDevicePlugin::allocate - could not assign {} slot to {} node ... forcing list_and_watch to continue", device_usage_id, &dps.node_name); - dps.list_and_watch_message_sender - .send(ListAndWatchMessageKind::Continue) - .unwrap(); - return Err(e); - } - - let node_usage = NodeUsage::create( - &DeviceUsageKind::Configuration(vdev_id.clone()), - &dps.node_name, - ) - .unwrap(); - akri_annotations.insert( - format!("{}{}", AKRI_SLOT_ANNOTATION_NAME_PREFIX, &device_usage_id), - node_usage.to_string(), - ); - - // Add suffix _ to each device property - let device_property_suffix = instance_id.to_uppercase(); - let converted_properties = device - .properties - .iter() - .map(|(key, value)| { - ( - format!("{}_{}", key, &device_property_suffix), - value.to_string(), - ) - }) - .collect::>(); - akri_device_properties.extend(converted_properties); - akri_devices.insert(instance_name.clone(), device.clone()); - - allocated_instances.insert( - instance_name.clone(), - (instance_name.clone(), device_usage_id.clone()), - ); - - trace!( - "ConfigurationDevicePlugin::allocate - finished processing device_usage_id {}", - device_usage_id - ); - } - // Successfully reserved device_usage_slot[s] for this node. 
- // Add response to list of responses - let broker_properties = - get_all_broker_properties(&dps.config.broker_properties, &akri_device_properties); - let response = build_container_allocate_response( - broker_properties, - akri_annotations, - &akri_devices.into_values().collect(), - ); - container_responses.push(response); - } - - // Notify effected instance device plugin to rescan list and watch and update the cl_usage_slot - { - let device_plugin_context = dps.device_plugin_context.read().await; - for instance_name in allocated_instances.keys() { - trace!( - "ConfigurationDevicePlugin::allocate - notify Instance {} to refresh list_and_watch", - instance_name, - ); - - if let Some(instance_info) = device_plugin_context.instances.get(instance_name) { - instance_info - .list_and_watch_message_sender - .send(ListAndWatchMessageKind::Continue) - .unwrap(); - } - } - } - trace!( - "ConfigurationDevicePlugin::allocate - for device plugin {} returning responses", - &dps.instance_name - ); - Ok(Response::new(v1beta1::AllocateResponse { - container_responses, - })) - } -} - -async fn get_available_virtual_devices( - instances: &HashMap, - node_name: &str, - instance_namespace: &str, - capacity: &i32, - kube_interface: Arc, -) -> HashSet { - let mut instance_device_usage_states = HashMap::new(); - for instance_name in instances.keys() { - let device_usage_states = get_instance_device_usage_states( - node_name, - instance_name, - instance_namespace, - capacity, - kube_interface.clone(), - ) - .await; - instance_device_usage_states.insert(instance_name.to_string(), device_usage_states); - } - let mut vdev_ids_to_report = HashSet::new(); - let mut free_instances = HashSet::new(); - for (instance_name, device_usage_states) in instance_device_usage_states { - for (_id, state) in device_usage_states { - match state { - DeviceUsageStatus::Free => { - free_instances.insert(instance_name.clone()); - } - DeviceUsageStatus::ReservedByConfiguration(vdev_id) => { - 
vdev_ids_to_report.insert(vdev_id); - } - _ => (), - }; - } - } - - // Decide virtual device id, report already reserved virtual devices + 1 free device usage (if available) per instance - let mut id_to_set: u32 = 0; - for _x in 0..free_instances.len() { - while vdev_ids_to_report.get(&format!("{}", id_to_set)).is_some() { - id_to_set += 1; - } - let id_string = format!("{}", id_to_set); - vdev_ids_to_report.insert(id_string); - id_to_set += 1; - } - - vdev_ids_to_report -} - -async fn get_virtual_device_resources( - requested_vdev_ids: Vec, - instances: &HashMap, - node_name: &str, - instance_namespace: &str, - capacity: &i32, - kube_interface: Arc, -) -> Result, Status> { - let mut instance_device_usage_states = HashMap::new(); - for instance_name in instances.keys() { - let device_usage_states = get_instance_device_usage_states( - node_name, - instance_name, - instance_namespace, - capacity, - kube_interface.clone(), - ) - .await; - instance_device_usage_states.insert(instance_name.to_string(), device_usage_states); - } - - let mut usage_ids_to_use = HashMap::::new(); - - // Get all available virtual devices, group by device usage status, - // a virtual device is available if it's Free or ReservedByConfiguration with requested vdev_ids - let mut free_device_usage_states = HashMap::new(); - let mut reserved_by_configuration_usage_states = HashMap::new(); - for (instance_name, device_usage_state) in instance_device_usage_states { - for (slot, state) in device_usage_state { - match state { - DeviceUsageStatus::Free => { - free_device_usage_states - .entry(instance_name.to_string()) - .or_insert(HashSet::new()) - .insert(slot); - } - DeviceUsageStatus::ReservedByConfiguration(vdev_id) => { - if requested_vdev_ids.contains(&vdev_id) { - reserved_by_configuration_usage_states - .entry(instance_name.to_string()) - .or_insert(HashMap::new()) - .insert(slot, vdev_id); - } - } - - _ => (), - }; - } - } - - // Find (Instance, usage slot) for vdev ids, (Instance, usage 
slot) in free_device_usage_states and - // reserved_by_configuration_usage_states are available to be assigned. - // Iterate vdev ids to look up vdev id from reserved_by_configuration_usage_states, if not found - // pick one (Instance, usage slot) from free_device_usage_states - let unallocated_device_ids = requested_vdev_ids - .into_iter() - .filter(|vdev_id| { - // true means not allocated - let mut resource: Option<(String, String)> = None; - - for (instance_name, slots) in &reserved_by_configuration_usage_states { - for (slot, vid) in slots { - if vdev_id == vid { - resource = Some((instance_name.clone(), slot.clone())); - break; - } - } - if resource.is_some() { - break; - } - } - - if resource.is_none() { - if let Some((instance_name, slots)) = &free_device_usage_states - .iter() - .max_by(|a, b| a.1.len().cmp(&b.1.len())) - { - if let Some(slot) = slots.iter().next() { - resource = Some((instance_name.to_string(), slot.to_string())); - } - } - } - - match resource { - Some((instance_name, slot)) => { - reserved_by_configuration_usage_states.remove(&instance_name); - free_device_usage_states.remove(&instance_name); - usage_ids_to_use.insert(vdev_id.clone(), (instance_name, slot)); - false - } - None => true, - } - }) - .collect::>(); - - // Return error if any unallocated vdev id - if !unallocated_device_ids.is_empty() { - return Err(Status::new( - Code::Unknown, - "Insufficient instances to allocate", - )); - } - Ok(usage_ids_to_use) -} - -/// This returns device usage status of all slots for an Instance on a given node -/// if the Instance doesn't exist or fail to parse device usage of its slots return -/// DeviceUsageStatus::Unknown since insufficient information to decide the usage state -pub async fn get_instance_device_usage_states( - node_name: &str, - instance_name: &str, - instance_namespace: &str, - capacity: &i32, - kube_interface: Arc, -) -> Vec<(String, DeviceUsageStatus)> { - let mut device_usage_states = Vec::new(); - match kube_interface - 
.find_instance(instance_name, instance_namespace) - .await - { - Ok(kube_akri_instance) => { - for (device_name, device_usage_string) in kube_akri_instance.spec.device_usage { - let device_usage_status = match NodeUsage::from_str(&device_usage_string) { - Ok(node_usage) => get_device_usage_state(&node_usage, node_name), - Err(_) => { - error!( - "get_instance_device_usage_states - fail to parse device usage {}", - device_usage_string - ); - DeviceUsageStatus::Unknown - } - }; - device_usage_states.push((device_name.clone(), device_usage_status)); - } - device_usage_states - } - Err(_) => (0..*capacity) - .map(|x| { - ( - format!("{}-{}", instance_name, x), - DeviceUsageStatus::Unknown, - ) - }) - .collect(), - } -} - -/// This returns device usage status of a `device_usage_id` slot for an instance on a given node -/// # More details -/// Cases based on the device usage value -/// 1. DeviceUsageKind::Free ... this means that the device is available for use -/// * (ACTION) return DeviceUsageStatus::Free -/// 2. node_usage.node_name == node_name ... this means node_name previously used device_usage -/// * (ACTION) return previously reserved kind, DeviceUsageStatus::ReservedByConfiguration or DeviceUsageStatus::ReservedByInstance -/// 3. node_usage.node_name == (some other node) ... 
this means that we believe this device is in use by another node -/// * (ACTION) return DeviceUsageStatus::ReservedByOtherNode -fn get_device_usage_state(node_usage: &NodeUsage, node_name: &str) -> DeviceUsageStatus { - let device_usage_state = match node_usage.get_kind() { - DeviceUsageKind::Free => DeviceUsageStatus::Free, - DeviceUsageKind::Configuration(vdev_id) => { - DeviceUsageStatus::ReservedByConfiguration(vdev_id) - } - DeviceUsageKind::Instance => DeviceUsageStatus::ReservedByInstance, - }; - if device_usage_state != DeviceUsageStatus::Free && !node_usage.is_same_node(node_name) { - return DeviceUsageStatus::ReservedByOtherNode; - } - device_usage_state -} - -/// This tries up to `MAX_INSTANCE_UPDATE_TRIES` to update the requested slot of the Instance with the this node's name. -/// It cannot be assumed that this will successfully update Instance on first try since Device Plugins on other nodes -/// may be simultaneously trying to update the Instance. -/// This returns an error if slot already be reserved by other nodes or device plugins, -/// cannot be updated or `MAX_INSTANCE_UPDATE_TRIES` attempted. -async fn try_update_instance_device_usage( - device_usage_id: &str, - node_name: &str, - instance_name: &str, - instance_namespace: &str, - desired_device_usage_kind: DeviceUsageKind, - kube_interface: Arc, -) -> Result<(), Status> { - let mut instance: InstanceSpec; - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - // Grab latest instance - match kube_interface - .find_instance(instance_name, instance_namespace) - .await - { - Ok(instance_object) => instance = instance_object.spec, - Err(_) => { - trace!( - "try_update_instance_device_usage - could not find Instance {}", - instance_name - ); - return Err(Status::new( - Code::Unknown, - format!("Could not find Instance {}", instance_name), - )); - } - } - - // Update the instance to reserve this slot for this node iff it is available and not already reserved for this node. 
- let current_device_usage_string = instance.device_usage.get(device_usage_id); - if current_device_usage_string.is_none() { - // No corresponding id found - trace!( - "try_update_instance_device_usage - could not find {} id in device_usage", - device_usage_id - ); - return Err(Status::new( - Code::Unknown, - "Could not find device usage slot", - )); - } - - let current_device_usage = NodeUsage::from_str(current_device_usage_string.unwrap()) - .map_err(|_| { - Status::new( - Code::Unknown, - format!( - "Fails to parse {} to DeviceUsage ", - current_device_usage_string.unwrap() - ), - ) - })?; - // Call get_device_usage_state to check current device usage to see if the slot can be reserved. - // A device usage slot can be reserved if it's free or already reserved by this node and the desired usage kind matches. - // For slots owned by this node, get_device_usage_state returns ReservedByConfiguration or ReservedByInstance. - // For slots owned by other nodes (by Configuration or Instance), get_device_usage_state returns ReservedByOtherNode. - match get_device_usage_state(¤t_device_usage, node_name) { - DeviceUsageStatus::Free => { - let new_device_usage = NodeUsage::create(&desired_device_usage_kind, node_name) - .map_err(|e| { - Status::new( - Code::Unknown, - format!("Fails to create DeviceUsage - {}", e), - ) - })?; - instance - .device_usage - .insert(device_usage_id.to_string(), new_device_usage.to_string()); - - if let Err(e) = kube_interface - .update_instance(&instance, instance_name, instance_namespace) - .await - { - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - trace!("try_update_instance_device_usage - update_instance returned error [{}] after max tries ... 
returning error", e); - return Err(Status::new(Code::Unknown, "Could not update Instance")); - } - random_delay().await; - } else { - return Ok(()); - } - } - DeviceUsageStatus::ReservedByConfiguration(_) => { - if matches!(desired_device_usage_kind, DeviceUsageKind::Configuration(_)) { - return Ok(()); - } else { - return Err(Status::new( - Code::Unknown, - format!( - "Requested device kind {:?} already in use by DeviceUsageKind::Configuration", - desired_device_usage_kind, - ), - )); - } - } - DeviceUsageStatus::ReservedByInstance => { - if matches!(desired_device_usage_kind, DeviceUsageKind::Instance) { - return Ok(()); - } else { - return Err(Status::new( - Code::Unknown, - format!( - "Requested device kind {:?} already in use by DeviceUsageKind::Instance", - desired_device_usage_kind, - ), - )); - } - } - DeviceUsageStatus::ReservedByOtherNode => { - trace!("try_update_instance_device_usage - request for device slot {} previously claimed by a diff node {} than this one {} ... indicates the device on THIS node must be marked unhealthy, invoking ListAndWatch ... returning failure, next scheduling should succeed!", - device_usage_id, current_device_usage.get_node_name(), node_name); - return Err(Status::new( - Code::Unknown, - format!( - "Requested device kind {:?} already in use by other nodes", - desired_device_usage_kind, - ), - )); - } - DeviceUsageStatus::Unknown => { - trace!( - "try_update_instance_device_usage - request for device slot {} status unknown!", - device_usage_id - ); - return Err(Status::new( - Code::Unknown, - "Requested device usage status unknown", - )); - } - }; - } - Ok(()) -} - -/// This sets the volume mounts and environment variables according to the instance's `DiscoveryHandler`. 
-fn build_container_allocate_response( - broker_properties: HashMap, - annotations: HashMap, - devices: &Vec, -) -> v1beta1::ContainerAllocateResponse { - let mut total_mounts = Vec::new(); - let mut total_device_specs = Vec::new(); - for device in devices { - // Cast v0 discovery Mount and DeviceSpec types to v1beta1 DevicePlugin types - let mounts: Vec = device - .mounts - .clone() - .into_iter() - .map(|mount| Mount { - container_path: mount.container_path, - host_path: mount.host_path, - read_only: mount.read_only, - }) - .collect(); - total_mounts.extend(mounts); - - let device_specs: Vec = device - .device_specs - .clone() - .into_iter() - .map(|device_spec| DeviceSpec { - container_path: device_spec.container_path, - host_path: device_spec.host_path, - permissions: device_spec.permissions, - }) - .collect(); - total_device_specs.extend(device_specs); - } - // Create response, setting environment variables to be an instance's properties. - v1beta1::ContainerAllocateResponse { - annotations, - mounts: total_mounts, - devices: total_device_specs, - envs: broker_properties, - } -} - -/// Try to find Instance CRD for this instance or create one and add it to the global InstanceMap -/// If a Config does not exist for this instance, return error. -/// This is most likely caused by deletion of a Config right after adding it, in which case -/// `handle_config_delete` fails to delete this instance because kubelet has yet to call `list_and_watch` -async fn try_create_instance( - dps: Arc, - instance_dp: &InstanceDevicePlugin, - kube_interface: Arc, -) -> Result<(), anyhow::Error> { - // Make sure Configuration exists for instance - if let Err(e) = kube_interface - .find_configuration(&dps.config_name, &dps.config_namespace) - .await - { - error!( - "try_create_instance - no Configuration for device {} ... 
returning error", - dps.instance_name - ); - return Err(e); - } - - let device_usage: std::collections::HashMap = (0..dps.config.capacity) - .map(|x| { - ( - format!("{}-{}", dps.instance_name, x), - NodeUsage::default().to_string(), - ) - }) - .collect(); - let instance = InstanceSpec { - configuration_name: dps.config_name.clone(), - shared: instance_dp.shared, - nodes: vec![dps.node_name.clone()], - device_usage, - broker_properties: get_all_broker_properties( - &dps.config.broker_properties, - &instance_dp.device.properties, - ), - }; - - // Try up to MAX_INSTANCE_UPDATE_TRIES to create or update instance, breaking on success - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - // First check if instance already exists - match kube_interface - .find_instance(&dps.instance_name, &dps.config_namespace) - .await - { - Ok(mut instance_object) => { - trace!( - "try_create_instance - discovered Instance {} already created", - dps.instance_name - ); - - // Check if instance's node list already contains this node, possibly due to device plugin failure and restart - if !instance_object.spec.nodes.contains(&dps.node_name) { - instance_object.spec.nodes.push(dps.node_name.clone()); - match kube_interface - .update_instance( - &instance_object.spec, - &instance_object.metadata.name.unwrap(), - &dps.config_namespace, - ) - .await - { - Ok(()) => { - trace!( - "try_create_instance - updated Instance {} to include {}", - dps.instance_name, - dps.node_name - ); - break; - } - Err(e) => { - trace!("try_create_instance - call to update_instance returned with error {} on try # {} of {}", e, x, MAX_INSTANCE_UPDATE_TRIES); - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - return Err(e); - } - } - }; - } else { - break; - } - } - Err(_) => { - match kube_interface - .create_instance( - &instance, - &dps.instance_name, - &dps.config_namespace, - &dps.config_name, - &dps.config_uid, - ) - .await - { - Ok(()) => { - trace!( - "try_create_instance - created Instance with name {}", - 
dps.instance_name - ); - break; - } - Err(e) => { - trace!("try_create_instance - couldn't create instance with error {} on try # {} of {}", e, x, MAX_INSTANCE_UPDATE_TRIES); - if x == MAX_INSTANCE_UPDATE_TRIES - 1 { - return Err(e); - } - } - } - } - } - random_delay().await; - } - - // Successfully created or updated instance. Add it to instance_map. - dps.device_plugin_context.write().await.instances.insert( - dps.instance_name.clone(), - InstanceInfo { - list_and_watch_message_sender: dps.list_and_watch_message_sender.clone(), - connectivity_status: InstanceConnectivityStatus::Online, - instance_id: instance_dp.instance_id.clone(), - device: instance_dp.device.clone(), - }, - ); - - Ok(()) -} - -/// This sends message to end `list_and_watch` and removes instance from InstanceMap. -/// Called when an instance has been offline for too long. -pub async fn terminate_device_plugin_service( - instance_name: &str, - device_plugin_context: Arc>, -) -> Result<(), Box> { - let mut device_plugin_context = device_plugin_context.write().await; - info!( - "terminate_device_plugin_service -- forcing list_and_watch to end for Instance {}", - instance_name - ); - device_plugin_context - .instances - .get(instance_name) - .unwrap() - .list_and_watch_message_sender - .send(ListAndWatchMessageKind::End) - .unwrap(); - - trace!( - "terminate_device_plugin_service -- removing Instance {} from instance_map", - instance_name - ); - device_plugin_context.instances.remove(instance_name); - Ok(()) -} - -/// This creates a Configuration's unique name -pub fn get_device_configuration_name(config_name: &str) -> String { - config_name.to_string().replace(['.', '/'], "-") -} - -/// This creates an Instance's unique name -pub fn get_device_instance_name(id: &str, config_name: &str) -> String { - format!("{}-{}", config_name, &id) - .replace('.', "-") - .replace('/', "-") -} - -// Aggregate a Configuration and Device's properties so they can be displayed in an Instance and injected into brokers 
as environment variables. -pub fn get_all_broker_properties( - configuration_properties: &HashMap, - device_properties: &HashMap, -) -> HashMap { - configuration_properties - .clone() - .into_iter() - .chain(device_properties.clone()) - .collect::>() -} - -#[cfg(test)] -mod device_plugin_service_tests { - use super::*; - use akri_shared::akri::configuration::Configuration; - use akri_shared::{ - akri::instance::{Instance, InstanceSpec}, - k8s::MockKubeInterface, - }; - use std::{ - fs, - io::{Error, ErrorKind}, - }; - - enum NodeName { - ThisNode, - OtherNode, - } - - enum DevicePluginKind { - Configuration, - Instance, - } - - // Need to be kept alive during tests - struct DevicePluginServiceReceivers { - configuration_list_and_watch_message_receiver: broadcast::Receiver, - instance_list_and_watch_message_receiver: broadcast::Receiver, - } - - fn configure_find_instance( - mock: &mut MockKubeInterface, - result_file: &'static str, - instance_name: String, - instance_namespace: String, - device_usage_node: String, - node_name: NodeName, - expected_calls: usize, - ) { - let instance_name_clone = instance_name.clone(); - mock.expect_find_instance() - .times(expected_calls) - .withf(move |name: &str, namespace: &str| { - namespace == instance_namespace && name == instance_name - }) - .returning(move |_, _| { - let mut instance_json = - fs::read_to_string(result_file).expect("Unable to read file"); - let host_name = match node_name { - NodeName::ThisNode => "node-a", - NodeName::OtherNode => "other", - }; - instance_json = instance_json.replace("node-a", host_name); - instance_json = instance_json.replace("config-a-b494b6", &instance_name_clone); - instance_json = - instance_json.replace("\":\"\"", &format!("\":\"{}\"", device_usage_node)); - let instance: Instance = serde_json::from_str(&instance_json).unwrap(); - Ok(instance) - }); - } - - fn setup_find_instance_with_mock_instances( - mock: &mut MockKubeInterface, - instance_namespace: &str, - mock_instances: 
Vec<(String, Instance)>, - ) { - for (instance_name, kube_instance) in mock_instances { - let instance_namespace = instance_namespace.to_string(); - mock.expect_find_instance() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == instance_namespace && name == instance_name - }) - .returning(move |_, _| Ok(kube_instance.clone())); - } - } - - fn setup_find_instance_with_not_found_err( - mock: &mut MockKubeInterface, - instance_name: &str, - instance_namespace: &str, - ) { - let instance_name = instance_name.to_string(); - let instance_namespace = instance_namespace.to_string(); - mock.expect_find_instance() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == instance_namespace && name == instance_name - }) - .returning(move |_, _| Err(get_kube_not_found_error().into())); - } - - fn create_device_plugin_service( - device_plugin_kind: DevicePluginKind, - connectivity_status: InstanceConnectivityStatus, - add_to_instance_map: bool, - ) -> (DevicePluginService, DevicePluginServiceReceivers) { - let path_to_config = "../test/yaml/config-a.yaml"; - let instance_id = "b494b6"; - let kube_akri_config_yaml = - fs::read_to_string(path_to_config).expect("Unable to read file"); - let kube_akri_config: Configuration = serde_yaml::from_str(&kube_akri_config_yaml).unwrap(); - let config_name = kube_akri_config.metadata.name.as_ref().unwrap(); - let device_instance_name = get_device_instance_name(instance_id, config_name); - let ( - configuration_list_and_watch_message_sender, - configuration_list_and_watch_message_receiver, - ) = broadcast::channel(4); - let (instance_list_and_watch_message_sender, instance_list_and_watch_message_receiver) = - broadcast::channel(4); - let (server_ender_sender, _) = mpsc::channel(1); - - let device = Device { - id: "n/a".to_string(), - properties: HashMap::from([( - "DEVICE_LOCATION_INFO".to_string(), - "endpoint".to_string(), - )]), - mounts: Vec::new(), - device_specs: Vec::new(), - }; - let mut instances 
= HashMap::new(); - if add_to_instance_map { - let instance_info: InstanceInfo = InstanceInfo { - list_and_watch_message_sender: instance_list_and_watch_message_sender.clone(), - connectivity_status, - instance_id: instance_id.to_string(), - device: device.clone(), - }; - instances.insert(device_instance_name.clone(), instance_info); - } - let device_plugin_context = Arc::new(RwLock::new(DevicePluginContext { - usage_update_message_sender: Some(configuration_list_and_watch_message_sender.clone()), - instances, - })); - - let (list_and_watch_message_sender, device_plugin_behavior) = match device_plugin_kind { - DevicePluginKind::Instance => ( - instance_list_and_watch_message_sender, - DevicePluginBehavior::Instance(InstanceDevicePlugin { - instance_id: instance_id.to_string(), - shared: false, - device, - }), - ), - DevicePluginKind::Configuration => ( - configuration_list_and_watch_message_sender, - DevicePluginBehavior::Configuration(ConfigurationDevicePlugin::default()), - ), - }; - let dps = DevicePluginService { - instance_name: device_instance_name, - config: kube_akri_config.spec.clone(), - config_name: config_name.to_string(), - config_uid: kube_akri_config.metadata.uid.unwrap(), - config_namespace: kube_akri_config.metadata.namespace.unwrap(), - node_name: "node-a".to_string(), - device_plugin_context, - list_and_watch_message_sender, - server_ender_sender, - device_plugin_behavior, - }; - ( - dps, - DevicePluginServiceReceivers { - configuration_list_and_watch_message_receiver, - instance_list_and_watch_message_receiver, - }, - ) - } - - fn get_kube_not_found_error() -> kube::Error { - // Mock error thrown when instance not found - kube::Error::Api(kube::error::ErrorResponse { - status: "Failure".to_string(), - message: "instances.akri.sh \"akri-blah-901a7b\" not found".to_string(), - reason: "NotFound".to_string(), - code: k8s::ERROR_NOT_FOUND, - }) - } - - // Tests that configuration device plugin names are formatted correctly - #[test] - fn 
test_get_device_configuration_name() { - let names_to_test = [ - ("no_dash_no_dot", "no_dash_no_dot"), - ("usb/camera", "usb-camera"), - ("another//camera", "another--camera"), - ("name.with.dot", "name-with-dot"), - ("name.with..dots...", "name-with--dots---"), - ]; - names_to_test.iter().for_each(|(test, expected)| { - println!("{:?}", (test, expected)); - assert_eq!(get_device_configuration_name(test), expected.to_string()); - }); - } - - // Tests that instance names are formatted correctly - #[test] - fn test_get_device_instance_name() { - let instance_name1: String = "/dev/video0".to_string(); - let instance_name2: String = "10.1.2.3".to_string(); - assert_eq!( - "usb-camera--dev-video0", - get_device_instance_name(&instance_name1, "usb-camera") - ); - assert_eq!( - "ip-camera-10-1-2-3".to_string(), - get_device_instance_name(&instance_name2, "ip-camera") - ); - } - - // Test that a Device and Configuration's properties are aggregated and that - // a Device property overwrites a Configuration's. 
- #[test] - fn test_get_all_broker_properties() { - let mut device_properties = HashMap::new(); - device_properties.insert("ENDPOINT".to_string(), "123".to_string()); - device_properties.insert("OVERWRITE".to_string(), "222".to_string()); - let mut configuration_properties = HashMap::new(); - configuration_properties.insert("USE HD".to_string(), "true".to_string()); - configuration_properties.insert("OVERWRITE".to_string(), "111".to_string()); - let all_properties = - get_all_broker_properties(&configuration_properties, &device_properties); - assert_eq!(all_properties.len(), 3); - assert_eq!(all_properties.get("ENDPOINT").unwrap(), "123"); - assert_eq!(all_properties.get("USE HD").unwrap(), "true"); - assert_eq!(all_properties.get("OVERWRITE").unwrap(), "222"); - } - - // Test correct device usage status is returned when a device usage slot is used on the same node - #[test] - fn test_get_device_usage_state_same_node() { - let _ = env_logger::builder().is_test(true).try_init(); - let this_node = "node-a"; - let vdev_id = "vdev_0"; - // Free - assert_eq!( - get_device_usage_state( - &NodeUsage::create(&DeviceUsageKind::Free, "").unwrap(), - this_node - ), - DeviceUsageStatus::Free - ); - // Used by Configuration - assert_eq!( - get_device_usage_state( - &NodeUsage::create( - &DeviceUsageKind::Configuration(vdev_id.to_string()), - this_node - ) - .unwrap(), - this_node - ), - DeviceUsageStatus::ReservedByConfiguration(vdev_id.to_string()) - ); - // Used by Instance - assert_eq!( - get_device_usage_state( - &NodeUsage::create(&DeviceUsageKind::Instance, this_node).unwrap(), - this_node - ), - DeviceUsageStatus::ReservedByInstance - ); - } - - // Test DeviceUsageStatus::ReservedByOtherNode is returned when a device usage slot is used on a different node - #[test] - fn test_get_device_usage_state_different_node() { - let _ = env_logger::builder().is_test(true).try_init(); - let this_node = "node-a"; - let that_node = "node-b"; - let vdev_id = "vdev_0"; - // Free - 
assert_eq!( - get_device_usage_state( - &NodeUsage::create(&DeviceUsageKind::Free, "").unwrap(), - this_node - ), - DeviceUsageStatus::Free - ); - // Used by Configuration - assert_eq!( - get_device_usage_state( - &NodeUsage::create( - &DeviceUsageKind::Configuration(vdev_id.to_string()), - that_node - ) - .unwrap(), - this_node - ), - DeviceUsageStatus::ReservedByOtherNode - ); - // Used by Instance - assert_eq!( - get_device_usage_state( - &NodeUsage::create(&DeviceUsageKind::Instance, that_node).unwrap(), - this_node - ), - DeviceUsageStatus::ReservedByOtherNode - ); - } - - fn configure_find_configuration( - mock: &mut MockKubeInterface, - config_name: String, - config_namespace: String, - ) { - mock.expect_find_configuration() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == config_namespace && name == config_name - }) - .returning(move |_, _| { - let path_to_config = "../test/yaml/config-a.yaml"; - let kube_akri_config_yaml = - fs::read_to_string(path_to_config).expect("Unable to read file"); - let kube_akri_config: Configuration = - serde_yaml::from_str(&kube_akri_config_yaml).unwrap(); - Ok(kube_akri_config) - }); - } - - // Tests that try_create_instance creates an instance - #[tokio::test] - async fn test_try_create_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - false, - ); - let mut mock = MockKubeInterface::new(); - configure_find_configuration( - &mut mock, - device_plugin_service.config_name.clone(), - device_plugin_service.config_namespace.clone(), - ); - let instance_name = device_plugin_service.instance_name.clone(); - let config_name = device_plugin_service.config_name.clone(); - let config_uid = device_plugin_service.config_uid.clone(); - let config_namespace = device_plugin_service.config_namespace.clone(); - 
mock.expect_find_instance() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == config_namespace && name == instance_name - }) - .returning(move |_, _| Err(get_kube_not_found_error().into())); - let instance_name = device_plugin_service.instance_name.clone(); - let config_namespace = device_plugin_service.config_namespace.clone(); - mock.expect_create_instance() - .withf(move |instance, name, namespace, owner_name, owner_uid| { - namespace == config_namespace - && name == instance_name - && instance.nodes.contains(&"node-a".to_string()) - && owner_name == config_name - && owner_uid == config_uid - }) - .returning(move |_, _, _, _, _| Ok(())); - - let dps = Arc::new(device_plugin_service); - if let DevicePluginBehavior::Instance(instance_device_plugin) = &dps.device_plugin_behavior - { - assert!( - try_create_instance(dps.clone(), instance_device_plugin, Arc::new(mock)) - .await - .is_ok() - ); - assert!(dps - .device_plugin_context - .read() - .await - .instances - .contains_key(&dps.instance_name)); - } else { - panic!("Incorrect device plugin kind configured"); - } - } - - // Tests that try_create_instance updates already existing instance with this node - #[tokio::test] - async fn test_try_create_instance_already_created() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - false, - ); - let mut mock = MockKubeInterface::new(); - configure_find_configuration( - &mut mock, - device_plugin_service.config_name.clone(), - device_plugin_service.config_namespace.clone(), - ); - configure_find_instance( - &mut mock, - "../test/json/local-instance.json", - device_plugin_service.instance_name.clone(), - device_plugin_service.config_namespace.clone(), - String::new(), - NodeName::OtherNode, - 1, - ); - let instance_name = device_plugin_service.instance_name.clone(); - let 
config_namespace = device_plugin_service.config_namespace.clone(); - mock.expect_update_instance() - .times(1) - .withf(move |instance, name, namespace| { - namespace == config_namespace - && name == instance_name - && instance.nodes.contains(&"node-a".to_string()) - }) - .returning(move |_, _, _| Ok(())); - - let dps = Arc::new(device_plugin_service); - if let DevicePluginBehavior::Instance(instance_device_plugin) = &dps.device_plugin_behavior - { - assert!( - try_create_instance(dps.clone(), instance_device_plugin, Arc::new(mock)) - .await - .is_ok() - ); - assert!(dps - .device_plugin_context - .read() - .await - .instances - .contains_key(&dps.instance_name)); - } else { - panic!("Incorrect device plugin kind configured"); - } - } - - // Test when instance already created and already contains this node. - // Should find the instance but not update it. - #[tokio::test] - async fn test_try_create_instance_already_created_no_update() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - false, - ); - let mut mock = MockKubeInterface::new(); - configure_find_configuration( - &mut mock, - device_plugin_service.config_name.clone(), - device_plugin_service.config_namespace.clone(), - ); - configure_find_instance( - &mut mock, - "../test/json/local-instance.json", - device_plugin_service.instance_name.clone(), - device_plugin_service.config_namespace.clone(), - String::new(), - NodeName::ThisNode, - 1, - ); - let dps = Arc::new(device_plugin_service); - if let DevicePluginBehavior::Instance(instance_device_plugin) = &dps.device_plugin_behavior - { - assert!( - try_create_instance(dps.clone(), instance_device_plugin, Arc::new(mock)) - .await - .is_ok() - ); - assert!(dps - .device_plugin_context - .read() - .await - .instances - .contains_key(&dps.instance_name)); - } else { - panic!("Incorrect 
device plugin kind configured"); - } - } - - // Tests that try_create_instance returns error when trying to create an Instance for a Config that DNE - #[tokio::test] - async fn test_try_create_instance_no_config() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - false, - ); - let config_name = device_plugin_service.config_name.clone(); - let config_namespace = device_plugin_service.config_namespace.clone(); - let mut mock = MockKubeInterface::new(); - mock.expect_find_configuration() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == config_namespace && name == config_name - }) - .returning(move |_, _| { - let error = Error::new(ErrorKind::InvalidInput, "Configuration doesn't exist"); - Err(error.into()) - }); - let dps = Arc::new(device_plugin_service); - if let DevicePluginBehavior::Instance(instance_device_plugin) = &dps.device_plugin_behavior - { - assert!( - try_create_instance(dps.clone(), instance_device_plugin, Arc::new(mock)) - .await - .is_err() - ); - } else { - panic!("Incorrect device plugin kind configured"); - } - } - - // Tests that try_create_instance error - #[tokio::test] - async fn test_try_create_instance_error() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - false, - ); - let mut mock = MockKubeInterface::new(); - configure_find_configuration( - &mut mock, - device_plugin_service.config_name.clone(), - device_plugin_service.config_namespace.clone(), - ); - let instance_name = device_plugin_service.instance_name.clone(); - let config_name = device_plugin_service.config_name.clone(); - let config_uid = device_plugin_service.config_uid.clone(); - let config_namespace = 
device_plugin_service.config_namespace.clone(); - mock.expect_find_instance() - .times(MAX_INSTANCE_UPDATE_TRIES as usize) - .withf(move |name: &str, namespace: &str| { - namespace == config_namespace && name == instance_name - }) - .returning(move |_, _| Err(get_kube_not_found_error().into())); - let instance_name = device_plugin_service.instance_name.clone(); - let config_namespace = device_plugin_service.config_namespace.clone(); - mock.expect_create_instance() - .times(MAX_INSTANCE_UPDATE_TRIES as usize) - .withf(move |instance, name, namespace, owner_name, owner_uid| { - namespace == config_namespace - && name == instance_name - && instance.nodes.contains(&"node-a".to_string()) - && owner_name == config_name - && owner_uid == config_uid - }) - .returning(move |_, _, _, _, _| Err(anyhow::anyhow!("failure"))); - - let dps = Arc::new(device_plugin_service); - if let DevicePluginBehavior::Instance(instance_device_plugin) = &dps.device_plugin_behavior - { - assert!( - try_create_instance(dps.clone(), instance_device_plugin, Arc::new(mock)) - .await - .is_err() - ); - assert!(!dps - .device_plugin_context - .read() - .await - .instances - .contains_key(&dps.instance_name)); - } else { - panic!("Incorrect device plugin kind configured"); - } - } - - // Tests list_and_watch by creating DevicePluginService and DevicePlugin client (emulating kubelet) - #[tokio::test] - async fn test_internal_list_and_watch() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - false, - ); - let list_and_watch_message_sender = - device_plugin_service.list_and_watch_message_sender.clone(); - let mut mock = MockKubeInterface::new(); - configure_find_configuration( - &mut mock, - device_plugin_service.config_name.clone(), - device_plugin_service.config_namespace.clone(), - ); - let instance_name = 
device_plugin_service.instance_name.clone(); - let config_name = device_plugin_service.config_name.clone(); - let config_uid = device_plugin_service.config_uid.clone(); - let config_namespace = device_plugin_service.config_namespace.clone(); - mock.expect_find_instance() - .times(2) - .withf(move |name: &str, namespace: &str| { - namespace == config_namespace && name == instance_name - }) - .returning(move |_, _| Err(get_kube_not_found_error().into())); - let instance_name = device_plugin_service.instance_name.clone(); - let config_namespace = device_plugin_service.config_namespace.clone(); - mock.expect_create_instance() - .withf(move |instance, name, namespace, owner_name, owner_uid| { - namespace == config_namespace - && name == instance_name - && instance.nodes.contains(&"node-a".to_string()) - && owner_name == config_name - && owner_uid == config_uid - }) - .returning(move |_, _, _, _, _| Ok(())); - - let stream = device_plugin_service - .internal_list_and_watch(Arc::new(mock)) - .await - .unwrap() - .into_inner(); - list_and_watch_message_sender - .send(ListAndWatchMessageKind::End) - .unwrap(); - if let Ok(list_and_watch_response) = stream.into_inner().recv().await.unwrap() { - assert_eq!( - list_and_watch_response.devices[0].id, - format!("{}-0", device_plugin_service.instance_name) - ); - }; - } - - fn setup_internal_allocate_tests( - mock: &mut MockKubeInterface, - device_plugin_service: &DevicePluginService, - formerly_allocated_node: String, - newly_allocated_node: Option, - ) -> Request { - let device_usage_id_slot = format!("{}-0", device_plugin_service.instance_name); - let device_usage_id_slot_2 = device_usage_id_slot.clone(); - configure_find_instance( - mock, - "../test/json/local-instance.json", - device_plugin_service.instance_name.clone(), - device_plugin_service.config_namespace.clone(), - formerly_allocated_node, - NodeName::ThisNode, - 1, - ); - if let Some(new_node) = newly_allocated_node { - mock.expect_update_instance() - .times(1) - 
.withf(move |instance_to_update: &InstanceSpec, _, _| { - instance_to_update - .device_usage - .get(&device_usage_id_slot) - .unwrap() - == &new_node - }) - .returning(move |_, _, _| Ok(())); - } - let devices_i_ds = vec![device_usage_id_slot_2]; - let container_requests = vec![v1beta1::ContainerAllocateRequest { devices_i_ds }]; - Request::new(AllocateRequest { container_requests }) - } - - // Test that environment variables set in a Configuration will be set in brokers - #[tokio::test] - async fn test_internal_allocate_env_vars() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - true, - ); - let node_name = device_plugin_service.node_name.clone(); - let mut mock = MockKubeInterface::new(); - let request = setup_internal_allocate_tests( - &mut mock, - &device_plugin_service, - String::new(), - Some(node_name), - ); - let broker_envs = device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .unwrap() - .into_inner() - .container_responses[0] - .envs - .clone(); - assert_eq!(broker_envs.get("RESOLUTION_WIDTH").unwrap(), "800"); - assert_eq!(broker_envs.get("RESOLUTION_HEIGHT").unwrap(), "600"); - // Check that Device properties are set as env vars by checking for - // property of device created in `create_device_plugin_service` - assert_eq!( - broker_envs.get("DEVICE_LOCATION_INFO_B494B6").unwrap(), - "endpoint" - ); - assert!(device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .try_recv() - .is_err()); - assert_eq!( - device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } - - // Test when device_usage[id] == "" - // internal_allocate should set device_usage[id] = m.nodeName, return - #[tokio::test] - async fn 
test_internal_allocate_success() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - true, - ); - let node_name = device_plugin_service.node_name.clone(); - let mut mock = MockKubeInterface::new(); - let request = setup_internal_allocate_tests( - &mut mock, - &device_plugin_service, - String::new(), - Some(node_name), - ); - assert!(device_plugin_service - .internal_allocate(request, Arc::new(mock),) - .await - .is_ok()); - assert!(device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .try_recv() - .is_err()); - assert_eq!( - device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } - - // Test when device_usage[id] == self.nodeName - // Expected behavior: internal_allocate should keep device_usage[id] == self.nodeName and - // instance should not be updated - #[tokio::test] - async fn test_internal_allocate_deallocate() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - true, - ); - let mut mock = MockKubeInterface::new(); - let request = setup_internal_allocate_tests( - &mut mock, - &device_plugin_service, - "node-a".to_string(), - None, - ); - assert!(device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .is_ok()); - assert!(device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .try_recv() - .is_err()); - assert_eq!( - device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } - - // Tests when device_usage[id] == - // Expected behavior: 
should invoke list_and_watch, and return error - #[tokio::test] - async fn test_internal_allocate_taken() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - true, - ); - let device_usage_id_slot = format!("{}-0", device_plugin_service.instance_name); - let mut mock = MockKubeInterface::new(); - configure_find_instance( - &mut mock, - "../test/json/local-instance.json", - device_plugin_service.instance_name.clone(), - device_plugin_service.config_namespace.clone(), - "other".to_string(), - NodeName::ThisNode, - 1, - ); - let devices_i_ds = vec![device_usage_id_slot]; - let container_requests = vec![v1beta1::ContainerAllocateRequest { devices_i_ds }]; - let requests = Request::new(AllocateRequest { container_requests }); - match device_plugin_service - .internal_allocate(requests, Arc::new(mock)) - .await - { - Ok(_) => panic!( - "internal allocate is expected to fail due to requested device already being used" - ), - Err(e) => assert_eq!( - e.message(), - "Requested device kind Instance already in use by other nodes" - ), - } - assert_eq!( - device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - assert!(device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .try_recv() - .is_err()); - } - - // Tests when instance does not have the requested device usage id - // Expected behavior: should invoke list_and_watch, and return error - #[tokio::test] - async fn test_internal_allocate_no_id() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_device_plugin_service( - DevicePluginKind::Instance, - InstanceConnectivityStatus::Online, - true, - ); - let device_usage_id_slot = format!("{}-100", 
device_plugin_service.instance_name); - let mut mock = MockKubeInterface::new(); - configure_find_instance( - &mut mock, - "../test/json/local-instance.json", - device_plugin_service.instance_name.clone(), - device_plugin_service.config_namespace.clone(), - "other".to_string(), - NodeName::ThisNode, - 1, - ); - let devices_i_ds = vec![device_usage_id_slot]; - let container_requests = vec![v1beta1::ContainerAllocateRequest { devices_i_ds }]; - let requests = Request::new(AllocateRequest { container_requests }); - match device_plugin_service - .internal_allocate(requests, Arc::new(mock)) - .await - { - Ok(_) => { - panic!("internal allocate is expected to fail due to invalid device usage slot") - } - Err(e) => assert_eq!(e.message(), "Could not find device usage slot"), - } - assert_eq!( - device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - assert!(device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .try_recv() - .is_err()); - } - - // Tests correct device usage is returned when an Instance is found - // Expected behavior: should return correct device usage state for all usage slots - #[tokio::test] - async fn test_get_instance_device_usage_state() { - let _ = env_logger::builder().is_test(true).try_init(); - let node_name = "node-a"; - let instance_name = "instance-1"; - let instance_namespace = "test-namespace"; - let mock_device_usages = vec![(DeviceUsageKind::Free, "".to_string()); 5]; - let capacity = mock_device_usages.len() as i32; - let mut kube_instance_builder = KubeInstanceBuilder::new(instance_name, instance_namespace); - kube_instance_builder.add_node(node_name); - kube_instance_builder.add_device_usages(instance_name, mock_device_usages); - let kube_instance = kube_instance_builder.build(); - let mock_instances = vec![(instance_name.to_string(), kube_instance)]; - let mut mock = MockKubeInterface::new(); - 
setup_find_instance_with_mock_instances(&mut mock, instance_namespace, mock_instances); - - let device_usage_states = get_instance_device_usage_states( - node_name, - instance_name, - instance_namespace, - &capacity, - Arc::new(mock), - ) - .await; - assert!(device_usage_states - .into_iter() - .all(|(_, v)| { v == DeviceUsageStatus::Free })); - } - - // Tests correct device usage is returned when an Instance is not found - // Expected behavior: should return DeviceUsageStatus::Unknown for all usage slots - #[tokio::test] - async fn test_get_instance_device_usage_state_no_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - let node_name = "node-a"; - let instance_name = "instance-1"; - let instance_namespace = "test-namespace"; - let capacity = 5i32; - let mut mock = MockKubeInterface::new(); - setup_find_instance_with_not_found_err(&mut mock, instance_name, instance_namespace); - - let device_usage_states = get_instance_device_usage_states( - node_name, - instance_name, - instance_namespace, - &capacity, - Arc::new(mock), - ) - .await; - assert!(device_usage_states - .into_iter() - .all(|(_, v)| { v == DeviceUsageStatus::Unknown })); - } - - fn create_configuration_device_plugin_service( - connectivity_status: InstanceConnectivityStatus, - add_to_instance_map: bool, - ) -> (DevicePluginService, DevicePluginServiceReceivers) { - let (dps, receivers) = create_device_plugin_service( - DevicePluginKind::Configuration, - connectivity_status, - add_to_instance_map, - ); - - (dps, receivers) - } - - // Tests 0 virtual device id is returned if instance not found in InstanceConfig - #[tokio::test] - async fn test_get_available_virtual_devices_no_instance_in_instance_map() { - let _ = env_logger::builder().is_test(true).try_init(); - let this_node = "node-a"; - let instance_namespace = "test-namespace"; - let capacity = 5; - let device_plugin_context = DevicePluginContextBuilder::new().build(); - let mock = MockKubeInterface::new(); - - let result = 
get_available_virtual_devices( - &device_plugin_context.instances, - this_node, - instance_namespace, - &capacity, - Arc::new(mock), - ) - .await; - assert!(result.is_empty()); - } - - // Tests 0 virtual device id is returned if instance not found from kube find_instance - #[tokio::test] - async fn test_get_available_virtual_devices_no_kube_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - let this_node = "node-a"; - let instance_name = "instance-1"; - let instance_namespace = "test-namespace"; - let capacity = 5; - let mut device_plugin_context_builder = DevicePluginContextBuilder::new(); - device_plugin_context_builder - .add_instance(instance_name, &InstanceConnectivityStatus::Online); - let device_plugin_context = device_plugin_context_builder.build(); - let mut mock = MockKubeInterface::new(); - setup_find_instance_with_not_found_err(&mut mock, instance_name, instance_namespace); - - let result = get_available_virtual_devices( - &device_plugin_context.instances, - this_node, - instance_namespace, - &capacity, - Arc::new(mock), - ) - .await; - assert!(result.is_empty()); - } - - // Tests 0 virtual device id is returned if all slots are taken by other node - #[tokio::test] - async fn test_get_available_virtual_devices_all_taken_by_other_node() { - let this_node = "node-a"; - let other_node = "other"; - let mock_device_usages = vec![(DeviceUsageKind::Instance, other_node.to_string()); 5]; - - let result = run_get_available_virtual_devices_test(this_node, mock_device_usages).await; - assert!(result.is_empty()); - } - - // Tests 0 virtual device id is returned if all slots are taken by Instance on the same node - #[tokio::test] - async fn test_get_available_virtual_devices_all_taken_by_instance() { - let this_node = "node-a"; - let mock_device_usages = vec![(DeviceUsageKind::Instance, this_node.to_string()); 5]; - - let result = run_get_available_virtual_devices_test(this_node, mock_device_usages).await; - assert!(result.is_empty()); - } - - 
// Tests 1 virtual device id is returned if all slots are free - #[tokio::test] - async fn test_get_available_virtual_devices_all_free() { - let this_node = "node-a"; - let expected_length = 1; - let mock_device_usages = vec![(DeviceUsageKind::Free, "".to_string()); 5]; - - let result = run_get_available_virtual_devices_test(this_node, mock_device_usages).await; - assert_eq!(result.len(), expected_length); - } - - // Tests 1 virtual device id is returned if all slots are taken by Configuration - // with the same vdev_id - #[tokio::test] - async fn test_get_available_virtual_devices_all_taken_by_configuration_same_vdev_id() { - let this_node = "node-a"; - let vdev_ids = ["vdev-a"; 5]; - let expected_length = 1; - let mock_device_usages = vdev_ids - .iter() - .map(|id| { - ( - DeviceUsageKind::Configuration(id.to_string()), - this_node.to_string(), - ) - }) - .collect::>(); - - let result = run_get_available_virtual_devices_test(this_node, mock_device_usages).await; - assert_eq!(result.len(), expected_length); - } - - // Tests correct virtual device ids are returned if all slots are taken by Configuration - // with different vdev_id - #[tokio::test] - async fn test_get_available_virtual_devices_all_taken_by_configuration() { - let this_node = "node-a"; - let vdev_ids = ["vdev-a", "vdev-b", "vdev-c", "vdev-d", "vdev-e"]; - let expected_length = vdev_ids.len(); - let mock_device_usages = vdev_ids - .iter() - .map(|id| { - ( - DeviceUsageKind::Configuration(id.to_string()), - this_node.to_string(), - ) - }) - .collect::>(); - - let result = run_get_available_virtual_devices_test(this_node, mock_device_usages).await; - assert_eq!(result.len(), expected_length); - } - - // Tests correct virtual device ids are returned if some slots are taken by Configuration - // with different vdev_id - #[tokio::test] - async fn test_get_available_virtual_devices_some_taken_by_configuration() { - let this_node = "node-a"; - let vdev_ids = ["vdev-a", "vdev-b"]; - let expected_length = 2 + 
1; // 2 vdev_ids + 1 free - let mut mock_device_usages = vec![(DeviceUsageKind::Free, "".to_string()); 3]; - mock_device_usages.extend( - vdev_ids - .iter() - .map(|id| { - ( - DeviceUsageKind::Configuration(id.to_string()), - this_node.to_string(), - ) - }) - .collect::>(), - ); - - let result = run_get_available_virtual_devices_test(this_node, mock_device_usages).await; - assert_eq!(result.len(), expected_length); - } - - async fn run_get_available_virtual_devices_test( - node_name: &str, - device_usages: Vec<(DeviceUsageKind, String)>, - ) -> HashSet { - let _ = env_logger::builder().is_test(true).try_init(); - let instance_name = "instance-1"; - let instance_namespace = "test-namespace"; - let capacity = device_usages.len() as i32; - - let mut device_plugin_context_builder = DevicePluginContextBuilder::new(); - device_plugin_context_builder - .add_instance(instance_name, &InstanceConnectivityStatus::Online); - let device_plugin_context = device_plugin_context_builder.build(); - - let mut kube_instance_builder = KubeInstanceBuilder::new(instance_name, instance_namespace); - kube_instance_builder.add_node(node_name); - kube_instance_builder.add_device_usages(instance_name, device_usages); - let kube_instance = kube_instance_builder.build(); - let mock_instances = vec![(instance_name.to_string(), kube_instance)]; - let mut mock = MockKubeInterface::new(); - setup_find_instance_with_mock_instances(&mut mock, instance_namespace, mock_instances); - - get_available_virtual_devices( - &device_plugin_context.instances, - node_name, - instance_namespace, - &capacity, - Arc::new(mock), - ) - .await - } - - #[derive(Default, Clone)] - struct DevicePluginContextBuilder { - device_plugin_context: DevicePluginContext, - } - - impl DevicePluginContextBuilder { - pub fn new() -> Self { - Self::default() - } - - pub fn add_instance( - &mut self, - instance_name: &str, - connectivity_status: &InstanceConnectivityStatus, - ) -> &mut Self { - let (list_and_watch_message_sender, _) 
= broadcast::channel(4); - let instance_info = InstanceInfo { - list_and_watch_message_sender, - connectivity_status: connectivity_status.clone(), - instance_id: format!("{}-instance-id", instance_name), - device: Device { - id: format!("{}-device-id", instance_name), - properties: HashMap::new(), - mounts: Vec::new(), - device_specs: Vec::new(), - }, - }; - self.device_plugin_context - .instances - .insert(instance_name.to_string(), instance_info); - self - } - - pub fn build(&self) -> DevicePluginContext { - self.device_plugin_context.clone() - } - } - - #[derive(Clone)] - struct KubeInstanceBuilder { - name: String, - namespace: String, - configuration_name: String, - nodes: Vec, - shared: bool, - device_usages: HashMap>, - } - - impl KubeInstanceBuilder { - pub fn new(name: &str, namespace: &str) -> Self { - Self { - name: name.to_string(), - namespace: namespace.to_string(), - configuration_name: String::default(), - nodes: Vec::new(), - shared: true, - device_usages: HashMap::new(), - } - } - - pub fn add_node(&mut self, node: &str) -> &mut Self { - self.nodes.push(node.to_string()); - self - } - - pub fn add_device_usage( - &mut self, - instance_name: &str, - device_usage: (DeviceUsageKind, String), - ) -> &mut Self { - self.device_usages - .entry(instance_name.to_string()) - .or_default() - .push(device_usage); - self - } - - pub fn add_device_usages( - &mut self, - instance_name: &str, - device_usages: Vec<(DeviceUsageKind, String)>, - ) -> &mut Self { - self.device_usages - .entry(instance_name.to_string()) - .or_default() - .extend(device_usages); - self - } - - pub fn build(&self) -> Instance { - let instance_json = format!( - r#"{{ - "apiVersion": "akri.sh/v0", - "kind": "Instance", - "metadata": {{ - "name": "{}", - "namespace": "{}", - "uid": "abcdegfh-ijkl-mnop-qrst-uvwxyz012345" - }}, - "spec": {{ - "configurationName": "", - "nodes": [], - "shared": true, - "deviceUsage": {{ - }} - }} - }} - "#, - self.name, self.namespace - ); - let mut instance: 
Instance = serde_json::from_str(&instance_json).unwrap(); - instance.spec.configuration_name = self.configuration_name.clone(); - instance.spec.nodes = self.nodes.clone(); - instance.spec.shared = self.shared; - instance.spec.device_usage = self - .device_usages - .iter() - .flat_map(|(instance_name, usages)| { - usages.iter().enumerate().map(move |(pos, (kind, node))| { - let key = format!("{}-{}", instance_name, pos); - (key, NodeUsage::create(kind, node).unwrap().to_string()) - }) - }) - .collect::>(); - instance - } - } - - // Tests correct virtual devices are returned if all usage slots are free - #[tokio::test] - async fn test_get_virtual_device_resources_all_free() { - let mock_instance_data = HashMap::from([("instance-1", None), ("instance-2", None)]); - let request_vdev_ids = vec!["vdev-a", "vdev-b"]; - - let result = - run_get_virtual_device_resources_test(mock_instance_data, request_vdev_ids.clone()) - .await; - assert_eq!(result.unwrap().len(), request_vdev_ids.len()); - } - - // Tests correct virtual devices are returned if all usage slots taken by Configuration(same vdev id) - #[tokio::test] - async fn test_get_virtual_device_resources_all_taken_by_configuration_same_vdev_id() { - let mock_instance_data = HashMap::from([ - ("instance-1", Some("vdev-a")), - ("instance-2", Some("vdev-b")), - ]); - let request_vdev_ids = vec!["vdev-a", "vdev-b"]; - - let result = - run_get_virtual_device_resources_test(mock_instance_data, request_vdev_ids.clone()) - .await; - assert_eq!(result.unwrap().len(), request_vdev_ids.len()); - } - - // Tests correct virtual devices are returned if all usage slots are free or taken by Configuration(same vdev id) - #[tokio::test] - async fn test_get_virtual_device_resources_free_or_taken_by_configuration_same_vdev_id() { - let mock_instance_data = - HashMap::from([("instance-1", None), ("instance-2", Some("vdev-b"))]); - let request_vdev_ids = vec!["vdev-a", "vdev-b"]; - - let result = - 
run_get_virtual_device_resources_test(mock_instance_data, request_vdev_ids.clone()) - .await; - assert_eq!(result.unwrap().len(), request_vdev_ids.len()); - } - - // Tests get_virtual_device_resources returns err if all usage slots taken by Configuration(different vdev id) - #[tokio::test] - async fn test_get_virtual_device_resources_all_taken_by_configuration_different_vdev_id() { - let mock_instance_data = HashMap::from([ - ("instance-1", Some("other-vdev-a")), - ("instance-2", Some("other-vdev-b")), - ]); - let request_vdev_ids = vec!["vdev-a", "vdev-b"]; - - let result = - run_get_virtual_device_resources_test(mock_instance_data, request_vdev_ids).await; - assert!(result.is_err()); - } - - // Tests get_virtual_device_resources returns err if one instance usage slots taken by Configuration(different vdev id) - #[tokio::test] - async fn test_get_virtual_device_resources_some_taken_by_configuration_different_vdev_id() { - let mock_instance_data = HashMap::from([ - ("instance-1", Some("other-vdev-a")), - ("instance-2", Some("vdev-b")), - ]); - let request_vdev_ids = vec!["vdev-1", "vdev-2"]; - - let result = - run_get_virtual_device_resources_test(mock_instance_data, request_vdev_ids).await; - assert!(result.is_err()); - } - - // Tests get_virtual_device_resources returns err if one instance usage slots taken by Configuration(different vdev id) - #[tokio::test] - async fn test_get_virtual_device_resources_free_or_some_taken_by_configuration_different_vdev_id( - ) { - let mock_instance_data = - HashMap::from([("instance-1", Some("other-vdev-a")), ("instance-2", None)]); - let request_vdev_ids = vec!["vdev-1", "vdev-2"]; - - let result = - run_get_virtual_device_resources_test(mock_instance_data, request_vdev_ids).await; - assert!(result.is_err()); - } - - async fn run_get_virtual_device_resources_test( - instance_data: HashMap<&str, Option<&str>>, - request_vdev_ids: Vec<&str>, - ) -> Result, Status> { - let _ = env_logger::builder().is_test(true).try_init(); - let 
this_node = "node-a"; - let instance_namespace = "test-namespace"; - let capacity = 5; - let mut device_plugin_context_builder = DevicePluginContextBuilder::new(); - instance_data.keys().for_each(|instance_name| { - device_plugin_context_builder - .add_instance(instance_name, &InstanceConnectivityStatus::Online); - }); - let device_plugin_context = device_plugin_context_builder.build(); - let mock_instances = instance_data - .iter() - .map(|(instance_name, mock_vdev_id)| { - let device_usage = if let Some(id) = mock_vdev_id { - ( - DeviceUsageKind::Configuration(id.to_string()), - this_node.to_string(), - ) - } else { - (DeviceUsageKind::Free, "".to_string()) - }; - let mut kube_instance_builder = - KubeInstanceBuilder::new(instance_name, instance_namespace); - kube_instance_builder.add_node(this_node); - kube_instance_builder.add_device_usage(instance_name, device_usage); - - (instance_name.to_string(), kube_instance_builder.build()) - }) - .collect::>(); - let mut mock = MockKubeInterface::new(); - setup_find_instance_with_mock_instances(&mut mock, instance_namespace, mock_instances); - - get_virtual_device_resources( - request_vdev_ids.iter().map(|x| x.to_string()).collect(), - &device_plugin_context.instances, - this_node, - instance_namespace, - &capacity, - Arc::new(mock), - ) - .await - } - - // Configuration resource from instance, no instance available, should receive nothing from the response stream - #[tokio::test] - async fn test_cdps_internal_list_and_watch_no_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, false); - let list_and_watch_message_sender = - device_plugin_service.list_and_watch_message_sender.clone(); - let mock = MockKubeInterface::new(); - - let stream = device_plugin_service - .internal_list_and_watch(Arc::new(mock)) - .await - .unwrap() - .into_inner(); - 
list_and_watch_message_sender - .send(ListAndWatchMessageKind::End) - .unwrap(); - assert!(stream.into_inner().try_recv().is_err()); - } - - // Configuration resource from instance, instance available, should return capacity virtual devices - #[tokio::test] - async fn test_cdps_internal_list_and_watch_with_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, _device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, true); - let list_and_watch_message_sender = - device_plugin_service.list_and_watch_message_sender.clone(); - let mut mock = MockKubeInterface::new(); - configure_find_instance( - &mut mock, - "../test/json/local-instance.json", - device_plugin_service.instance_name.clone(), - device_plugin_service.config_namespace.clone(), - String::new(), - NodeName::OtherNode, - 1, - ); - let stream = device_plugin_service - .internal_list_and_watch(Arc::new(mock)) - .await - .unwrap() - .into_inner(); - list_and_watch_message_sender - .send(ListAndWatchMessageKind::End) - .unwrap(); - - let result = stream.into_inner().recv().await.unwrap(); - let list_and_watch_response = result.unwrap(); - assert_eq!(list_and_watch_response.devices.len(), 1); - } - - fn setup_configuration_internal_allocate_tests( - mock: &mut MockKubeInterface, - request_device_id: &str, - config_namespace: &str, - instance_name: &str, - formerly_device_usage: &NodeUsage, - newly_device_usage: Option<&NodeUsage>, - expected_calls: usize, - ) -> Request { - let formerly_device_usage_string = formerly_device_usage.to_string(); - configure_find_instance( - mock, - "../test/json/local-instance.json", - instance_name.to_string(), - config_namespace.to_string(), - formerly_device_usage_string, - NodeName::ThisNode, - expected_calls, - ); - if let Some(new_device_usage) = newly_device_usage { - let expected_device_usage_string = new_device_usage.to_string(); - mock.expect_update_instance() - 
.times(1) - .withf(move |instance_to_update: &InstanceSpec, _, _| { - let usages = instance_to_update - .device_usage - .iter() - .filter(|(_, usage)| *usage == &expected_device_usage_string) - .collect::>(); - usages.len() == 1 - }) - .returning(move |_, _, _| Ok(())); - } - let devices_i_ds = vec![request_device_id.to_string()]; - let container_requests = vec![v1beta1::ContainerAllocateRequest { devices_i_ds }]; - Request::new(AllocateRequest { container_requests }) - } - - // Test that environment variables set in a Configuration will be set in brokers - #[tokio::test] - async fn test_cdps_internal_allocate_env_vars() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, true); - let node_name = device_plugin_service.node_name.clone(); - let instance_name = device_plugin_service - .device_plugin_context - .read() - .await - .instances - .keys() - .next() - .unwrap() - .to_string(); - let request_vdev_id = "123456"; - let mut mock = MockKubeInterface::new(); - let request = setup_configuration_internal_allocate_tests( - &mut mock, - request_vdev_id, - &device_plugin_service.config_namespace, - &instance_name, - &NodeUsage::default(), - Some( - &NodeUsage::create( - &DeviceUsageKind::Configuration(request_vdev_id.to_string()), - &node_name, - ) - .unwrap(), - ), - 2, - ); - let broker_envs = device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .unwrap() - .into_inner() - .container_responses[0] - .envs - .clone(); - assert_eq!(broker_envs.get("RESOLUTION_WIDTH").unwrap(), "800"); - assert_eq!(broker_envs.get("RESOLUTION_HEIGHT").unwrap(), "600"); - // Check that Device properties are set as env vars by checking for - // property of device created in `create_configuration_device_plugin_service` - assert_eq!( - broker_envs.get("DEVICE_LOCATION_INFO_B494B6").unwrap(), - "endpoint" - ); - 
assert!(device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .try_recv() - .is_err()); - assert_eq!( - device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } - - // Test when device_usage[id] == "" - // internal_allocate should set device_usage[id] = C:vdev_id:nodeName, return - #[tokio::test] - async fn test_cdps_internal_allocate_success() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, true); - let node_name = device_plugin_service.node_name.clone(); - let instance_name = device_plugin_service - .device_plugin_context - .read() - .await - .instances - .keys() - .next() - .unwrap() - .to_string(); - let request_vdev_id = "123456"; - let mut mock = MockKubeInterface::new(); - let request = setup_configuration_internal_allocate_tests( - &mut mock, - request_vdev_id, - &device_plugin_service.config_namespace, - &instance_name, - &NodeUsage::default(), - Some( - &NodeUsage::create( - &DeviceUsageKind::Configuration(request_vdev_id.to_string()), - &node_name, - ) - .unwrap(), - ), - 2, - ); - assert!(device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .is_ok()); - assert!(device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .try_recv() - .is_err()); - assert_eq!( - device_plugin_service_receivers - .instance_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } - - // Test when device_usage[id] == C:vdev_id:nodeName - // Expected behavior: internal_allocate should return success, this can happen when - // a device is allocated for a work load, after the work load finishs and exits - // the device plugin framework will re-allocate the device for other work loads to use - 
#[tokio::test] - async fn test_cdps_internal_reallocate() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, true); - let node_name = device_plugin_service.node_name.clone(); - let instance_name = device_plugin_service - .device_plugin_context - .read() - .await - .instances - .keys() - .next() - .unwrap() - .to_string(); - let request_vdev_id = "123456"; - let mut mock = MockKubeInterface::new(); - let request = setup_configuration_internal_allocate_tests( - &mut mock, - request_vdev_id, - &device_plugin_service.config_namespace, - &instance_name, - &NodeUsage::create( - &DeviceUsageKind::Configuration(request_vdev_id.to_string()), - &node_name, - ) - .unwrap(), - None, - 2, - ); - assert!(device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .is_ok()); - assert!(device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .try_recv() - .is_err()); - } - - // Tests when device_usage[id] == - // Expected behavior: should invoke list_and_watch, and return error - #[tokio::test] - async fn test_cdps_internal_allocate_taken_by_other_node() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, true); - let instance_name = device_plugin_service - .device_plugin_context - .read() - .await - .instances - .keys() - .next() - .unwrap() - .to_string(); - let request_vdev_id = "123456"; - let mut mock = MockKubeInterface::new(); - let request = setup_configuration_internal_allocate_tests( - &mut mock, - request_vdev_id, - &device_plugin_service.config_namespace, - &instance_name, - &NodeUsage::create( - &DeviceUsageKind::Configuration(request_vdev_id.to_string()), - "other", - ) - .unwrap(), - None, - 1, - ); - 
assert!(device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .is_err()); - assert_eq!( - device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } - - // Tests when device_usage[id] == - // Expected behavior: should invoke list_and_watch, and return error - #[tokio::test] - async fn test_cdps_internal_allocate_taken_by_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - let (device_plugin_service, mut device_plugin_service_receivers) = - create_configuration_device_plugin_service(InstanceConnectivityStatus::Online, true); - let node_name = device_plugin_service.node_name.clone(); - let instance_name = device_plugin_service - .device_plugin_context - .read() - .await - .instances - .keys() - .next() - .unwrap() - .to_string(); - let request_vdev_id = "123456"; - let mut mock = MockKubeInterface::new(); - let request = setup_configuration_internal_allocate_tests( - &mut mock, - request_vdev_id, - &device_plugin_service.config_namespace, - &instance_name, - &NodeUsage::create(&DeviceUsageKind::Instance, &node_name).unwrap(), - None, - 1, - ); - assert!(device_plugin_service - .internal_allocate(request, Arc::new(mock)) - .await - .is_err()); - assert_eq!( - device_plugin_service_receivers - .configuration_list_and_watch_message_receiver - .recv() - .await - .unwrap(), - ListAndWatchMessageKind::Continue - ); - } -} diff --git a/agent/src/util/discovery_configuration_controller.rs b/agent/src/util/discovery_configuration_controller.rs new file mode 100644 index 000000000..9f0edf78a --- /dev/null +++ b/agent/src/util/discovery_configuration_controller.rs @@ -0,0 +1,638 @@ +use std::{ + collections::HashMap, + sync::{Arc, Mutex}, + time::Duration, +}; + +use akri_shared::{ + akri::{ + configuration::{Configuration, DiscoveryProperty}, + instance::Instance, + }, + k8s::crud::IntoApi, +}; +use futures::StreamExt; +use 
tokio::sync::mpsc; + +use crate::discovery_handler_manager::{ + discovery_handler_registry::DiscoveryHandlerRegistry, DiscoveryError, +}; + +use kube::{Resource, ResourceExt}; +use kube_runtime::{ + controller::Action, + reflector::{ObjectRef, Store}, + Controller, +}; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum Error { + #[error(transparent)] + DiscoveryError(#[from] DiscoveryError), + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +pub trait DiscoveryConfigurationKubeClient: IntoApi + IntoApi {} + +impl + IntoApi> DiscoveryConfigurationKubeClient for T {} + +pub struct ControllerContext { + pub instances_cache: Store, + pub dh_registry: Arc, + pub client: Arc, + pub agent_instance_name: String, + pub error_backoffs: Mutex>, +} + +pub async fn start_controller( + ctx: Arc, + rec: mpsc::Receiver>, +) { + let api = ctx.client.all().as_inner(); + let controller = Controller::new(api, Default::default()); + + controller + .graceful_shutdown_on(async { + let mut signal = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); + signal.recv().await; + }) + .reconcile_on(tokio_stream::wrappers::ReceiverStream::new(rec)) + .run(reconcile, error_policy, ctx) + .for_each(|_| futures::future::ready(())) + .await; +} + +pub async fn reconcile( + dc: Arc, + ctx: Arc, +) -> Result { + trace!("Reconciling {:?}::{}", dc.namespace(), dc.name_any()); + let namespace = dc.namespace().unwrap(); + let owner_ref = dc.controller_owner_ref(&()).unwrap(); + if dc.metadata.deletion_timestamp.is_some() { + ctx.dh_registry.terminate_request(&dc.name_any()).await; + + ctx.client + .namespaced(&namespace) + .remove_finalizer(dc.as_ref(), &ctx.agent_instance_name) + .await + .map_err(|e| Error::Other(e.into()))?; + + return Ok(Action::await_change()); + } + + if !dc.finalizers().contains(&ctx.agent_instance_name) { + ctx.client + .namespaced(&namespace) + .add_finalizer(dc.as_ref(), &ctx.agent_instance_name) + .await + .map_err(|e| 
Error::Other(e.into()))? + } + + let dh_name = &dc.spec.discovery_handler.name; + let dh_details = &dc.spec.discovery_handler.discovery_details; + let empty_vec = vec![]; + let dh_properties: &Vec = dc + .spec + .discovery_handler + .discovery_properties + .as_ref() + .unwrap_or(&empty_vec); + let dh_extra_device_properties = dc.spec.broker_properties.clone(); + + let discovered_instances: Vec = + match ctx.dh_registry.get_request(&dc.name_any()).await { + Some(req) => req + .get_instances()? + .into_iter() + .map(|mut instance| { + // Add + instance.spec.nodes = vec![ctx.agent_instance_name.to_owned()]; + instance.owner_references_mut().push(owner_ref.clone()); + instance.spec.capacity = dc.spec.capacity; + instance + }) + .collect(), + None => { + ctx.dh_registry + .new_request( + &dc.name_any(), + dh_name, + dh_details, + dh_properties, + dh_extra_device_properties, + &dc.namespace().unwrap_or("default".to_string()), + ) + .await?; + vec![] + } + }; + + for instance in ctx.instances_cache.state() { + if instance.owner_references().contains(&owner_ref) + && !discovered_instances + .iter() + .any(|di| di.name_any() == instance.name_any()) + { + delete_instance( + ctx.client.as_ref(), + instance.as_ref(), + &ctx.agent_instance_name, + ) + .await? 
+ } + } + + for instance in discovered_instances { + ctx.client + .namespaced(&namespace) + .apply(instance, &ctx.agent_instance_name) + .await + .map_err(|e| Error::Other(e.into()))?; + } + + ctx.error_backoffs.lock().unwrap().remove(&dc.name_any()); + Ok(Action::requeue(Duration::from_secs(600))) +} + +pub fn error_policy(dc: Arc, error: &Error, ctx: Arc) -> Action { + let mut error_backoffs = ctx.error_backoffs.lock().unwrap(); + let previous_duration = error_backoffs + .get(&dc.name_any()) + .cloned() + .unwrap_or(Duration::from_millis(500)); + let next_duration = previous_duration * 2; + warn!( + "Error during reconciliation for {:?}::{}, retrying in {}s: {:?}", + dc.namespace(), + dc.name_any(), + next_duration.as_secs_f32(), + error + ); + error_backoffs.insert(dc.name_any(), next_duration); + Action::requeue(next_duration) +} + +async fn delete_instance( + client: &dyn DiscoveryConfigurationKubeClient, + instance: &Instance, + agent_instance_name: &String, +) -> Result<(), Error> { + if instance.spec.nodes.contains(agent_instance_name) { + let api = client.namespaced(&instance.namespace().unwrap()); + if instance.spec.nodes.len() == 1 { + api.delete(&instance.name_any()) + .await + .map_err(|e| Error::Other(e.into()))?; + return Ok(()); + } + let mut new_instance = instance.clone(); + new_instance.spec.nodes = vec![]; + api.apply(new_instance, agent_instance_name) + .await + .map_err(|e| Error::Other(e.into()))?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use akri_shared::{ + akri::{ + configuration::{ConfigurationSpec, DiscoveryHandlerInfo}, + instance::InstanceSpec, + }, + k8s::crud::{Api, MockApi, MockIntoApi}, + }; + use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference; + use kube::core::{ObjectMeta, Status}; + use mockall::predicate::eq; + + use crate::discovery_handler_manager::discovery_handler_registry::{ + MockDiscoveryHandlerRegistry, MockDiscoveryHandlerRequest, + }; + + use super::*; + + #[derive(Default)] + pub struct 
MockDiscoveryConfigurationKubeClient { + instance: MockIntoApi, + config: MockIntoApi, + } + + impl IntoApi for MockDiscoveryConfigurationKubeClient { + fn all(&self) -> Box> { + self.instance.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.instance.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.instance.default_namespaced() + } + } + + impl IntoApi for MockDiscoveryConfigurationKubeClient { + fn all(&self) -> Box> { + self.config.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.config.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.config.default_namespaced() + } + } + + #[test] + fn test_error_policy() { + let _ = env_logger::builder().is_test(true).try_init(); + let config_1 = Arc::new(Configuration { + metadata: ObjectMeta { + name: Some("config-1".to_string()), + ..Default::default() + }, + spec: ConfigurationSpec { + discovery_handler: DiscoveryHandlerInfo { + name: "debugEcho".to_string(), + discovery_details: String::default(), + discovery_properties: None, + }, + capacity: 1, + broker_spec: None, + instance_service_spec: None, + configuration_service_spec: None, + broker_properties: Default::default(), + }, + }); + let config_2 = Arc::new(Configuration { + metadata: ObjectMeta { + name: Some("config-2".to_string()), + ..Default::default() + }, + spec: ConfigurationSpec { + discovery_handler: DiscoveryHandlerInfo { + name: "debugEcho".to_string(), + discovery_details: String::default(), + discovery_properties: None, + }, + capacity: 1, + broker_spec: None, + instance_service_spec: None, + configuration_service_spec: None, + broker_properties: Default::default(), + }, + }); + + let (store, _) = kube_runtime::reflector::store(); + + let ctx = Arc::new(ControllerContext { + instances_cache: store, + dh_registry: Arc::new(MockDiscoveryHandlerRegistry::new()), + client: Arc::new(MockDiscoveryConfigurationKubeClient::default()), + agent_instance_name: 
"node-a".to_string(), + error_backoffs: Default::default(), + }); + + assert_eq!( + error_policy( + config_1.clone(), + &Error::Other(anyhow::anyhow!("Error")), + ctx.clone() + ), + Action::requeue(Duration::from_secs(1)) + ); + assert_eq!( + error_policy( + config_1.clone(), + &Error::Other(anyhow::anyhow!("Error")), + ctx.clone() + ), + Action::requeue(Duration::from_secs(2)) + ); + assert_eq!( + error_policy( + config_1.clone(), + &Error::Other(anyhow::anyhow!("Error")), + ctx.clone() + ), + Action::requeue(Duration::from_secs(4)) + ); + + assert_eq!( + error_policy( + config_2, + &Error::Other(anyhow::anyhow!("Error")), + ctx.clone() + ), + Action::requeue(Duration::from_secs(1)) + ); + + assert_eq!( + error_policy(config_1, &Error::Other(anyhow::anyhow!("Error")), ctx), + Action::requeue(Duration::from_secs(8)) + ); + } + + #[tokio::test] + async fn test_delete_instance_delete() { + let instance = Instance { + metadata: ObjectMeta { + name: Some("instance-1".to_string()), + namespace: Some("namespace-a".to_string()), + ..Default::default() + }, + spec: InstanceSpec { + capacity: 1, + configuration_name: Default::default(), + cdi_name: Default::default(), + broker_properties: Default::default(), + shared: false, + nodes: vec!["node-a".to_string()], + device_usage: Default::default(), + }, + }; + + let mut mock_client = MockDiscoveryConfigurationKubeClient::default(); + let mut mock_api = MockApi::new(); + let local_instance = instance.clone(); + mock_api + .expect_delete() + .with(eq("instance-1")) + .returning(move |_| Ok(itertools::Either::Left(local_instance.clone()))); + mock_client + .instance + .expect_namespaced() + .with(eq("namespace-a")) + .return_once(|_| Box::new(mock_api)); + + assert!( + delete_instance(&mock_client, &instance, &"node-a".to_string()) + .await + .is_ok() + ); + } + + #[tokio::test] + async fn test_delete_instance_remove_node() { + let instance = Instance { + metadata: ObjectMeta { + name: Some("instance-1".to_string()), + 
namespace: Some("namespace-a".to_string()), + ..Default::default() + }, + spec: InstanceSpec { + capacity: 1, + configuration_name: Default::default(), + cdi_name: Default::default(), + broker_properties: Default::default(), + shared: false, + nodes: vec!["node-a".to_string(), "node-b".to_string()], + device_usage: Default::default(), + }, + }; + + let mut mock_client = MockDiscoveryConfigurationKubeClient::default(); + let mut mock_api = MockApi::new(); + let local_instance = instance.clone(); + mock_api + .expect_apply() + .returning(move |_, _| Ok(local_instance.clone())); + mock_client + .instance + .expect_namespaced() + .with(eq("namespace-a")) + .return_once(|_| Box::new(mock_api)); + + assert!( + delete_instance(&mock_client, &instance, &"node-a".to_string()) + .await + .is_ok() + ); + } + + #[tokio::test] + async fn test_delete_instance_other_node() { + let instance = Instance { + metadata: ObjectMeta { + name: Some("instance-1".to_string()), + namespace: Some("namespace-a".to_string()), + ..Default::default() + }, + spec: InstanceSpec { + capacity: 1, + configuration_name: Default::default(), + cdi_name: Default::default(), + broker_properties: Default::default(), + shared: false, + nodes: vec!["node-b".to_string()], + device_usage: Default::default(), + }, + }; + + let mut mock_client = MockDiscoveryConfigurationKubeClient::default(); + let mock_api = MockApi::new(); + mock_client + .instance + .expect_namespaced() + .with(eq("namespace-a")) + .return_once(|_| Box::new(mock_api)); + + assert!( + delete_instance(&mock_client, &instance, &"node-a".to_string()) + .await + .is_ok() + ); + } + + #[tokio::test] + async fn test_reconcile_nothing_to_do() { + let (store, _) = kube_runtime::reflector::store(); + let mut client = MockDiscoveryConfigurationKubeClient::default(); + let api = MockApi::new(); + client + .config + .expect_namespaced() + .return_once(|_| Box::new(api)); + + let mut registry = MockDiscoveryHandlerRegistry::new(); + let mut request = 
MockDiscoveryHandlerRequest::new(); + request.expect_get_instances().returning(|| Ok(vec![])); + registry + .expect_get_request() + .return_once(|_| Some(Arc::new(request))); + + let ctx = Arc::new(ControllerContext { + instances_cache: store, + dh_registry: Arc::new(registry), + client: Arc::new(client), + agent_instance_name: "node-a".to_string(), + error_backoffs: Default::default(), + }); + + let dc = Arc::new(Configuration { + metadata: ObjectMeta { + name: Some("config-1".to_string()), + namespace: Some("namespace-a".to_string()), + uid: Some("00112233-4455-6677-8899-aabbccddeeff".to_string()), + finalizers: Some(vec!["node-a".to_string()]), + ..Default::default() + }, + spec: ConfigurationSpec { + discovery_handler: DiscoveryHandlerInfo { + name: "debugEcho".to_string(), + discovery_details: String::new(), + discovery_properties: None, + }, + capacity: 1, + broker_spec: None, + instance_service_spec: None, + configuration_service_spec: None, + broker_properties: Default::default(), + }, + }); + + assert!(reconcile(dc, ctx).await.is_ok()); + } + + #[tokio::test] + async fn test_reconcile_no_request_existing_instances() { + let (store, mut writer) = kube_runtime::reflector::store(); + writer.apply_watcher_event(&kube_runtime::watcher::Event::Restarted(vec![ + Instance { + metadata: ObjectMeta { + namespace: Some("namespace-a".to_string()), + name: Some("instance-1".to_string()), + owner_references: Some(vec![OwnerReference { + api_version: "akri.sh/v0".to_string(), + block_owner_deletion: None, + controller: Some(true), + kind: "Configuration".to_string(), + name: "config-1".to_string(), + uid: "00112233-4455-6677-8899-aabbccddeeff".to_string(), + }]), + ..Default::default() + }, + spec: InstanceSpec { + configuration_name: "config-1".to_string(), + cdi_name: "akri.sh/config-1=abcdef".to_string(), + capacity: 1, + broker_properties: HashMap::new(), + shared: true, + nodes: vec!["node-a".to_string()], + device_usage: Default::default(), + }, + }, + Instance { + 
metadata: ObjectMeta { + namespace: Some("namespace-a".to_string()), + name: Some("instance-2".to_string()), + owner_references: Some(vec![OwnerReference { + api_version: "akri.sh/v0".to_string(), + block_owner_deletion: None, + controller: Some(true), + kind: "Configuration".to_string(), + name: "config-1".to_string(), + uid: "00112233-4455-6677-8899-aabbccddeeff".to_string(), + }]), + ..Default::default() + }, + spec: InstanceSpec { + configuration_name: "config-1".to_string(), + cdi_name: "akri.sh/config-1=abcdef".to_string(), + capacity: 1, + broker_properties: HashMap::new(), + shared: true, + nodes: vec!["node-b".to_string()], + device_usage: Default::default(), + }, + }, + Instance { + metadata: ObjectMeta { + namespace: Some("namespace-a".to_string()), + name: Some("instance-3".to_string()), + owner_references: Some(vec![OwnerReference { + api_version: "akri.sh/v0".to_string(), + block_owner_deletion: None, + controller: Some(true), + kind: "Configuration".to_string(), + name: "config-2".to_string(), + uid: "11112233-4455-6677-8899-aabbccddeeff".to_string(), + }]), + ..Default::default() + }, + spec: InstanceSpec { + configuration_name: "config-2".to_string(), + cdi_name: "akri.sh/config-2=abcdef".to_string(), + capacity: 1, + broker_properties: HashMap::new(), + shared: true, + nodes: vec!["node-a".to_string()], + device_usage: Default::default(), + }, + }, + ])); + let mut client = MockDiscoveryConfigurationKubeClient::default(); + let mut api = MockApi::new(); + api.expect_add_finalizer().returning(|_, _| Ok(())); + client + .config + .expect_namespaced() + .return_once(|_| Box::new(api)); + + let mut instance_api = MockApi::new(); + instance_api + .expect_delete() + .with(eq("instance-1")) + .returning(|_| Ok(itertools::Either::Right(Status::default()))); + client + .instance + .expect_namespaced() + .return_once(|_| Box::new(instance_api)); + + let mut registry = MockDiscoveryHandlerRegistry::new(); + registry.expect_get_request().return_once(|_| 
None); + //TODO: check arguments here + registry + .expect_new_request() + .returning(|_, _, _, _, _, _| Ok(())); + + let ctx = Arc::new(ControllerContext { + instances_cache: store, + dh_registry: Arc::new(registry), + client: Arc::new(client), + agent_instance_name: "node-a".to_string(), + error_backoffs: Default::default(), + }); + + let dc = Arc::new(Configuration { + metadata: ObjectMeta { + name: Some("config-1".to_string()), + namespace: Some("namespace-a".to_string()), + uid: Some("00112233-4455-6677-8899-aabbccddeeff".to_string()), + ..Default::default() + }, + spec: ConfigurationSpec { + discovery_handler: DiscoveryHandlerInfo { + name: "debugEcho".to_string(), + discovery_details: String::new(), + discovery_properties: None, + }, + capacity: 1, + broker_spec: None, + instance_service_spec: None, + configuration_service_spec: None, + broker_properties: Default::default(), + }, + }); + + assert!(reconcile(dc, ctx).await.is_ok()); + } +} diff --git a/agent/src/util/discovery_operator.rs b/agent/src/util/discovery_operator.rs deleted file mode 100644 index e6fa5d0df..000000000 --- a/agent/src/util/discovery_operator.rs +++ /dev/null @@ -1,2960 +0,0 @@ -#[cfg(any(test, feature = "agent-full"))] -use super::embedded_discovery_handlers::get_discovery_handler; -use super::metrics::INSTANCE_COUNT_METRIC; -use super::{ - config_action::ConfigId, - constants::SHARED_INSTANCE_OFFLINE_GRACE_PERIOD_SECS, - device_plugin_builder::{DevicePluginBuilder, DevicePluginBuilderInterface}, - device_plugin_service, - device_plugin_service::{ - get_device_instance_name, DevicePluginContext, InstanceConnectivityStatus, InstanceInfo, - }, - registration::{DiscoveryDetails, DiscoveryHandlerEndpoint, RegisteredDiscoveryHandlerMap}, - streaming_extension::StreamingExt, -}; -use akri_discovery_utils::discovery::v0::{ - discovery_handler_client::DiscoveryHandlerClient, ByteData, Device, DiscoverRequest, - DiscoverResponse, -}; -use akri_shared::{ - akri::{ - configuration::{ - 
Configuration, DiscoveryProperty, DiscoveryPropertyKeySelector, DiscoveryPropertySource, - }, - retry::MAX_INSTANCE_UPDATE_TRIES, - }, - k8s, -}; -use blake2::{ - digest::{Update, VariableOutput}, - VarBlake2b, -}; -use k8s_openapi::api::core::v1::{ConfigMap, Secret}; -use kube::api::Api; -use log::{error, trace}; -#[cfg(test)] -use mock_instant::Instant; -#[cfg(test)] -use mockall::{automock, predicate::*}; -use std::io::{Error, ErrorKind}; -#[cfg(not(test))] -use std::time::Instant; -use std::{collections::HashMap, convert::TryFrom, sync::Arc}; -use tokio::sync::RwLock; -use tonic::transport::{Endpoint, Uri}; - -/// StreamType provides a wrapper around the two different types of streams returned from embedded -/// or embedded discovery handlers and ones running externally. -pub enum StreamType { - #[cfg(any(test, feature = "agent-full"))] - Embedded(tokio::sync::mpsc::Receiver>), - External(tonic::Streaming), -} - -/// A DiscoveryOperator is created for each Configuration that is applied to the cluster. -/// It handles discovery of the devices specified in a Configuration by calling `Discover` on -/// all `DiscoveryHandlers` registered with name `Configuration.discovery_handler.name.` -/// For each device discovered by the discovery handlers, it creates a device plugin. -/// If a device disappears, it deletes the associated instance after a grace period (for non-local devices). -/// Note: Since this structure is automocked, the compiler does not seem to be able to confirm that all the -/// methods are being used. Therefore, #[allow(dead_code)] has been added to all methods that are not invoked or -/// tested on a DiscoveryOperator. -#[derive(Clone)] -pub struct DiscoveryOperator { - /// Map of registered discovery handlers - discovery_handler_map: RegisteredDiscoveryHandlerMap, - /// The Akri Configuration associated with this `DiscoveryOperator`. - /// The Configuration tells the `DiscoveryOperator` what to look for. 
- config: Configuration, - /// Akri Instances discovered by this `DiscoveryOperator` - device_plugin_context: Arc>, - /// Timestamp of DiscoveryOperator is created when config is created or updated - config_timestamp: Instant, -} - -#[cfg_attr(test, automock)] -impl DiscoveryOperator { - pub fn new( - discovery_handler_map: RegisteredDiscoveryHandlerMap, - config: Configuration, - device_plugin_context: Arc>, - ) -> Self { - DiscoveryOperator { - discovery_handler_map, - config, - device_plugin_context, - config_timestamp: Instant::now(), - } - } - fn get_config_id(&self) -> ConfigId { - ( - self.config.metadata.namespace.clone().unwrap(), - self.config.metadata.name.clone().unwrap(), - ) - } - - /// Returns discovery_handler_map field. Allows the struct to be mocked. - #[allow(dead_code)] - pub fn get_discovery_handler_map(&self) -> RegisteredDiscoveryHandlerMap { - self.discovery_handler_map.clone() - } - /// Returns config field. Allows the struct to be mocked. - #[allow(dead_code)] - pub fn get_config(&self) -> Configuration { - self.config.clone() - } - /// Returns device_plugin_context field. Allows the struct to be mocked. - #[allow(dead_code)] - pub fn get_device_plugin_context(&self) -> Arc> { - self.device_plugin_context.clone() - } - /// Returns config_timestamp field. Allows the struct to be mocked. - #[allow(dead_code)] - pub fn get_config_timestamp(&self) -> Instant { - self.config_timestamp - } - #[allow(dead_code)] - pub async fn stop_all_discovery(&self) { - let mut discovery_handler_map = self.discovery_handler_map.lock().unwrap().clone(); - if let Some(discovery_handler_details_map) = - discovery_handler_map.get_mut(&self.config.spec.discovery_handler.name) - { - for (endpoint, dh_details) in discovery_handler_details_map.clone() { - // Send with the config_id so we stop discovery for this Configuration only. 
- match dh_details.close_discovery_handler_connection.send(Some(self.get_config_id())) { - Ok(_) => trace!("stop_all_discovery - discovery client for {} discovery handler at endpoint {:?} told to stop", self.config.spec.discovery_handler.name, endpoint), - Err(e) => error!("stop_all_discovery - discovery client {} discovery handler at endpoint {:?} could not receive stop message with error {:?}", self.config.spec.discovery_handler.name, endpoint, e) - } - } - } - } - - /// Calls discover on the Discovery Handler at the given endpoint and returns the connection stream. - pub async fn get_stream<'a>( - &'a self, - kube_interface: Arc, - endpoint: &'a DiscoveryHandlerEndpoint, - ) -> Option { - let discovery_properties = match self - .get_discovery_properties( - kube_interface.clone(), - &self.config.spec.discovery_handler.discovery_properties, - ) - .await - { - Ok(data) => data, - Err(e) => { - error!( - "get_stream - fail to retrieve discovery properties for Configuration {:?}, error {:?}", - self.config.metadata.name, e - ); - return None; - } - }; - let discover_request = tonic::Request::new(DiscoverRequest { - discovery_details: self.config.spec.discovery_handler.discovery_details.clone(), - discovery_properties, - }); - trace!("get_stream - endpoint is {:?}", endpoint); - match endpoint { - #[cfg(any(test, feature = "agent-full"))] - DiscoveryHandlerEndpoint::Embedded => { - match get_discovery_handler(&self.config.spec.discovery_handler) { - Ok(discovery_handler) => { - trace!( - "get_stream - using embedded {} discovery handler", - self.config.spec.discovery_handler.name - ); - match discovery_handler.discover(discover_request).await { - Ok(device_update_receiver) => Some(StreamType::Embedded( - // `discover` returns `Result, tonic::Status>` - // Get the `Receiver` from the `DiscoverStream` wrapper - device_update_receiver.into_inner().into_inner(), - )), - Err(e) => { - error!("get_stream - could not connect to DiscoveryHandler at endpoint {:?} with error 
{}", endpoint, e); - None - } - } - } - Err(e) => { - error!("get_stream - no embedded discovery handler found with name {} with error {:?}", self.config.spec.discovery_handler.name, e); - None - } - } - } - DiscoveryHandlerEndpoint::Uds(socket) => { - // Clone socket for closure which has static lifetime - let socket = socket.clone(); - // We will ignore this dummy uri because UDS does not use it. - // Some servers will check the uri content so the uri needs to - // be in valid format even it's not used, the scheme part is used - // to specific what scheme to use, such as http or https - match Endpoint::try_from("http://[::1]:50051") - .unwrap() - .connect_with_connector(tower::service_fn(move |_: Uri| { - let endpoint = socket.clone(); - tokio::net::UnixStream::connect(endpoint) - })) - .await - { - Ok(channel) => { - trace!( - "get_stream - connecting to external {} discovery handler over UDS", - self.config.spec.discovery_handler.name - ); - let mut discovery_handler_client = DiscoveryHandlerClient::new(channel); - match discovery_handler_client.discover(discover_request).await { - Ok(device_update_receiver) => { - Some(StreamType::External(device_update_receiver.into_inner())) - } - Err(e) => { - error!("get_stream - could not connect to DiscoveryHandler at endpoint {:?} with error {}", endpoint, e); - None - } - } - } - Err(e) => { - error!("get_stream - failed to connect to {} discovery handler over UDS with error {}", self.config.spec.discovery_handler.name, e); - None - } - } - } - DiscoveryHandlerEndpoint::Network(addr) => { - match DiscoveryHandlerClient::connect(addr.clone()).await { - Ok(mut discovery_handler_client) => { - trace!( - "get_stream - connecting to external {} discovery handler over network", - self.config.spec.discovery_handler.name - ); - match discovery_handler_client.discover(discover_request).await { - Ok(device_update_receiver) => { - Some(StreamType::External(device_update_receiver.into_inner())) - } - Err(e) => { - 
error!("get_stream - could not connect to DiscoveryHandler at endpoint {:?} with error {}", endpoint, e); - None - } - } - } - Err(e) => { - error!("get_stream - failed to connect to {} discovery handler over network with error {}", self.config.spec.discovery_handler.name, e); - None - } - } - } - } - } - /// Listens for new discovery responses and calls a function to handle the new discovery results. - /// Runs until notified to stop discovery. - #[allow(dead_code)] - pub async fn internal_do_discover<'a>( - &'a self, - kube_interface: Arc, - dh_details: &'a DiscoveryDetails, - stream: &'a mut dyn StreamingExt, - node_name: String, - ) -> anyhow::Result<()> { - // clone objects for thread - let discovery_operator = Arc::new(self.clone()); - let stop_discovery_receiver: &mut tokio::sync::broadcast::Receiver> = - &mut dh_details.close_discovery_handler_connection.subscribe(); - loop { - // Wait for either new discovery results or a message to stop discovery - tokio::select! { - result = stop_discovery_receiver.recv() => { - // Stop is triggered if the current config_id (to only stop this task) or None (to stop all tasks of this handler) is sent. - if let Ok(Some(config_id)) = result { - if config_id != self.get_config_id() { - trace!("internal_do_discover - received message to stop discovery for another configuration, ignoring it."); - continue; - } - } - trace!( - "internal_do_discover({}::{}) - received message to stop discovery for endpoint {:?} serving protocol {}", - self.config.metadata.namespace.as_ref().unwrap(), - self.config.metadata.name.as_ref().unwrap(), - dh_details.endpoint, - discovery_operator.get_config().spec.discovery_handler.name, - ); - break; - }, - result = stream.get_message() => { - let response = result?.ok_or_else(|| anyhow::anyhow!("Received response type None. 
Should not happen."))?; - trace!("internal_do_discover - got discovery results {:?}", response.devices); - self.handle_discovery_results( - kube_interface.clone(), - response.devices, - dh_details.shared, - Box::new(DevicePluginBuilder{}), - node_name.clone(), - ) - .await?; - } - } - } - - Ok(()) - } - - /// Checks if any of this DiscoveryOperator's Configuration's Instances have been offline for too long. - /// If a non-local device has not come back online before `SHARED_INSTANCE_OFFLINE_GRACE_PERIOD_SECS`, - /// the associated Device Plugin and Instance are terminated and deleted, respectively. - pub async fn delete_offline_instances( - &self, - kube_interface: Arc, - node_name: String, - ) -> Result<(), Box> { - trace!( - "delete_offline_instances - entered for configuration {:?}", - self.config.metadata.name - ); - let kube_interface_clone = kube_interface.clone(); - let instance_map = self.device_plugin_context.write().await.clone().instances; - for (instance, instance_info) in instance_map { - if let InstanceConnectivityStatus::Offline(instant) = instance_info.connectivity_status - { - let time_offline = instant.elapsed().as_secs(); - // If instance has been offline for longer than the grace period or it is unshared, terminate the associated device plugin - // TODO: make grace period configurable - if time_offline >= SHARED_INSTANCE_OFFLINE_GRACE_PERIOD_SECS { - trace!("delete_offline_instances - instance {} has been offline too long ... terminating device plugin", instance); - device_plugin_service::terminate_device_plugin_service( - &instance, - self.device_plugin_context.clone(), - ) - .await - .unwrap(); - try_delete_instance( - kube_interface_clone.as_ref(), - &instance, - self.config.metadata.namespace.as_ref().unwrap(), - node_name.clone(), - ) - .await?; - } - } - } - Ok(()) - } - - /// Takes in a list of discovered devices and determines if there are any new devices or no longer visible devices. 
- /// For each new device, it creates a DevicePluginService. - /// For each previously visible device that was no longer discovered, it calls a function that updates the InstanceConnectivityStatus - /// of the instance or deletes it if it is a local device. - pub async fn handle_discovery_results( - &self, - kube_interface: Arc, - discovery_results: Vec, - shared: bool, - device_plugin_builder: Box, - node_name: String, - ) -> anyhow::Result<()> { - let config_name = self.config.metadata.name.clone().unwrap(); - trace!( - "handle_discovery_results - for config {} with discovery results {:?}", - config_name, - discovery_results - ); - let currently_visible_instances: HashMap = discovery_results - .iter() - .map(|discovery_result| { - let id = generate_instance_digest(&discovery_result.id, shared, &node_name); - let instance_name = get_device_instance_name(&id, &config_name); - (instance_name, discovery_result.clone()) - }) - .collect(); - INSTANCE_COUNT_METRIC - .with_label_values(&[&config_name, &shared.to_string()]) - .set(currently_visible_instances.len() as i64); - // Update the connectivity status of instances and return list of visible instances that don't have Instance CRs - let device_plugin_context = self.device_plugin_context.read().await.clone(); - // Find all visible instances that do not have Instance CRDs yet - let new_discovery_results: Vec = currently_visible_instances - .iter() - .filter(|(name, _)| !device_plugin_context.instances.contains_key(*name)) - .map(|(_, p)| p.clone()) - .collect(); - self.update_instance_connectivity_status( - kube_interface, - currently_visible_instances, - shared, - node_name.clone(), - ) - .await?; - - // If there are newly visible instances associated with a Config, make a device plugin and Instance CR for them - if !new_discovery_results.is_empty() { - for discovery_result in new_discovery_results { - let id = generate_instance_digest(&discovery_result.id, shared, &node_name); - let instance_name = 
get_device_instance_name(&id, &config_name); - trace!( - "handle_discovery_results - new instance {} came online", - instance_name - ); - let device_plugin_context = self.device_plugin_context.clone(); - if let Err(e) = device_plugin_builder - .build_device_plugin( - id, - &self.config, - shared, - device_plugin_context, - discovery_result.clone(), - node_name.clone(), - ) - .await - { - error!("handle_discovery_results - error {} building device plugin ... trying again on next iteration", e); - } - } - } - Ok(()) - } - - /// Takes in a list of currently visible instances and either updates an Instance's InstanceConnectivityStatus or deletes an Instance. - /// If a non-local/network based device is not longer visible it's InstanceConnectivityStatus is changed to Offline(time now). - /// The associated DevicePluginService checks its InstanceConnectivityStatus before sending a response back to kubelet - /// and will send all unhealthy devices if its status is Offline, preventing kubelet from allocating any more pods to it. - /// An Instance CRD is deleted and it's DevicePluginService shutdown if its: - /// (A) non-local Instance is still not visible after 5 minutes or (B) local instance is still not visible. 
- pub async fn update_instance_connectivity_status( - &self, - kube_interface: Arc, - currently_visible_instances: HashMap, - shared: bool, - node_name: String, - ) -> anyhow::Result<()> { - let instance_map = self.device_plugin_context.read().await.clone().instances; - for (instance, instance_info) in instance_map { - trace!( - "update_instance_connectivity_status - checking connectivity status of instance {}", - instance - ); - if currently_visible_instances.contains_key(&instance) { - let connectivity_status = instance_info.connectivity_status; - // If instance is visible, make sure connectivity status is (updated to be) Online - if let InstanceConnectivityStatus::Offline(_instant) = connectivity_status { - trace!( - "update_instance_connectivity_status - instance {} that was temporarily offline is back online", - instance - ); - let list_and_watch_message_sender = instance_info.list_and_watch_message_sender; - let device = currently_visible_instances.get(&instance).unwrap(); - let updated_instance_info = InstanceInfo { - connectivity_status: InstanceConnectivityStatus::Online, - list_and_watch_message_sender: list_and_watch_message_sender.clone(), - instance_id: instance_info.instance_id.clone(), - device: device.clone(), - }; - self.device_plugin_context - .write() - .await - .instances - .insert(instance.clone(), updated_instance_info); - // Signal list_and_watch to update kubelet that the devices are healthy. 
- list_and_watch_message_sender - .send(device_plugin_service::ListAndWatchMessageKind::Continue) - .unwrap(); - } else { - trace!( - "update_instance_connectivity_status - instance {} still online", - instance - ); - } - } else { - // If the instance is not visible: - // // If the instance is local, remove it - // // If the instance is not local - // // // If it has not already been labeled offline, label it - // // // If the instance has already been labeled offline - // // // remove instance from map if grace period has elapsed without the instance coming back online - let mut remove_instance = false; - match instance_info.connectivity_status { - InstanceConnectivityStatus::Online => { - if !shared { - remove_instance = true; - } else { - let sender = instance_info.list_and_watch_message_sender.clone(); - let updated_instance_info = InstanceInfo { - connectivity_status: InstanceConnectivityStatus::Offline( - Instant::now(), - ), - list_and_watch_message_sender: instance_info - .list_and_watch_message_sender - .clone(), - instance_id: instance_info.instance_id.clone(), - device: instance_info.device.clone(), - }; - self.device_plugin_context - .write() - .await - .instances - .insert(instance.clone(), updated_instance_info); - trace!( - "update_instance_connectivity_status - instance {} went offline ... starting timer and forcing list_and_watch to continue", - instance - ); - sender - .send(device_plugin_service::ListAndWatchMessageKind::Continue) - .unwrap(); - } - } - InstanceConnectivityStatus::Offline(instant) => { - let time_offline = instant.elapsed().as_secs(); - // If instance has been offline for longer than the grace period, terminate the associated device plugin - if time_offline >= SHARED_INSTANCE_OFFLINE_GRACE_PERIOD_SECS { - remove_instance = true; - } - } - } - if remove_instance { - trace!("update_instance_connectivity_status - instance {} has been offline too long ... 
terminating device plugin", instance); - device_plugin_service::terminate_device_plugin_service( - &instance, - self.device_plugin_context.clone(), - ) - .await - .unwrap(); - try_delete_instance( - kube_interface.as_ref(), - &instance, - self.config.metadata.namespace.as_ref().unwrap(), - node_name.clone(), - ) - .await - .unwrap(); - } - } - } - Ok(()) - } - - async fn get_discovery_properties( - &self, - kube_interface: Arc, - properties: &Option>, - ) -> anyhow::Result> { - match properties { - None => Ok(HashMap::new()), - Some(properties) => { - let mut tmp_properties = HashMap::new(); - for p in properties { - match self.get_discovery_property(kube_interface.clone(), p).await { - Ok(tmp_p) => { - if let Some((k, v)) = tmp_p { - tmp_properties.insert(k, v.clone()); - } - } - Err(e) => return Err(e), - } - } - Ok(tmp_properties) - } - } - } - - async fn get_discovery_property( - &self, - kube_interface: Arc, - property: &DiscoveryProperty, - ) -> anyhow::Result> { - let value; - if let Some(v) = &property.value { - value = ByteData { - vec: Some(v.as_bytes().to_vec()), - }; - } else if let Some(value_from) = &property.value_from { - let kube_client = ActualKubeClient::new(kube_interface.clone()); - value = match self - .get_discovery_property_value_from(&kube_client, value_from) - .await - { - Ok(byte_data) => { - if byte_data.is_none() { - // optional value, not found - return Ok(None); - } - byte_data.unwrap() - } - Err(e) => return Err(e), - }; - } else { - // property without value - value = ByteData { vec: None } - } - - Ok(Some((property.name.clone(), value))) - } - - async fn get_discovery_property_value_from( - &self, - kube_client: &dyn KubeClient, - property: &DiscoveryPropertySource, - ) -> anyhow::Result> { - match property { - DiscoveryPropertySource::ConfigMapKeyRef(config_map_key_selector) => { - get_discovery_property_value_from_config_map(kube_client, config_map_key_selector) - .await - } - 
DiscoveryPropertySource::SecretKeyRef(secret_key_selector) => { - get_discovery_property_value_from_secret(kube_client, secret_key_selector).await - } - } - } -} - -async fn try_delete_instance( - kube_interface: &dyn k8s::KubeInterface, - instance_name: &str, - instance_namespace: &str, - node_name: String, -) -> Result<(), anyhow::Error> { - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - // First check if instance still exists - if let Ok(mut instance) = kube_interface - .find_instance(instance_name, instance_namespace) - .await - { - if instance.spec.nodes.contains(&node_name) { - instance.spec.nodes.retain(|node| node != &node_name); - } - if instance.spec.nodes.is_empty() { - match k8s::try_delete_instance(kube_interface, instance_name, instance_namespace) - .await - { - Ok(()) => { - trace!("try_delete_instance - deleted Instance {}", instance_name); - break; - } - Err(e) => { - trace!("try_delete_instance - call to delete_instance returned with error {} on try # {} of {}", e, x, MAX_INSTANCE_UPDATE_TRIES); - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - return Err(e); - } - } - } - } else { - match kube_interface - .update_instance( - &instance.spec, - &instance.metadata.name.unwrap(), - instance_namespace, - ) - .await - { - Ok(()) => { - trace!( - "try_delete_instance - updated Instance {} to remove {}", - instance_name, - node_name - ); - break; - } - Err(e) => { - trace!("try_delete_instance - call to update_instance returned with error {} on try # {} of {}", e, x, MAX_INSTANCE_UPDATE_TRIES); - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - return Err(e); - } - } - }; - } - } - } - Ok(()) -} - -/// This provides a mockable way to query configMap and secret -#[cfg_attr(test, automock)] -#[tonic::async_trait] -pub trait KubeClient: Send + Sync { - async fn get_secret(&self, name: &str, namespace: &str) -> anyhow::Result>; - - async fn get_config_map( - &self, - name: &str, - namespace: &str, - ) -> anyhow::Result>; -} - -struct ActualKubeClient { - pub 
kube_interface: Arc, -} - -impl ActualKubeClient { - pub fn new(kube_interface: Arc) -> Self { - ActualKubeClient { kube_interface } - } -} - -#[tonic::async_trait] -impl KubeClient for ActualKubeClient { - async fn get_secret(&self, name: &str, namespace: &str) -> anyhow::Result> { - let resource_client: Api = - Api::namespaced(self.kube_interface.get_kube_client(), namespace); - let resource = resource_client.get_opt(name).await?; - Ok(resource) - } - - async fn get_config_map( - &self, - name: &str, - namespace: &str, - ) -> anyhow::Result> { - let resource_client: Api = - Api::namespaced(self.kube_interface.get_kube_client(), namespace); - let resource = resource_client.get_opt(name).await?; - Ok(resource) - } -} - -async fn get_discovery_property_value_from_secret( - kube_client: &dyn KubeClient, - secret_key_selector: &DiscoveryPropertyKeySelector, -) -> anyhow::Result> { - let optional = secret_key_selector.optional.unwrap_or_default(); - let secret_name = &secret_key_selector.name; - let secret_namespace = &secret_key_selector.namespace; - let secret_key = &secret_key_selector.key; - - let secret = kube_client - .get_secret(secret_name, secret_namespace) - .await?; - if secret.is_none() { - if optional { - return Ok(None); - } else { - return Err(Error::new( - ErrorKind::InvalidInput, - "discoveryProperties' referenced Secret not found", - ) - .into()); - } - } - let secret = secret.unwrap(); - // All key-value pairs in the stringData field are internally merged into the data field - // we don't need to check string_data. 
- if let Some(data) = secret.data { - if let Some(v) = data.get(secret_key) { - return Ok(Some(ByteData { - vec: Some(v.0.clone()), - })); - } - } - - // secret key/value not found - if optional { - Ok(None) - } else { - Err(Error::new( - ErrorKind::InvalidInput, - "discoveryProperties' referenced Secret data not found", - ) - .into()) - } -} - -async fn get_discovery_property_value_from_config_map( - kube_client: &dyn KubeClient, - config_map_key_selector: &DiscoveryPropertyKeySelector, -) -> anyhow::Result> { - let optional = config_map_key_selector.optional.unwrap_or_default(); - let config_map_name = &config_map_key_selector.name; - let config_map_namespace = &config_map_key_selector.namespace; - let config_map_key = &config_map_key_selector.key; - - let config_map = kube_client - .get_config_map(config_map_name, config_map_namespace) - .await?; - if config_map.is_none() { - if optional { - return Ok(None); - } else { - return Err(Error::new( - ErrorKind::InvalidInput, - "discoveryProperties' referenced ConfigMap not found", - ) - .into()); - } - } - let config_map = config_map.unwrap(); - if let Some(data) = config_map.data { - if let Some(v) = data.get(config_map_key) { - return Ok(Some(ByteData { - vec: Some(v.as_bytes().to_vec()), - })); - } - } - if let Some(binary_data) = config_map.binary_data { - if let Some(v) = binary_data.get(config_map_key) { - return Ok(Some(ByteData { - vec: Some(v.0.clone()), - })); - } - } - - // config_map key/value not found - if optional { - Ok(None) - } else { - Err(Error::new( - ErrorKind::InvalidInput, - "discoveryProperties' referenced ConfigMap data not found", - ) - .into()) - } -} - -pub mod start_discovery { - use super::super::metrics::{DISCOVERY_RESPONSE_RESULT_METRIC, DISCOVERY_RESPONSE_TIME_METRIC}; - use super::super::registration::{DiscoveryDetails, DiscoveryHandlerEndpoint}; - // Use this `mockall` macro to automate importing a mock type in test mode, or a real type otherwise. 
- use super::super::device_plugin_builder::{DevicePluginBuilder, DevicePluginBuilderInterface}; - use super::device_plugin_service::get_device_configuration_name; - #[double] - pub use super::DiscoveryOperator; - use super::StreamType; - use akri_shared::k8s; - use mockall_double::double; - use std::{sync::Arc, time::Duration}; - use tokio::sync::{broadcast, mpsc}; - - /// This is spawned as a task for each Configuration and continues to run - /// until the Configuration is deleted, at which point, this function is signaled to stop. - /// It consists of three subtasks: - /// 1) Initiates discovery on all already registered discovery handlers in the RegisteredDiscoveryHandlerMap - /// with the same discovery handler name as the Configuration (Configuration.discovery_handler.name). - /// 2) Listens for new discover handlers to come online for this Configuration and initiates discovery. - /// 3) Checks whether Offline Instances have exceeded their grace period, in which case it - /// deletes the Instance. 
- pub async fn start_discovery( - discovery_operator: DiscoveryOperator, - new_discovery_handler_sender: broadcast::Sender, - stop_all_discovery_sender: broadcast::Sender<()>, - finished_all_discovery_sender: &mut mpsc::Sender<()>, - kube_interface: Arc, - node_name: String, - ) -> Result<(), Box> { - internal_start_discovery( - discovery_operator, - new_discovery_handler_sender, - stop_all_discovery_sender, - finished_all_discovery_sender, - kube_interface, - Box::new(DevicePluginBuilder {}), - node_name, - ) - .await - } - - pub async fn internal_start_discovery( - discovery_operator: DiscoveryOperator, - new_discovery_handler_sender: broadcast::Sender, - stop_all_discovery_sender: broadcast::Sender<()>, - finished_all_discovery_sender: &mut mpsc::Sender<()>, - kube_interface: Arc, - device_plugin_builder: Box, - node_name: String, - ) -> Result<(), Box> { - let config = discovery_operator.get_config(); - info!( - "internal_start_discovery - entered for {} discovery handler", - config.spec.discovery_handler.name - ); - let config_name = config.metadata.name.clone().unwrap(); - let mut tasks = Vec::new(); - let device_plugin_context = discovery_operator.get_device_plugin_context(); - let discovery_operator = Arc::new(discovery_operator); - - // Create a device plugin for the Configuration - let config_dp_name = get_device_configuration_name(&config_name); - trace!( - "internal_start_discovery - create configuration device plugin {}", - config_dp_name - ); - match device_plugin_builder - .build_configuration_device_plugin( - config_dp_name, - &config, - device_plugin_context.clone(), - node_name.clone(), - ) - .await - { - Ok(s) => { - device_plugin_context - .write() - .await - .usage_update_message_sender = Some(s); - } - Err(e) => { - error!( - "internal_start_discovery - error {} building configuration device plugin", - e - ); - } - }; - - // Call discover on already registered Discovery Handlers requested by this Configuration's - let 
known_dh_discovery_operator = discovery_operator.clone(); - let known_dh_kube_interface = kube_interface.clone(); - let known_node_name = node_name.clone(); - tasks.push(tokio::spawn(async move { - do_discover( - known_dh_discovery_operator, - known_dh_kube_interface, - known_node_name, - ) - .await - .unwrap(); - })); - - // Listen for new discovery handlers to call discover on - let mut stop_all_discovery_receiver = stop_all_discovery_sender.subscribe(); - let mut new_discovery_handler_receiver = new_discovery_handler_sender.subscribe(); - let new_dh_discovery_operator = discovery_operator.clone(); - let new_node_name = node_name.clone(); - tasks.push(tokio::spawn(async move { - listen_for_new_discovery_handlers( - new_dh_discovery_operator, - &mut new_discovery_handler_receiver, - &mut stop_all_discovery_receiver, - new_node_name, - ) - .await - .unwrap(); - })); - - // Non-local devices are only allowed to be offline for `SHARED_INSTANCE_OFFLINE_GRACE_PERIOD_SECS` minutes before being removed. - // This task periodically checks if devices have been offline for too long. - let mut stop_all_discovery_receiver = stop_all_discovery_sender.subscribe(); - let offline_dh_discovery_operator = discovery_operator.clone(); - let offline_dh_kube_interface = kube_interface.clone(); - let offline_node_name = node_name.clone(); - tasks.push(tokio::spawn(async move { - loop { - offline_dh_discovery_operator - .delete_offline_instances(offline_dh_kube_interface.clone(), offline_node_name.clone()) - .await - .unwrap(); - if tokio::time::timeout( - Duration::from_secs(30), - stop_all_discovery_receiver.recv(), - ) - .await.is_ok() - { - trace!("internal_start_discovery - received message to stop checking connectivity status for configuration {}", config_name); - break; - } - } - })); - futures::future::try_join_all(tasks).await?; - finished_all_discovery_sender.send(()).await?; - Ok(()) - } - - /// Waits to be notified of new discovery handlers. 
If the discovery handler does discovery for this Configuration, - /// discovery is kicked off. - async fn listen_for_new_discovery_handlers( - discovery_operator: Arc, - new_discovery_handler_receiver: &mut broadcast::Receiver, - stop_all_discovery_receiver: &mut broadcast::Receiver<()>, - node_name: String, - ) -> Result<(), Box> { - let mut discovery_tasks = Vec::new(); - loop { - tokio::select! { - _ = stop_all_discovery_receiver.recv() => { - trace!("listen_for_new_discovery_handlers - received message to stop discovery for configuration {:?}", discovery_operator.get_config().metadata.name); - discovery_operator.stop_all_discovery().await; - break; - }, - result = new_discovery_handler_receiver.recv() => { - // Check if it is one of this Configuration's discovery handlers - if let Ok(discovery_handler_name) = result { - if discovery_handler_name == discovery_operator.get_config().spec.discovery_handler.name { - trace!("listen_for_new_discovery_handlers - received new registered discovery handler for configuration {:?}", discovery_operator.get_config().metadata.name); - let new_discovery_operator = discovery_operator.clone(); - let node_name = node_name.clone(); - discovery_tasks.push(tokio::spawn(async move { - do_discover(new_discovery_operator, Arc::new(k8s::KubeImpl::new().await.unwrap()), node_name.clone()).await.unwrap(); - })); - } - } - } - } - } - // Wait for all discovery handlers to complete discovery - futures::future::try_join_all(discovery_tasks).await?; - Ok(()) - } - - /// A Configuration specifies the name of `DiscoveryHandlers` that should be utilized for discovery. - /// This tries to establish connection with each `DiscoveryHandler` registered under the requested - /// `DiscoveryHandler` name and spawns a discovery thread for each connection. - /// If a connection cannot be established, continues to try, sleeping between iteration. 
- pub async fn do_discover( - discovery_operator: Arc, - kube_interface: Arc, - node_name: String, - ) -> Result<(), Box> { - let mut discovery_tasks = Vec::new(); - let config = discovery_operator.get_config(); - trace!( - "do_discover - entered for {} discovery handler", - config.spec.discovery_handler.name - ); - // get clone of map - let mut discovery_handler_map = discovery_operator - .get_discovery_handler_map() - .lock() - .unwrap() - .clone(); - trace!( - "do_discover - discovery_handler_map is {:?}", - discovery_handler_map - ); - if let Some(discovery_handler_details_map) = - discovery_handler_map.get_mut(&config.spec.discovery_handler.name) - { - for (endpoint, dh_details) in discovery_handler_details_map.clone() { - trace!( - "do_discover - for {} discovery handler at endpoint {:?}", - config.spec.discovery_handler.name, - endpoint - ); - let discovery_operator = discovery_operator.clone(); - let kube_interface = kube_interface.clone(); - let node_name = node_name.clone(); - discovery_tasks.push(tokio::spawn(async move { - do_discover_on_discovery_handler( - discovery_operator.clone(), - kube_interface.clone(), - &endpoint, - &dh_details, - node_name.clone(), - ) - .await - .unwrap(); - })); - } - } - futures::future::try_join_all(discovery_tasks).await?; - Ok(()) - } - - /// Try to connect to discovery handler until connection has been established or grace period has passed - async fn do_discover_on_discovery_handler<'a>( - discovery_operator: Arc, - kube_interface: Arc, - endpoint: &'a DiscoveryHandlerEndpoint, - dh_details: &'a DiscoveryDetails, - node_name: String, - ) -> anyhow::Result<()> { - // get discovery handler name for metric use - let dh_name = discovery_operator.get_config().spec.discovery_handler.name; - let (_config_namespace, config_name) = discovery_operator.get_config_id(); - let mut first_call = true; - loop { - let stream_type = discovery_operator - .get_stream(kube_interface.clone(), endpoint) - .await; - let request_result = 
stream_type.as_ref().map(|_| "Success").unwrap_or("Fail"); - DISCOVERY_RESPONSE_RESULT_METRIC - .with_label_values(&[&dh_name, request_result]) - .inc(); - if first_call { - first_call = false; - let start_time = discovery_operator.get_config_timestamp(); - DISCOVERY_RESPONSE_TIME_METRIC - .with_label_values(&[&config_name]) - .observe(start_time.elapsed().as_secs_f64()); - } - if let Some(stream_type) = stream_type { - match stream_type { - StreamType::External(mut stream) => { - match discovery_operator - .internal_do_discover( - kube_interface.clone(), - dh_details, - &mut stream, - node_name.clone(), - ) - .await - { - Ok(_) => { - break; - } - Err(e) => { - if let Some(status) = e.downcast_ref::() { - if status.message().contains("broken pipe") { - // Mark all associated instances as offline - error!("do_discover_on_discovery_handler - connection with Discovery Handler dropped with status {:?}. Marking all instances offline.", status); - discovery_operator - .update_instance_connectivity_status( - kube_interface.clone(), - std::collections::HashMap::new(), - dh_details.shared, - node_name.clone(), - ) - .await?; - } else { - trace!("do_discover_on_discovery_handler - Discovery Handlers returned error status {}. Marking all instances offline.", status); - // TODO: Possibly mark config as invalid - discovery_operator - .update_instance_connectivity_status( - kube_interface.clone(), - std::collections::HashMap::new(), - dh_details.shared, - node_name.clone(), - ) - .await?; - } - } else { - return Err(e); - } - } - } - } - #[cfg(any(test, feature = "agent-full"))] - StreamType::Embedded(mut stream) => { - discovery_operator - .internal_do_discover( - kube_interface.clone(), - dh_details, - &mut stream, - node_name.clone(), - ) - .await?; - // Embedded discovery should only return okay if signaled to stop. Otherwise, bubble up error. - break; - } - } - } - - // If a connection cannot be established with the Discovery Handler, it will sleep and try again. 
- // This continues until connection established or the Discovery Handler is told to stop discovery. - let mut stop_discovery_receiver = - dh_details.close_discovery_handler_connection.subscribe(); - let mut sleep_duration = Duration::from_secs(60); - if cfg!(test) { - sleep_duration = Duration::from_millis(100); - } - - if let Ok(result) = - tokio::time::timeout(sleep_duration, stop_discovery_receiver.recv()).await - { - // Stop is triggered if the current config_id (to only stop this task) or None (to stop all tasks of this handler) is sent. - if let Ok(Some(config_id)) = result { - if config_id != discovery_operator.get_config_id() { - trace!("do_discover_on_discovery_handler - received message to stop discovery for another configuration, ignoring it."); - continue; - } - } - let (config_namespace, config_name) = discovery_operator.get_config_id(); - trace!( - "do_discover_on_discovery_handler({}::{}) - received message to stop discovery for {} Discovery Handler at endpoint {:?}", - config_namespace, config_name, - dh_details.name, dh_details.endpoint, - ); - break; - } - } - Ok(()) - } -} - -/// Generates an digest of an Instance's id. There should be a unique digest and Instance for each discovered device. -/// This means that the id of non-local devices that could be visible to multiple nodes should always resolve -/// to the same instance name (which is suffixed with this digest). -/// However, local devices' Instances should have unique hashes even if they have the same id. -/// To ensure this, the node's name is added to the id before it is hashed. 
-pub fn generate_instance_digest(id_to_digest: &str, shared: bool, node_name: &str) -> String { - let mut id_to_digest = id_to_digest.to_string(); - // For local devices, include node hostname in id_to_digest so instances have unique names - if !shared { - id_to_digest = format!("{}{}", &id_to_digest, node_name,); - } - let mut digest = String::new(); - let mut hasher = VarBlake2b::new(3).unwrap(); - hasher.update(id_to_digest); - hasher.finalize_variable(|var| { - digest = var - .iter() - .map(|num| format!("{:02x}", num)) - .collect::>() - .join("") - }); - digest -} - -#[cfg(test)] -pub mod tests { - use super::super::{ - device_plugin_builder::MockDevicePluginBuilderInterface, - registration::{inner_register_embedded_discovery_handlers, DiscoveryDetails}, - }; - use super::device_plugin_service::DevicePluginContext; - use super::*; - use akri_discovery_utils::discovery::mock_discovery_handler; - use akri_shared::{ - akri::configuration::Configuration, k8s::MockKubeInterface, os::env_var::MockEnvVarQuery, - }; - use k8s_openapi::ByteString; - use mock_instant::{Instant, MockClock}; - use mockall::Sequence; - use std::collections::BTreeMap; - use std::time::Duration; - use tokio::sync::{broadcast, mpsc}; - - pub async fn build_device_plugin_context( - config: &Configuration, - visible_discovery_results: &mut Vec, - list_and_watch_message_receivers: &mut Vec< - broadcast::Receiver, - >, - connectivity_status: InstanceConnectivityStatus, - ) -> Arc> { - let device1 = Device { - id: "filter1".to_string(), - properties: HashMap::new(), - mounts: Vec::default(), - device_specs: Vec::default(), - }; - let device2 = Device { - id: "filter2".to_string(), - properties: HashMap::new(), - mounts: Vec::default(), - device_specs: Vec::default(), - }; - let discovery_results = vec![device1, device2]; - *visible_discovery_results = discovery_results.clone(); - generate_instance_map( - discovery_results, - list_and_watch_message_receivers, - connectivity_status, - 
config.metadata.name.as_ref().unwrap(), - ) - } - - fn generate_instance_map( - discovery_results: Vec, - list_and_watch_message_receivers: &mut Vec< - broadcast::Receiver, - >, - connectivity_status: InstanceConnectivityStatus, - config_name: &str, - ) -> Arc> { - Arc::new(RwLock::new(DevicePluginContext { - usage_update_message_sender: None, - instances: discovery_results - .iter() - .map(|device| { - let (list_and_watch_message_sender, list_and_watch_message_receiver) = - broadcast::channel(2); - list_and_watch_message_receivers.push(list_and_watch_message_receiver); - let instance_name = get_device_instance_name(&device.id, config_name); - ( - instance_name, - InstanceInfo { - list_and_watch_message_sender, - connectivity_status: connectivity_status.clone(), - instance_id: device.id.clone(), - device: device.clone(), - }, - ) - }) - .collect(), - })) - } - - fn create_mock_discovery_operator( - discovery_handler_map: RegisteredDiscoveryHandlerMap, - config: Configuration, - device_plugin_context: Arc>, - ) -> MockDiscoveryOperator { - let ctx = MockDiscoveryOperator::new_context(); - let discovery_handler_map_clone = discovery_handler_map.clone(); - let config_clone = config.clone(); - let config_id = ( - config.metadata.namespace.clone().unwrap(), - config.metadata.namespace.clone().unwrap(), - ); - let device_plugin_context_clone = device_plugin_context.clone(); - ctx.expect().return_once(move |_, _, _| { - // let mut discovery_handler_status_seq = Sequence::new(); - let mut mock = MockDiscoveryOperator::default(); - mock.expect_get_discovery_handler_map() - .returning(move || discovery_handler_map_clone.clone()); - mock.expect_get_config() - .returning(move || config_clone.clone()); - mock.expect_get_device_plugin_context() - .returning(move || device_plugin_context_clone.clone()); - mock.expect_get_config_id() - .returning(move || config_id.clone()); - mock - }); - MockDiscoveryOperator::new(discovery_handler_map, config, device_plugin_context) - } - - // 
Creates a discovery handler with specified properties and adds it to the RegisteredDiscoveryHandlerMap. - pub fn add_discovery_handler_to_map( - dh_name: &str, - endpoint: &DiscoveryHandlerEndpoint, - shared: bool, - registered_dh_map: RegisteredDiscoveryHandlerMap, - ) { - let discovery_handler_details = - create_discovery_handler_details(dh_name, endpoint.clone(), shared); - // Add discovery handler to registered discovery handler map - let dh_details_map = match registered_dh_map.lock().unwrap().clone().get_mut(dh_name) { - Some(dh_details_map) => { - if !dh_details_map.contains_key(endpoint) { - dh_details_map.insert(endpoint.clone(), discovery_handler_details); - } - dh_details_map.clone() - } - None => { - let mut dh_details_map = HashMap::new(); - dh_details_map.insert(endpoint.clone(), discovery_handler_details); - dh_details_map - } - }; - registered_dh_map - .lock() - .unwrap() - .insert(dh_name.to_string(), dh_details_map); - } - - fn create_discovery_handler_details( - name: &str, - endpoint: DiscoveryHandlerEndpoint, - shared: bool, - ) -> DiscoveryDetails { - let (close_discovery_handler_connection, _) = broadcast::channel(2); - DiscoveryDetails { - name: name.to_string(), - endpoint, - shared, - close_discovery_handler_connection, - } - } - - fn setup_test_do_discover( - config_name: &str, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - ) -> MockDiscoveryOperator { - add_discovery_handler_to_map( - "debugEcho", - &DiscoveryHandlerEndpoint::Uds("socket.sock".to_string()), - false, - discovery_handler_map.clone(), - ); - - // Build discovery operator - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let mut config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - config.metadata.name = Some(config_name.to_string()); - config.metadata.namespace = Some(config_name.to_string()); - create_mock_discovery_operator( - discovery_handler_map, 
- config, - Arc::new(RwLock::new(DevicePluginContext::default())), - ) - } - - #[test] - fn test_generate_instance_digest() { - let id = "video1"; - let first_unshared_video_digest = generate_instance_digest(id, false, "node-a"); - let first_shared_video_digest = generate_instance_digest(id, true, "node-a"); - - let second_unshared_video_digest = generate_instance_digest(id, false, "node-b"); - let second_shared_video_digest = generate_instance_digest(id, true, "node-b"); - // unshared instances visible to different nodes should NOT have the same digest - assert_ne!(first_unshared_video_digest, second_unshared_video_digest); - // shared instances visible to different nodes should have the same digest - assert_eq!(first_shared_video_digest, second_shared_video_digest); - // shared and unshared instance for same node should NOT have the same digest - assert_ne!(first_unshared_video_digest, first_shared_video_digest); - } - - #[tokio::test] - async fn test_internal_do_discover_stop_one() { - let mock_kube_interface1: Arc = Arc::new(MockKubeInterface::new()); - let mock_kube_interface2 = mock_kube_interface1.clone(); - let dh_name = "debugEcho"; - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let endpoint = DiscoveryHandlerEndpoint::Uds("socket.sock".to_string()); - add_discovery_handler_to_map(dh_name, &endpoint, false, discovery_handler_map.clone()); - let dh_details1 = discovery_handler_map - .lock() - .unwrap() - .get(dh_name) - .unwrap() - .get(&endpoint) - .unwrap() - .clone(); - let dh_details2 = dh_details1.clone(); - - let (_tx1, mut rx1) = mpsc::channel(2); - let (_tx2, mut rx2) = mpsc::channel(2); - - let config1: Configuration = serde_yaml::from_str( - std::fs::read_to_string("../test/yaml/config-a.yaml") - .expect("Unable to read file") - .as_str(), - ) - .unwrap(); - let discovery_operator1 = Arc::new(DiscoveryOperator::new( - discovery_handler_map.clone(), - config1, - 
Arc::new(RwLock::new(DevicePluginContext::default())), - )); - let local_do1 = discovery_operator1.clone(); - let discover1 = tokio::spawn(async move { - discovery_operator1 - .internal_do_discover( - mock_kube_interface1, - &dh_details1, - &mut rx1, - "node-a".to_string(), - ) - .await - .unwrap() - }); - - let config2: Configuration = serde_yaml::from_str( - std::fs::read_to_string("../test/yaml/config-b.yaml") - .expect("Unable to read file") - .as_str(), - ) - .unwrap(); - let discovery_operator2 = Arc::new(DiscoveryOperator::new( - discovery_handler_map, - config2, - Arc::new(RwLock::new(DevicePluginContext::default())), - )); - let discover2 = tokio::spawn(async move { - discovery_operator2 - .internal_do_discover( - mock_kube_interface2, - &dh_details2, - &mut rx2, - "node-a".to_string(), - ) - .await - .unwrap() - }); - tokio::time::sleep(Duration::from_millis(100)).await; // Make sure they had time to launch - local_do1.stop_all_discovery().await; - assert!(tokio::time::timeout(Duration::from_millis(100), discover1) - .await - .is_ok()); - assert!(tokio::time::timeout(Duration::from_millis(100), discover2) - .await - .is_err()); - } - - #[tokio::test] - async fn test_stop_all_discovery() { - let dh_name = "debugEcho"; - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let endpoint1 = DiscoveryHandlerEndpoint::Uds("socket.sock".to_string()); - add_discovery_handler_to_map(dh_name, &endpoint1, false, discovery_handler_map.clone()); - let mut close_discovery_handler_connection_receiver1 = discovery_handler_map - .lock() - .unwrap() - .get(dh_name) - .unwrap() - .get(&endpoint1) - .unwrap() - .close_discovery_handler_connection - .subscribe(); - let endpoint2 = DiscoveryHandlerEndpoint::Uds("socket2.sock".to_string()); - add_discovery_handler_to_map(dh_name, &endpoint2, false, discovery_handler_map.clone()); - let mut close_discovery_handler_connection_receiver2 = discovery_handler_map - .lock() - .unwrap() - .get(dh_name) - 
.unwrap() - .get(&endpoint2) - .unwrap() - .close_discovery_handler_connection - .subscribe(); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let discovery_operator = Arc::new(DiscoveryOperator::new( - discovery_handler_map, - config, - Arc::new(RwLock::new(DevicePluginContext::default())), - )); - tokio::spawn(async move { - discovery_operator.stop_all_discovery().await; - }); - assert!(close_discovery_handler_connection_receiver1 - .recv() - .await - .is_ok()); - assert!(close_discovery_handler_connection_receiver2 - .recv() - .await - .is_ok()); - } - - #[tokio::test] - async fn test_start_discovery_termination() { - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let mut start_discovery_components = - start_discovery_setup("config-a", true, discovery_handler_map).await; - start_discovery_components - .running_receiver - .recv() - .await - .unwrap(); - start_discovery_components - .stop_all_discovery_sender - .send(()) - .unwrap(); - start_discovery_components - .finished_discovery_receiver - .recv() - .await - .unwrap(); - // Make sure that all threads have finished - start_discovery_components - .start_discovery_handle - .await - .unwrap(); - } - - // Test that start discovery can be called twice for two (differently named) - // Configurations that use the same DH. 
- #[tokio::test] - async fn test_start_discovery_same_discovery_handler() { - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let mut start_discovery_components_a = - start_discovery_setup("config-a", false, discovery_handler_map.clone()).await; - let mut start_discovery_components_b = - start_discovery_setup("config-b", false, discovery_handler_map.clone()).await; - - start_discovery_components_a - .running_receiver - .recv() - .await - .unwrap(); - start_discovery_components_b - .running_receiver - .recv() - .await - .unwrap(); - } - - struct StartDiscoveryComponents { - finished_discovery_receiver: tokio::sync::mpsc::Receiver<()>, - stop_all_discovery_sender: tokio::sync::broadcast::Sender<()>, - running_receiver: tokio::sync::broadcast::Receiver<()>, - start_discovery_handle: tokio::task::JoinHandle<()>, - } - - async fn start_discovery_setup( - config_name: &str, - terminate: bool, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - ) -> StartDiscoveryComponents { - let mut mock_discovery_operator = - setup_test_do_discover(config_name, discovery_handler_map.clone()); - let (running_sender, running_receiver) = tokio::sync::broadcast::channel::<()>(1); - mock_discovery_operator - .expect_get_stream() - .returning(move |_, _| { - running_sender.clone().send(()).unwrap(); - None - }); - - mock_discovery_operator - .expect_delete_offline_instances() - .times(1) - .returning(move |_, _| Ok(())); - if terminate { - let stop_dh_discovery_sender = discovery_handler_map - .lock() - .unwrap() - .get_mut("debugEcho") - .unwrap() - .clone() - .get(&DiscoveryHandlerEndpoint::Uds("socket.sock".to_string())) - .unwrap() - .clone() - .close_discovery_handler_connection; - let local_config_id = mock_discovery_operator.get_config_id(); - mock_discovery_operator - .expect_stop_all_discovery() - .times(1) - .returning(move || { - stop_dh_discovery_sender - .clone() - 
.send(Some(local_config_id.clone())) - .unwrap(); - }); - } - // Config timestamp should be called - mock_discovery_operator - .expect_get_config_timestamp() - .times(1) - .returning(Instant::now); - let (mut finished_discovery_sender, finished_discovery_receiver) = - tokio::sync::mpsc::channel(2); - let (new_dh_sender, _) = broadcast::channel(2); - let (stop_all_discovery_sender, _) = broadcast::channel(2); - let thread_stop_all_discovery_sender = stop_all_discovery_sender.clone(); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - let mut mock_device_plugin_builder = Box::new(MockDevicePluginBuilderInterface::new()); - mock_device_plugin_builder - .expect_build_configuration_device_plugin() - .times(1) - .returning(move |_, _, _, _| { - let (sender, _) = broadcast::channel(2); - Ok(sender) - }); - - let start_discovery_handle = tokio::spawn(async move { - start_discovery::internal_start_discovery( - mock_discovery_operator, - new_dh_sender.to_owned(), - thread_stop_all_discovery_sender, - &mut finished_discovery_sender, - mock_kube_interface, - mock_device_plugin_builder, - "node-a".to_string(), - ) - .await - .unwrap(); - }); - StartDiscoveryComponents { - finished_discovery_receiver, - stop_all_discovery_sender, - running_receiver, - start_discovery_handle, - } - } - - // Test that DH is connected to on second try getting stream. 
- #[tokio::test] - async fn test_do_discover_completed_internal_connection() { - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let mut mock_discovery_operator = setup_test_do_discover("config-a", discovery_handler_map); - let mut get_stream_seq = Sequence::new(); - // First time cannot get stream - mock_discovery_operator - .expect_get_stream() - .times(1) - .returning(|_, _| None) - .in_sequence(&mut get_stream_seq); - // Second time successfully get stream - let (_, rx) = mpsc::channel(2); - let stream_type = Some(StreamType::Embedded(rx)); - mock_discovery_operator - .expect_get_stream() - .times(1) - .return_once(move |_, _| stream_type) - .in_sequence(&mut get_stream_seq); - // Discovery should be initiated - mock_discovery_operator - .expect_internal_do_discover() - .times(1) - .returning(|_, _, _, _| Ok(())); - // Config timestamp should be called - mock_discovery_operator - .expect_get_config_timestamp() - .times(1) - .returning(Instant::now); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - start_discovery::do_discover( - Arc::new(mock_discovery_operator), - mock_kube_interface, - "node-a".to_string(), - ) - .await - .unwrap(); - } - - #[tokio::test] - async fn test_handle_discovery_results() { - let _ = env_logger::builder().is_test(true).try_init(); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - let discovery_handler_map: RegisteredDiscoveryHandlerMap = - Arc::new(std::sync::Mutex::new(HashMap::new())); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let config_name = config.metadata.name.clone().unwrap(); - INSTANCE_COUNT_METRIC - .with_label_values(&[&config_name, "true"]) - .set(0); - let device1 = Device { - id: "device1".to_string(), - 
properties: HashMap::new(), - mounts: Vec::default(), - device_specs: Vec::default(), - }; - let device2 = Device { - id: "device2".to_string(), - properties: HashMap::new(), - mounts: Vec::default(), - device_specs: Vec::default(), - }; - let discovery_results: Vec = vec![device1, device2]; - let discovery_operator = Arc::new(DiscoveryOperator::new( - discovery_handler_map, - config, - Arc::new(RwLock::new(DevicePluginContext::default())), - )); - let mut mock_device_plugin_builder = MockDevicePluginBuilderInterface::new(); - mock_device_plugin_builder - .expect_build_device_plugin() - .times(2) - .returning(move |_, _, _, _, _, _| Ok(())); - discovery_operator - .handle_discovery_results( - mock_kube_interface, - discovery_results, - true, - Box::new(mock_device_plugin_builder), - "node-a".to_string(), - ) - .await - .unwrap(); - assert_eq!( - INSTANCE_COUNT_METRIC - .with_label_values(&[&config_name, "true"]) - .get(), - 2 - ); - } - - // Checks either that InstanceConnectivityStatus changed to expected value until success or exceeded tries - // or that all instances have been deleted from map. - // Sleep between tries to give update_instance_connectivity_status the chance chance to grab mutex InstanceMap. 
- async fn check_status_or_empty_loop( - status: InstanceConnectivityStatus, - equality: bool, - device_plugin_context: Arc>, - check_empty: bool, - ) { - let mut keep_looping = false; - let mut map_is_empty = false; - let tries: i8 = 5; - for _x in 0..tries { - println!("try number {}", _x); - keep_looping = false; - tokio::time::sleep(Duration::from_millis(100)).await; - let unwrapped_device_plugin_context = device_plugin_context.read().await.clone(); - if check_empty && unwrapped_device_plugin_context.instances.is_empty() { - map_is_empty = true; - break; - } - for (_, instance_info) in unwrapped_device_plugin_context.instances { - if instance_info.connectivity_status != status && equality { - keep_looping = true; - } - if instance_info.connectivity_status == status && !equality { - keep_looping = true; - } - } - if !keep_looping { - break; - } - } - if keep_looping { - panic!( - "failed to assert that all instances had status equal T/F: [{}] to status [{:?}]", - equality, status - ); - } - if check_empty && !map_is_empty { - panic!("instances were not cleared from map"); - } - } - - fn get_test_instance(nodes: Vec<&str>) -> akri_shared::akri::instance::Instance { - let nodes = nodes.into_iter().map(|e| e.to_string()).collect(); - let mut instance: akri_shared::akri::instance::Instance = serde_json::from_str( - r#" - { - "apiVersion": "akri.sh/v0", - "kind": "Instance", - "metadata": { - "name": "foo", - "namespace": "bar", - "uid": "abcdegfh-ijkl-mnop-qrst-uvwxyz012345" - }, - "spec": { - "configurationName": "", - "nodes": [], - "shared": true, - "deviceUsage": {} - } - } - "#, - ) - .unwrap(); - instance.spec.nodes = nodes; - instance - } - - #[tokio::test] - async fn test_delete_offline_instances() { - let _ = env_logger::builder().is_test(true).try_init(); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = 
serde_yaml::from_str(&config_yaml).unwrap(); - let mut list_and_watch_message_receivers = Vec::new(); - let discovery_handler_map: RegisteredDiscoveryHandlerMap = - Arc::new(std::sync::Mutex::new(HashMap::new())); - let mut visible_discovery_results = Vec::new(); - - // Assert no action (to delete instances by mock kube interface) is taken for all online instances - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Online, - ) - .await; - let mock = MockKubeInterface::new(); - let discovery_operator = Arc::new(DiscoveryOperator::new( - discovery_handler_map.clone(), - config.clone(), - device_plugin_context, - )); - discovery_operator - .delete_offline_instances(Arc::new(mock), "node-a".to_string()) - .await - .unwrap(); - - // Assert no action (to delete instances by mock kube interface) is taken for instances offline for less than grace period - let mock_now = Instant::now(); - MockClock::advance(Duration::from_secs(30)); - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Offline(mock_now), - ) - .await; - let mock = MockKubeInterface::new(); - let discovery_operator = Arc::new(DiscoveryOperator::new( - discovery_handler_map.clone(), - config.clone(), - device_plugin_context, - )); - discovery_operator - .delete_offline_instances(Arc::new(mock), "node-a".to_string()) - .await - .unwrap(); - - // Assert that all instances that have been offline for more than 5 minutes are deleted - let mock_now = Instant::now(); - MockClock::advance(Duration::from_secs(301)); - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Offline(mock_now), - ) - .await; - let mut mock = MockKubeInterface::new(); - let instance = 
get_test_instance(vec![]); - mock.expect_find_instance() - .times(2) - .returning(move |_, _| Ok(instance.clone())); - mock.expect_delete_instance() - .times(2) - .returning(move |_, _| Ok(())); - let discovery_operator = Arc::new(DiscoveryOperator::new( - discovery_handler_map.clone(), - config.clone(), - device_plugin_context.clone(), - )); - discovery_operator - .delete_offline_instances(Arc::new(mock), "node-a".to_string()) - .await - .unwrap(); - - // Make sure all instances are deleted from map. Note, first 3 arguments are ignored. - check_status_or_empty_loop( - InstanceConnectivityStatus::Online, - true, - device_plugin_context, - true, - ) - .await; - } - - // 1: InstanceConnectivityStatus of all instances that go offline is changed from Online to Offline - // 2: InstanceConnectivityStatus of shared instances that come back online in under 5 minutes is changed from Offline to Online - // 3: InstanceConnectivityStatus of unshared instances that come back online before next periodic discovery is changed from Offline to Online - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn test_update_instance_connectivity_status_factory() { - let _ = env_logger::builder().is_test(true).try_init(); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let config_name = config.metadata.name.clone().unwrap(); - let mut list_and_watch_message_receivers = Vec::new(); - let mut visible_discovery_results = Vec::new(); - let discovery_handler_map: RegisteredDiscoveryHandlerMap = - Arc::new(std::sync::Mutex::new(HashMap::new())); - let discovery_handler_map_clone = discovery_handler_map.clone(); - // set environment variable to set whether debug echo instances are shared - let mut mock_env_var_shared = MockEnvVarQuery::new(); - mock_env_var_shared - .expect_get_env_var() - .returning(|_| 
Ok("false".to_string())); - inner_register_embedded_discovery_handlers( - discovery_handler_map_clone, - &mock_env_var_shared, - ) - .unwrap(); - - // - // 1: Assert that InstanceConnectivityStatus of non local instances that are no longer visible is changed to Offline - // - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Online, - ) - .await; - let shared = true; - run_update_instance_connectivity_status( - config.clone(), - HashMap::new(), - shared, - device_plugin_context.clone(), - discovery_handler_map.clone(), - MockKubeInterface::new(), - ) - .await; - - // Check that no instances are still online - check_status_or_empty_loop( - InstanceConnectivityStatus::Online, - false, - device_plugin_context, - false, - ) - .await; - - // - // 2: Assert that InstanceConnectivityStatus of shared instances that come back online in <5 mins is changed to Online - // - let mock_now = Instant::now(); - MockClock::advance(Duration::from_secs(30)); - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Offline(mock_now), - ) - .await; - let currently_visible_instances: HashMap = visible_discovery_results - .iter() - .map(|device| { - let instance_name = get_device_instance_name(&device.id, &config_name); - (instance_name, device.clone()) - }) - .collect(); - let shared = true; - run_update_instance_connectivity_status( - config.clone(), - currently_visible_instances.clone(), - shared, - device_plugin_context.clone(), - discovery_handler_map.clone(), - MockKubeInterface::new(), - ) - .await; - - // Check that all instances marked online - check_status_or_empty_loop( - InstanceConnectivityStatus::Online, - true, - device_plugin_context, - false, - ) - .await; - - // - // 3: Assert that shared instances that are offline for more than 5 
minutes are removed from the instance map - // - let mock_now = Instant::now(); - MockClock::advance(Duration::from_secs(301)); - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Offline(mock_now), - ) - .await; - let mut mock = MockKubeInterface::new(); - let instance = get_test_instance(vec![]); - mock.expect_find_instance() - .times(2) - .returning(move |_, _| Ok(instance.clone())); - mock.expect_delete_instance() - .times(2) - .returning(move |_, _| Ok(())); - let shared = true; - run_update_instance_connectivity_status( - config.clone(), - HashMap::new(), - shared, - device_plugin_context.clone(), - discovery_handler_map.clone(), - mock, - ) - .await; - // Make sure all instances are deleted from map. Note, first 3 arguments are ignored. - check_status_or_empty_loop( - InstanceConnectivityStatus::Online, - true, - device_plugin_context, - true, - ) - .await; - - // - // 4: Assert that local devices that go offline are removed from the instance map - // - let mut mock = MockKubeInterface::new(); - let instance = get_test_instance(vec![]); - mock.expect_find_instance() - .times(2) - .returning(move |_, _| Ok(instance.clone())); - mock.expect_delete_instance() - .times(2) - .returning(move |_, _| Ok(())); - - let device_plugin_context = build_device_plugin_context( - &config, - &mut visible_discovery_results, - &mut list_and_watch_message_receivers, - InstanceConnectivityStatus::Online, - ) - .await; - let shared = false; - run_update_instance_connectivity_status( - config, - HashMap::new(), - shared, - device_plugin_context.clone(), - discovery_handler_map.clone(), - mock, - ) - .await; - // Make sure all instances are deleted from map. Note, first 3 arguments are ignored. 
- check_status_or_empty_loop( - InstanceConnectivityStatus::Online, - true, - device_plugin_context, - true, - ) - .await; - } - - async fn run_update_instance_connectivity_status( - config: Configuration, - currently_visible_instances: HashMap, - shared: bool, - device_plugin_context: Arc>, - discovery_handler_map: RegisteredDiscoveryHandlerMap, - mock: MockKubeInterface, - ) { - let discovery_operator = Arc::new(DiscoveryOperator::new( - discovery_handler_map, - config, - device_plugin_context.clone(), - )); - discovery_operator - .update_instance_connectivity_status( - Arc::new(mock), - currently_visible_instances, - shared, - "node-a".to_string(), - ) - .await - .unwrap(); - } - - fn create_discovery_operator(path_to_config: &str) -> DiscoveryOperator { - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - DiscoveryOperator::new( - discovery_handler_map, - config, - Arc::new(RwLock::new(DevicePluginContext::default())), - ) - } - - fn setup_non_mocked_dh( - dh_name: &str, - endpoint: &DiscoveryHandlerEndpoint, - ) -> DiscoveryOperator { - let discovery_operator = create_discovery_operator("../test/yaml/config-a.yaml"); - add_discovery_handler_to_map( - dh_name, - endpoint, - false, - discovery_operator.discovery_handler_map.clone(), - ); - discovery_operator - } - - #[tokio::test] - async fn test_get_stream_embedded() { - let _ = env_logger::builder().is_test(true).try_init(); - std::env::set_var(super::super::constants::ENABLE_DEBUG_ECHO_LABEL, "yes"); - let path_to_config = "../test/yaml/config-a.yaml"; - let config_yaml = std::fs::read_to_string(path_to_config).expect("Unable to read file"); - let config: Configuration = serde_yaml::from_str(&config_yaml).unwrap(); - let discovery_handler_map = Arc::new(std::sync::Mutex::new(HashMap::new())); - let endpoint = 
DiscoveryHandlerEndpoint::Embedded; - let dh_name = akri_debug_echo::DISCOVERY_HANDLER_NAME.to_string(); - add_discovery_handler_to_map(&dh_name, &endpoint, false, discovery_handler_map.clone()); - let discovery_operator = DiscoveryOperator::new( - discovery_handler_map, - config, - Arc::new(RwLock::new(DevicePluginContext::default())), - ); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - - // test embedded debugEcho socket - if let Some(StreamType::Embedded(_)) = discovery_operator - .get_stream(mock_kube_interface, &DiscoveryHandlerEndpoint::Embedded) - .await - { - // expected - } else { - panic!("expected internal stream"); - } - } - - async fn setup_and_run_mock_discovery_handler( - endpoint: &str, - endpoint_dir: &str, - dh_endpoint: &DiscoveryHandlerEndpoint, - return_error: bool, - ) -> DiscoveryOperator { - let discovery_operator = setup_non_mocked_dh("mockName", dh_endpoint); - // Start mock DH, specifying that it should successfully run - let _dh_server_thread_handle = mock_discovery_handler::run_mock_discovery_handler( - endpoint_dir, - endpoint, - return_error, - Vec::new(), - ) - .await; - // Make sure registration server has started - akri_shared::uds::unix_stream::try_connect(endpoint) - .await - .unwrap(); - discovery_operator - } - - #[tokio::test] - async fn test_get_stream_no_dh() { - let (_, endpoint) = - mock_discovery_handler::get_mock_discovery_handler_dir_and_endpoint("mock.sock"); - let dh_endpoint = DiscoveryHandlerEndpoint::Uds(endpoint.to_string()); - let discovery_operator = setup_non_mocked_dh("mock", &dh_endpoint); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - - // Should not be able to get stream if DH is not running - assert!(discovery_operator - .get_stream(mock_kube_interface, &dh_endpoint) - .await - .is_none()); - } - - #[tokio::test] - async fn test_get_stream_error() { - // Start mock DH, specifying that it should return an error - let return_error = true; - let (endpoint_dir, 
endpoint) = - mock_discovery_handler::get_mock_discovery_handler_dir_and_endpoint("mock.sock"); - let dh_endpoint = DiscoveryHandlerEndpoint::Uds(endpoint.to_string()); - let discovery_operator = setup_and_run_mock_discovery_handler( - &endpoint, - &endpoint_dir, - &dh_endpoint, - return_error, - ) - .await; - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - - // Assert that get_stream returns none if the DH returns error - assert!(discovery_operator - .get_stream(mock_kube_interface, &dh_endpoint) - .await - .is_none()); - } - - #[tokio::test] - async fn test_get_stream_external_success() { - // Start mock DH, specifying that it should NOT return an error - let return_error = false; - let (endpoint_dir, endpoint) = - mock_discovery_handler::get_mock_discovery_handler_dir_and_endpoint("mock.sock"); - let dh_endpoint = DiscoveryHandlerEndpoint::Uds(endpoint.to_string()); - let discovery_operator = setup_and_run_mock_discovery_handler( - &endpoint, - &endpoint_dir, - &dh_endpoint, - return_error, - ) - .await; - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - - if let Some(StreamType::External(mut receiver)) = discovery_operator - .get_stream(mock_kube_interface, &dh_endpoint) - .await - { - // MockDiscoveryHandler returns an empty array of devices - assert_eq!( - receiver.get_message().await.unwrap().unwrap().devices.len(), - 0 - ); - } else { - panic!("expected external stream"); - } - } - - #[tokio::test] - async fn test_get_discovery_properties_no_properties() { - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_operator = create_discovery_operator("../test/yaml/config-a.yaml"); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - - // properties should be empty if not specified - assert!(discovery_operator - .get_discovery_properties(mock_kube_interface, &None) - .await - .unwrap() - .is_empty()); - } - - #[tokio::test] - async fn test_get_discovery_properties_empty_property_list() 
{ - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_operator = create_discovery_operator("../test/yaml/config-a.yaml"); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - let properties = Vec::::new(); - - // properties should be empty if property list is empty - assert!(discovery_operator - .get_discovery_properties(mock_kube_interface, &Some(properties)) - .await - .unwrap() - .is_empty()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_no_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_operator = create_discovery_operator("../test/yaml/config-a.yaml"); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - let property_name_1 = "property_name_1".to_string(); - let property_name_2 = "".to_string(); // allow empty property name - let properties = vec![ - DiscoveryProperty { - name: property_name_1.clone(), - value: None, - value_from: None, - }, - DiscoveryProperty { - name: property_name_2.clone(), - value: None, - value_from: None, - }, - ]; - let expected_result = HashMap::from([ - (property_name_1, ByteData { vec: None }), - (property_name_2, ByteData { vec: None }), - ]); - - // properties should only contain (name, None) if no value specified - let result = discovery_operator - .get_discovery_properties(mock_kube_interface.clone(), &Some(properties)) - .await - .unwrap(); - assert_eq!(result, expected_result); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_with_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let discovery_operator = create_discovery_operator("../test/yaml/config-a.yaml"); - let mock_kube_interface: Arc = Arc::new(MockKubeInterface::new()); - let property_name_1 = "property_name_1".to_string(); - let property_name_2 = "".to_string(); // allow empty property name - let property_value_1 = "property_value_1".to_string(); - let property_value_2 = "property_value_2".to_string(); - let 
properties = vec![ - DiscoveryProperty { - name: property_name_1.clone(), - value: Some(property_value_1.clone()), - value_from: None, - }, - DiscoveryProperty { - name: property_name_2.clone(), - value: Some(property_value_2.clone()), - value_from: None, - }, - ]; - let expected_result = HashMap::from([ - ( - property_name_1, - ByteData { - vec: Some(property_value_1.into()), - }, - ), - ( - property_name_2, - ByteData { - vec: Some(property_value_2.into()), - }, - ), - ]); - - // properties should contains (name, value) if specified - let result = discovery_operator - .get_discovery_properties(mock_kube_interface, &Some(properties)) - .await - .unwrap(); - assert_eq!(result, expected_result); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_no_secret_found() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_secret"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| Ok(None)); - - // get_discovery_property_value_from_secret should return error if secret not found - let result = get_discovery_property_value_from_secret(&mock_kube_client, &selector).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_no_secret_found_optional() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_secret"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: 
secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(true), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| Ok(None)); - - // get_discovery_property_value_from_secret for an optional key should return None if secret not found - let result = get_discovery_property_value_from_secret(&mock_kube_client, &selector).await; - assert!(result.unwrap().is_none()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_no_key() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_secret"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| Ok(Some(Secret::default()))); - - // get_discovery_property_value_from_secret should return error if key in secret not found - assert!( - get_discovery_property_value_from_secret(&mock_kube_client, &selector,) - .await - .is_err() - ); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_no_key_optional() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(true), - }; - - let mut mock_kube_client = 
MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| Ok(Some(Secret::default()))); - - // get_discovery_property_value_from_secret for an optional key should return None if key in secret not found - let result = get_discovery_property_value_from_secret(&mock_kube_client, &selector).await; - assert!(result.unwrap().is_none()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_no_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_secret"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| { - let secret = Secret { - data: Some(BTreeMap::new()), - ..Default::default() - }; - Ok(Some(secret)) - }); - - // get_discovery_property_value_from_secret should return error if no value in secret - let result = get_discovery_property_value_from_secret(&mock_kube_client, &selector).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_no_value_optional() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(true), - }; - - let mut mock_kube_client = 
MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| { - let secret = Secret { - data: Some(BTreeMap::new()), - ..Default::default() - }; - Ok(Some(secret)) - }); - - // get_discovery_property_value_from_secret for an optional key should return None if key in secret not found - let result = get_discovery_property_value_from_secret(&mock_kube_client, &selector).await; - assert!(result.unwrap().is_none()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_secret_data_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let secret_name = "secret_1"; - let key_in_secret = "key_in_secret"; - let value_in_secret = "value_in_secret"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_secret.to_string(), - name: secret_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_secret() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == secret_name - }) - .returning(move |_, _| { - let data = BTreeMap::from([( - key_in_secret.to_string(), - ByteString(value_in_secret.into()), - )]); - let secret = Secret { - data: Some(data), - ..Default::default() - }; - Ok(Some(secret)) - }); - - let expected_result = ByteData { - vec: Some(value_in_secret.into()), - }; - - // get_discovery_property_value_from_secret should return correct value if data value in secret - let result = get_discovery_property_value_from_secret(&mock_kube_client, &selector).await; - assert_eq!(result.unwrap().unwrap(), expected_result); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_no_config_map_found() { - let _ = env_logger::builder().is_test(true).try_init(); - 
let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| Ok(None)); - - // get_discovery_property_value_from_config_map should return error if configMap not found - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_no_config_map_found_optional() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(true), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| Ok(None)); - - // get_discovery_property_value_from_config_map for an optional key should return None if configMap not found - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert!(result.unwrap().is_none()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_no_key() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = 
"namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| Ok(Some(ConfigMap::default()))); - - // get_discovery_property_value_from_config_map should return error if key in configMap not found - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_no_key_optional() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(true), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| Ok(Some(ConfigMap::default()))); - - // get_discovery_property_value_from_config_map for an optional key should return None if key in configMap not found - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert!(result.unwrap().is_none()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_no_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let 
namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| { - let config_map = ConfigMap { - data: Some(BTreeMap::new()), - binary_data: Some(BTreeMap::new()), - ..Default::default() - }; - Ok(Some(config_map)) - }); - - // get_discovery_property_value_from_config_map should return error if no value in configMap - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_no_value_optional() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(true), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| { - let config_map = ConfigMap { - data: Some(BTreeMap::new()), - binary_data: Some(BTreeMap::new()), - ..Default::default() - }; - Ok(Some(config_map)) - }); - - // get_discovery_property_value_from_config_map for an optional key should return None if key in configMap not found - let result = - 
get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert!(result.unwrap().is_none()); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_data_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - let value_in_config_map = "value_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| { - let data = BTreeMap::from([( - key_in_config_map.to_string(), - value_in_config_map.to_string(), - )]); - let config_map = ConfigMap { - data: Some(data), - binary_data: Some(BTreeMap::new()), - ..Default::default() - }; - Ok(Some(config_map)) - }); - - let expected_result = ByteData { - vec: Some(value_in_config_map.into()), - }; - - // get_discovery_property_value_from_config_map should return correct value if data value in configMap - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert_eq!(result.unwrap().unwrap(), expected_result); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_binary_data_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - let value_in_config_map = "value_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: 
namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| { - let binary_data = BTreeMap::from([( - key_in_config_map.to_string(), - ByteString(value_in_config_map.into()), - )]); - let config_map = ConfigMap { - data: Some(BTreeMap::new()), - binary_data: Some(binary_data), - ..Default::default() - }; - Ok(Some(config_map)) - }); - - let expected_result = ByteData { - vec: Some(value_in_config_map.into()), - }; - - // get_discovery_property_value_from_config_map should return correct value if binary data value in configMap - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert_eq!(result.unwrap().unwrap(), expected_result); - } - - #[tokio::test] - async fn test_get_discovery_properties_value_from_config_map_data_and_binary_data_value() { - let _ = env_logger::builder().is_test(true).try_init(); - let namespace_name = "namespace_name"; - let config_map_name = "config_map_1"; - let key_in_config_map = "key_in_config_map"; - let value_in_config_map = "value_in_config_map"; - let binary_value_in_config_map = "binary_value_in_config_map"; - - let selector = DiscoveryPropertyKeySelector { - key: key_in_config_map.to_string(), - name: config_map_name.to_string(), - namespace: namespace_name.to_string(), - optional: Some(false), - }; - - let mut mock_kube_client = MockKubeClient::new(); - mock_kube_client - .expect_get_config_map() - .times(1) - .withf(move |name: &str, namespace: &str| { - namespace == namespace_name && name == config_map_name - }) - .returning(move |_, _| { - let data = BTreeMap::from([( - key_in_config_map.to_string(), - value_in_config_map.to_string(), - )]); - let binary_data = BTreeMap::from([( - key_in_config_map.to_string(), - 
ByteString(binary_value_in_config_map.into()), - )]); - let config_map = ConfigMap { - data: Some(data), - binary_data: Some(binary_data), - ..Default::default() - }; - Ok(Some(config_map)) - }); - - let expected_result = ByteData { - vec: Some(value_in_config_map.into()), - }; - - // get_discovery_property_value_from_config_map should return value from data if both data and binary data value exist - let result = - get_discovery_property_value_from_config_map(&mock_kube_client, &selector).await; - assert_eq!(result.unwrap().unwrap(), expected_result); - } - - #[tokio::test] - async fn test_try_delete_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - // should do nothing for non existing instance - let mut kube_interface = MockKubeInterface::new(); - kube_interface - .expect_find_instance() - .with(eq("foo"), eq("bar")) - .returning(move |_, _| Err(anyhow::format_err!("Not Found"))); - try_delete_instance(&kube_interface, "foo", "bar", "node-a".to_string()) - .await - .unwrap(); - - // should delete instance with already empty node list - let mut kube_interface = MockKubeInterface::new(); - let instance = get_test_instance(vec![]); - kube_interface - .expect_find_instance() - .with(eq("foo"), eq("bar")) - .returning(move |_, _| Ok(instance.clone())); - kube_interface - .expect_delete_instance() - .with(eq("foo"), eq("bar")) - .returning(move |_, _| Ok(())); - try_delete_instance(&kube_interface, "foo", "bar", "node-a".to_string()) - .await - .unwrap(); - - // should delete instance with then empty node list - let mut kube_interface = MockKubeInterface::new(); - let instance = get_test_instance(vec!["node-a"]); - kube_interface - .expect_find_instance() - .with(eq("foo"), eq("bar")) - .returning(move |_, _| Ok(instance.clone())); - kube_interface - .expect_delete_instance() - .with(eq("foo"), eq("bar")) - .returning(move |_, _| Ok(())); - try_delete_instance(&kube_interface, "foo", "bar", "node-a".to_string()) - .await - .unwrap(); - - // 
should update instance with non empty node list - let mut kube_interface = MockKubeInterface::new(); - let instance = get_test_instance(vec!["node-a", "node-b"]); - kube_interface - .expect_find_instance() - .with(eq("foo"), eq("bar")) - .returning(move |_, _| Ok(instance.clone())); - kube_interface - .expect_update_instance() - .times(1) - .withf(move |instance, name, namespace| { - name == "foo" && namespace == "bar" && instance.nodes == vec!["node-b"] - }) - .returning(move |_, _, _| Ok(())); - try_delete_instance(&kube_interface, "foo", "bar", "node-a".to_string()) - .await - .unwrap(); - } -} diff --git a/agent/src/util/embedded_discovery_handlers.rs b/agent/src/util/embedded_discovery_handlers.rs deleted file mode 100644 index 541781f6e..000000000 --- a/agent/src/util/embedded_discovery_handlers.rs +++ /dev/null @@ -1,138 +0,0 @@ -use akri_debug_echo::discovery_handler::DebugEchoDiscoveryDetails; -use akri_discovery_utils::discovery::{ - v0::discovery_handler_server::DiscoveryHandler, DiscoverStream, -}; -#[cfg(any(test, feature = "onvif-feat"))] -use akri_onvif::discovery_handler::OnvifDiscoveryDetails; -#[cfg(any(test, feature = "opcua-feat"))] -use akri_opcua::discovery_handler::OpcuaDiscoveryDetails; -use akri_shared::{ - akri::configuration::DiscoveryHandlerInfo, - os::env_var::{ActualEnvVarQuery, EnvVarQuery}, -}; -#[cfg(any(test, feature = "udev-feat"))] -use akri_udev::discovery_handler::UdevDiscoveryDetails; -use anyhow::Error; -use log::trace; - -/// Returns the appropriate embedded DiscoveryHandler as determined by the deserialized discovery_details string. 
-pub fn get_discovery_handler( - discovery_handler_info: &DiscoveryHandlerInfo, -) -> Result>, Error> { - let query_var_set = ActualEnvVarQuery {}; - inner_get_discovery_handler(discovery_handler_info, &query_var_set) -} - -fn inner_get_discovery_handler( - discovery_handler_info: &DiscoveryHandlerInfo, - query: &impl EnvVarQuery, -) -> Result>, Error> { - trace!( - "inner_get_discovery_handler - for DiscoveryHandlerInfo {:?}", - discovery_handler_info - ); - // Determine whether it is an embedded discovery handler - match discovery_handler_info.name.as_str() { - #[cfg(any(test, feature = "onvif-feat"))] - akri_onvif::DISCOVERY_HANDLER_NAME => { - trace!("here in onvif"); - let _discovery_handler_config: OnvifDiscoveryDetails = serde_yaml::from_str(&discovery_handler_info.discovery_details).map_err(|e| anyhow::format_err!("ONVIF Configuration discovery details improperly configured with error {:?}", e))?; - Ok(Box::new( - akri_onvif::discovery_handler::DiscoveryHandlerImpl::new(None), - )) - } - #[cfg(any(test, feature = "udev-feat"))] - akri_udev::DISCOVERY_HANDLER_NAME => { - let _discovery_handler_config: UdevDiscoveryDetails = serde_yaml::from_str(&discovery_handler_info.discovery_details).map_err(|e| anyhow::format_err!("udev Configuration discovery details improperly configured with error {:?}", e))?; - Ok(Box::new( - akri_udev::discovery_handler::DiscoveryHandlerImpl::new(None), - )) - } - #[cfg(any(test, feature = "opcua-feat"))] - akri_opcua::DISCOVERY_HANDLER_NAME => { - let _discovery_handler_config: OpcuaDiscoveryDetails = serde_yaml::from_str(&discovery_handler_info.discovery_details).map_err(|e| anyhow::format_err!("OPC UA Configuration discovery details improperly configured with error {:?}", e))?; - Ok(Box::new( - akri_opcua::discovery_handler::DiscoveryHandlerImpl::new(None), - )) - } - akri_debug_echo::DISCOVERY_HANDLER_NAME => { - match query.get_env_var(super::constants::ENABLE_DEBUG_ECHO_LABEL) { - Ok(_) => { - let _discovery_handler_config: 
DebugEchoDiscoveryDetails = serde_yaml::from_str(&discovery_handler_info.discovery_details).map_err(|e| anyhow::format_err!("debug echo Configuration discovery details improperly configured with error {:?}", e))?; - Ok(Box::new( - akri_debug_echo::discovery_handler::DiscoveryHandlerImpl::new(None))) - }, - _ => Err(anyhow::format_err!("Debug echo discovery handler not configured")), - } - } - _ => Err(anyhow::format_err!( - "No embedded discovery handler found for configuration with discovery handler info {:?}", - discovery_handler_info - )), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use akri_shared::{akri::configuration::DiscoveryHandlerInfo, os::env_var::MockEnvVarQuery}; - use std::env::VarError; - - #[test] - fn test_inner_get_discovery_handler() { - let _ = env_logger::builder().is_test(true).try_init(); - let mock_query = MockEnvVarQuery::new(); - let deserialized = serde_json::from_str::( - r#"{"name":"onvif", "discoveryDetails":"{}"}"#, - ) - .unwrap(); - assert!(inner_get_discovery_handler(&deserialized, &mock_query).is_ok()); - - let udev_yaml = r#" - name: udev - discoveryDetails: |+ - udevRules: [] - "#; - let deserialized: DiscoveryHandlerInfo = serde_yaml::from_str(udev_yaml).unwrap(); - assert!(inner_get_discovery_handler(&deserialized, &mock_query).is_ok()); - - let yaml = r#" - name: opcua - discoveryDetails: |+ - opcuaDiscoveryMethod: - standard: {} - "#; - let deserialized: DiscoveryHandlerInfo = serde_yaml::from_str(yaml).unwrap(); - assert!(inner_get_discovery_handler(&deserialized, &mock_query).is_ok()); - - let deserialized = serde_json::from_str::( - r#"{"name":"random", "discoveryDetails":"some details"}"#, - ) - .unwrap(); - assert!(inner_get_discovery_handler(&deserialized, &mock_query).is_err()); - } - - #[tokio::test] - async fn test_factory_for_debug_echo() { - let debug_echo_yaml = r#" - discoveryHandler: - name: debugEcho - discoveryDetails: |+ - descriptions: - - "foo1" - "#; - let deserialized: DiscoveryHandlerInfo = 
serde_yaml::from_str(debug_echo_yaml).unwrap(); - // Test that errors without environment var set - let mut mock_query_without_var_set = MockEnvVarQuery::new(); - mock_query_without_var_set - .expect_get_env_var() - .returning(|_| Err(VarError::NotPresent)); - assert!(inner_get_discovery_handler(&deserialized, &mock_query_without_var_set,).is_err()); - // Test that succeeds when env var set - let mut mock_query_with_var_set = MockEnvVarQuery::new(); - mock_query_with_var_set - .expect_get_env_var() - .returning(|_| Ok("1".to_string())); - assert!(inner_get_discovery_handler(&deserialized, &mock_query_with_var_set).is_ok()); - } -} diff --git a/agent/src/util/mod.rs b/agent/src/util/mod.rs index cd796f956..7660b3239 100644 --- a/agent/src/util/mod.rs +++ b/agent/src/util/mod.rs @@ -1,13 +1,5 @@ -pub mod config_action; -pub mod constants; -pub mod crictl_containers; -mod device_plugin_builder; -mod device_plugin_service; -pub mod discovery_operator; -#[cfg(any(test, feature = "agent-full"))] -pub mod embedded_discovery_handlers; +pub mod discovery_configuration_controller; + mod metrics; -pub mod registration; -pub mod slot_reconciliation; -pub mod streaming_extension; -mod v1beta1; + +pub mod stopper; diff --git a/agent/src/util/registration.rs b/agent/src/util/registration.rs deleted file mode 100644 index fc1c8c7ee..000000000 --- a/agent/src/util/registration.rs +++ /dev/null @@ -1,460 +0,0 @@ -use super::config_action::ConfigId; -use super::constants::CLOSE_DISCOVERY_HANDLER_CONNECTION_CHANNEL_CAPACITY; -#[cfg(any(test, feature = "agent-full"))] -use super::constants::ENABLE_DEBUG_ECHO_LABEL; -use akri_discovery_utils::discovery::v0::{ - register_discovery_handler_request::EndpointType, - registration_server::{Registration, RegistrationServer}, - Empty, RegisterDiscoveryHandlerRequest, -}; -#[cfg(any(test, feature = "agent-full"))] -use akri_shared::os::env_var::{ActualEnvVarQuery, EnvVarQuery}; -use akri_shared::uds::unix_stream; -use futures::TryFutureExt; -use 
std::collections::HashMap; -use std::sync::{Arc, Mutex}; -use tokio::sync::broadcast; -use tonic::{transport::Server, Request, Response, Status}; - -/// Map of `DiscoveryHandlers` of the same type (registered with the same name) where key is the endpoint of the -/// Discovery Handler and value is `DiscoveryDetails`. -pub type DiscoveryHandlerDetailsMap = HashMap; - -/// Map of all registered `DiscoveryHandlers` where key is `DiscoveryHandler` name and value is a map of all -/// `DiscoveryHandlers` with that name. -pub type RegisteredDiscoveryHandlerMap = - Arc>>; - -/// Alias illustrating that `AgentRegistration.new_discovery_handler_sender`, sends the Discovery Handler name of the -/// newly registered Discovery Handler. -pub type DiscoveryHandlerName = String; - -/// A Discovery Handler's endpoint, distinguished by URI type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum DiscoveryHandlerEndpoint { - /// Embedded means the Discovery Handler is running inside the Agent - #[cfg(any(test, feature = "agent-full"))] - Embedded, - /// Uds means the Discovery Handler is running on a specified unix domain socket - Uds(String), - /// Network means the Discovery Handler is running at an specified URL - Network(String), -} - -/// Details about a `DiscoveryHandler` and a sender for terminating its clients when needed. -#[derive(Debug, Clone)] -pub struct DiscoveryDetails { - /// Name of the `DiscoveryHandler` - pub name: String, - /// Endpoint of the `DiscoveryHandler` - pub endpoint: DiscoveryHandlerEndpoint, - /// Whether instances discovered by the `DiscoveryHandler` can be shared/seen by multiple nodes. - pub shared: bool, - /// Channel over which the Registration service tells a DiscoveryOperator client to close a connection with a - /// `DiscoveryHandler`, if any. A broadcast channel is used so both the sending and receiving ends can be cloned. 
- pub close_discovery_handler_connection: broadcast::Sender>, -} - -/// This maps the endpoint string and endpoint type of a `RegisterDiscoveryHandlerRequest` into a -/// `DiscoveryHandlerEndpoint` so as to support embedded `DiscoveryHandlers`. -pub fn create_discovery_handler_endpoint( - endpoint: &str, - endpoint_type: EndpointType, -) -> DiscoveryHandlerEndpoint { - match endpoint_type { - EndpointType::Network => DiscoveryHandlerEndpoint::Network(endpoint.to_string()), - EndpointType::Uds => DiscoveryHandlerEndpoint::Uds(endpoint.to_string()), - } -} - -/// Hosts a register service that external Discovery Handlers can call in order to be added to the -/// RegisteredDiscoveryHandlerMap that is shared with DiscoveryOperators. When a new Discovery Handler is registered, a -/// message is broadcast to inform any running DiscoveryOperators in case they should use the new Discovery Handler. -pub struct AgentRegistration { - new_discovery_handler_sender: broadcast::Sender, - registered_discovery_handlers: RegisteredDiscoveryHandlerMap, -} - -impl AgentRegistration { - pub fn new( - new_discovery_handler_sender: broadcast::Sender, - registered_discovery_handlers: RegisteredDiscoveryHandlerMap, - ) -> Self { - AgentRegistration { - new_discovery_handler_sender, - registered_discovery_handlers, - } - } -} - -#[tonic::async_trait] -impl Registration for AgentRegistration { - /// Adds new `DiscoveryHandler`s to the RegisteredDiscoveryHandlerMap and broadcasts a message to any running - /// DiscoveryOperators that a new `DiscoveryHandler` exists. If the discovery handler is already registered at an - /// endpoint and the register request has changed, the previously registered DH is told to stop discovery and is - /// removed from the map. Then, the updated DH is registered. 
- async fn register_discovery_handler( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let dh_name = req.name.clone(); - let endpoint = req.endpoint.clone(); - let dh_endpoint = create_discovery_handler_endpoint( - &endpoint, - EndpointType::try_from(req.endpoint_type).unwrap(), - ); - info!( - "register_discovery_handler - called with register request {:?}", - req - ); - let (close_discovery_handler_connection, _) = - broadcast::channel(CLOSE_DISCOVERY_HANDLER_CONNECTION_CHANNEL_CAPACITY); - let discovery_handler_details = DiscoveryDetails { - name: dh_name.clone(), - endpoint: dh_endpoint.clone(), - shared: req.shared, - close_discovery_handler_connection, - }; - let mut registered_discovery_handlers = self.registered_discovery_handlers.lock().unwrap(); - // Check if any DiscoveryHandlers have been registered under this name - if let Some(register_request_map) = registered_discovery_handlers.get_mut(&dh_name) { - if let Some(dh_details) = register_request_map.get(&dh_endpoint) { - // Check if DH at that endpoint is already registered but changed request - if dh_details.shared != req.shared || dh_details.endpoint != dh_endpoint { - // Stop all (using None argument) current discovery with this DH if any. - // A receiver may not exist if: - // 1) no configuration has been applied that uses this DH or - // 2) a connection cannot be made with the DH's endpoint - dh_details - .close_discovery_handler_connection - .send(None) - .unwrap_or_default(); - } else { - // Already registered. Return early. 
- return Ok(Response::new(Empty {})); - } - } - // New or updated Discovery Handler - register_request_map.insert(dh_endpoint, discovery_handler_details); - } else { - // First Discovery Handler registered under this name - let mut register_request_map = HashMap::new(); - register_request_map.insert(dh_endpoint, discovery_handler_details); - registered_discovery_handlers.insert(dh_name.clone(), register_request_map); - } - // Notify of new Discovery Handler - if self - .new_discovery_handler_sender - .send(dh_name.clone()) - .is_err() - { - // If no configurations have been applied, no receivers can nor need to be updated about the new discovery - // handler - trace!("register_discovery_handler - new {} discovery handler registered but no active discovery operators to receive the message", dh_name); - } - Ok(Response::new(Empty {})) - } -} - -/// Serves the Agent registration service over UDS. -pub async fn run_registration_server( - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, -) -> Result<(), Box> { - internal_run_registration_server( - discovery_handler_map, - new_discovery_handler_sender, - &akri_discovery_utils::get_registration_socket(), - ) - .await -} - -pub async fn internal_run_registration_server( - discovery_handler_map: RegisteredDiscoveryHandlerMap, - new_discovery_handler_sender: broadcast::Sender, - socket_path: &str, -) -> Result<(), Box> { - info!("internal_run_registration_server - entered"); - let registration = AgentRegistration::new(new_discovery_handler_sender, discovery_handler_map); - trace!( - "internal_run_registration_server - registration server listening on socket {}", - socket_path - ); - // Delete socket in case previously created/used - std::fs::remove_file(socket_path).unwrap_or(()); - let incoming = { - let uds = - tokio::net::UnixListener::bind(socket_path).expect("Failed to bind to socket path"); - - async_stream::stream! 
{ - loop { - let item = uds.accept().map_ok(|(st, _)| unix_stream::UnixStream(st)).await; - yield item; - } - } - }; - Server::builder() - .add_service(RegistrationServer::new(registration)) - .serve_with_incoming(incoming) - .await?; - trace!( - "internal_run_registration_server - gracefully shutdown ... deleting socket {}", - socket_path - ); - std::fs::remove_file(socket_path).unwrap_or(()); - Ok(()) -} - -#[cfg(any(test, feature = "agent-full"))] -pub fn register_embedded_discovery_handlers( - discovery_handler_map: RegisteredDiscoveryHandlerMap, -) -> Result<(), Box> { - info!("register_embedded_discovery_handlers - entered"); - let env_var_query = ActualEnvVarQuery {}; - inner_register_embedded_discovery_handlers(discovery_handler_map, &env_var_query)?; - Ok(()) -} - -/// Adds all embedded Discovery Handlers to the RegisteredDiscoveryHandlerMap, specifying an endpoint of -/// Endpoint::Embedded to signal that it is an embedded Discovery Handler. -#[cfg(any(test, feature = "agent-full"))] -pub fn inner_register_embedded_discovery_handlers( - discovery_handler_map: RegisteredDiscoveryHandlerMap, - query: &impl EnvVarQuery, -) -> Result<(), Box> { - type Details = (String, bool); - let mut embedded_discovery_handlers: Vec
= Vec::new(); - if query.get_env_var(ENABLE_DEBUG_ECHO_LABEL).is_ok() { - let shared: bool = query - .get_env_var(akri_debug_echo::DEBUG_ECHO_INSTANCES_SHARED_LABEL) - .unwrap() - .parse() - .unwrap(); - embedded_discovery_handlers - .push((akri_debug_echo::DISCOVERY_HANDLER_NAME.to_string(), shared)); - } - #[cfg(feature = "onvif-feat")] - embedded_discovery_handlers.push(( - akri_onvif::DISCOVERY_HANDLER_NAME.to_string(), - akri_onvif::SHARED, - )); - #[cfg(feature = "udev-feat")] - embedded_discovery_handlers.push(( - akri_udev::DISCOVERY_HANDLER_NAME.to_string(), - akri_udev::SHARED, - )); - #[cfg(feature = "opcua-feat")] - embedded_discovery_handlers.push(( - akri_opcua::DISCOVERY_HANDLER_NAME.to_string(), - akri_opcua::SHARED, - )); - - embedded_discovery_handlers.into_iter().for_each(|dh| { - let (name, shared) = dh; - let (close_discovery_handler_connection, _) = - broadcast::channel(CLOSE_DISCOVERY_HANDLER_CONNECTION_CHANNEL_CAPACITY); - let discovery_handler_details = DiscoveryDetails { - name: name.clone(), - endpoint: DiscoveryHandlerEndpoint::Embedded, - shared, - close_discovery_handler_connection, - }; - let mut register_request_map = HashMap::new(); - register_request_map.insert( - DiscoveryHandlerEndpoint::Embedded, - discovery_handler_details, - ); - discovery_handler_map - .lock() - .unwrap() - .insert(name, register_request_map); - }); - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use akri_discovery_utils::discovery::v0::registration_client::RegistrationClient; - use akri_shared::os::env_var::MockEnvVarQuery; - use std::convert::TryFrom; - use tempfile::Builder; - use tokio::net::UnixStream; - use tonic::transport::{Endpoint, Uri}; - - #[test] - fn test_register_embedded_discovery_handlers() { - let mut seq = mockall::Sequence::new(); - // Enable debug echo and set environment variable to set whether debug echo instances are shared - let mut mock_env_var = MockEnvVarQuery::new(); - mock_env_var - .expect_get_env_var() - .times(1) - 
.withf(|label: &str| label == ENABLE_DEBUG_ECHO_LABEL) - .in_sequence(&mut seq) - .returning(|_| Ok("1".to_string())); - mock_env_var - .expect_get_env_var() - .times(1) - .withf(|label: &str| label == akri_debug_echo::DEBUG_ECHO_INSTANCES_SHARED_LABEL) - .in_sequence(&mut seq) - .returning(|_| Ok("false".to_string())); - let discovery_handler_map = Arc::new(Mutex::new(HashMap::new())); - inner_register_embedded_discovery_handlers(discovery_handler_map.clone(), &mock_env_var) - .unwrap(); - assert!(discovery_handler_map - .lock() - .unwrap() - .get("debugEcho") - .is_some()); - #[cfg(feature = "onvif-feat")] - assert!(discovery_handler_map.lock().unwrap().get("onvif").is_some()); - #[cfg(feature = "opcua-feat")] - assert!(discovery_handler_map.lock().unwrap().get("opcua").is_some()); - #[cfg(feature = "udev-feat")] - assert!(discovery_handler_map.lock().unwrap().get("udev").is_some()); - } - - #[test] - fn test_register_embedded_discovery_handlers_no_debug_echo() { - let mut mock_env_var = MockEnvVarQuery::new(); - mock_env_var - .expect_get_env_var() - .times(1) - .withf(|label: &str| label == ENABLE_DEBUG_ECHO_LABEL) - .returning(|_| Err(std::env::VarError::NotPresent)); - let discovery_handler_map = Arc::new(Mutex::new(HashMap::new())); - inner_register_embedded_discovery_handlers(discovery_handler_map.clone(), &mock_env_var) - .unwrap(); - assert!(discovery_handler_map - .lock() - .unwrap() - .get("debugEcho") - .is_none()); - } - - #[tokio::test] - async fn test_run_registration_server_reregister_discovery_handler() { - let registration_socket_dir = Builder::new().tempdir().unwrap(); - let registration_socket_path = registration_socket_dir - .path() - .join("agent-registration.sock"); - let registration_socket_path_string_thread = - registration_socket_path.to_str().unwrap().to_string(); - let registration_socket_path_string = - registration_socket_path.to_str().unwrap().to_string(); - let (new_discovery_handler_sender, mut new_discovery_handler_receiver) = - 
broadcast::channel(4); - let discovery_handler_map = Arc::new(Mutex::new(HashMap::new())); - let thread_discovery_handler_map = discovery_handler_map.clone(); - - // Run registration service - tokio::spawn(async move { - internal_run_registration_server( - thread_discovery_handler_map, - new_discovery_handler_sender, - ®istration_socket_path_string_thread, - ) - .await - .unwrap(); - }); - - // Make sure registration service is running - assert!(unix_stream::try_connect(®istration_socket_path_string) - .await - .is_ok()); - // Connect to registration service - let channel = Endpoint::try_from("http://[::1]:50051") - .unwrap() - .connect_with_connector(tower::service_fn(move |_: Uri| { - UnixStream::connect(registration_socket_path_string.clone()) - })) - .await - .unwrap(); - // Create registration client - let mut registration_client = RegistrationClient::new(channel); - - // Test registering a discovery handler with UDS endpoint - let endpoint_string = "/path/to/socket/name.sock".to_string(); - let discovery_handler_endpoint = DiscoveryHandlerEndpoint::Uds(endpoint_string.clone()); - let request = RegisterDiscoveryHandlerRequest { - name: "name".to_string(), - endpoint: endpoint_string.clone(), - endpoint_type: EndpointType::Uds as i32, - shared: true, - }; - assert!(registration_client - .register_discovery_handler(request.clone()) - .await - .is_ok()); - assert_eq!(new_discovery_handler_receiver.recv().await.unwrap(), "name"); - let discovery_handler_details = discovery_handler_map - .lock() - .unwrap() - .get("name") - .unwrap() - .get(&discovery_handler_endpoint) - .unwrap() - .clone(); - assert_eq!( - discovery_handler_details.endpoint, - DiscoveryHandlerEndpoint::Uds(request.endpoint.clone()) - ); - assert_eq!(discovery_handler_details.shared, request.shared); - - // When a discovery handler is re-registered with the same register request, no message should be sent to - // terminate any existing discovery clients. 
- let mut stop_discovery_receiver = discovery_handler_details - .close_discovery_handler_connection - .subscribe(); - assert!(registration_client - .register_discovery_handler(request) - .await - .is_ok()); - assert!(stop_discovery_receiver.try_recv().is_err()); - - // When a discovery handler at a specified endpoint re-registers at the same endpoint but with a different - // locality current discovery handler clients should be notified to terminate and the entry in the - // RegisteredDiscoveryHandlersMap should be replaced. - let local_request = RegisterDiscoveryHandlerRequest { - name: "name".to_string(), - endpoint: endpoint_string, - endpoint_type: EndpointType::Uds as i32, - shared: false, - }; - assert!(registration_client - .register_discovery_handler(local_request.clone()) - .await - .is_ok()); - assert!(stop_discovery_receiver.try_recv().is_ok()); - let discovery_handler_details = discovery_handler_map - .lock() - .unwrap() - .get("name") - .unwrap() - .get(&discovery_handler_endpoint) - .unwrap() - .clone(); - assert_eq!( - discovery_handler_details.endpoint, - DiscoveryHandlerEndpoint::Uds(local_request.endpoint) - ); - assert_eq!(discovery_handler_details.shared, local_request.shared); - } - - #[test] - fn test_create_discovery_handler_endpoint() { - // Assert the endpoint with EndpointType::Uds in converted to DiscoveryHandlerEndpoint::Uds(endpoint) - assert_eq!( - create_discovery_handler_endpoint("/path/to/socket.sock", EndpointType::Uds), - DiscoveryHandlerEndpoint::Uds("/path/to/socket.sock".to_string()) - ); - - // Assert the endpoint with EndpointType::Network in converted to DiscoveryHandlerEndpoint::Network(endpoint) - assert_eq!( - create_discovery_handler_endpoint("http://10.1.2.3:1000", EndpointType::Network), - DiscoveryHandlerEndpoint::Network("http://10.1.2.3:1000".to_string()) - ); - } -} diff --git a/agent/src/util/slot_reconciliation.rs b/agent/src/util/slot_reconciliation.rs deleted file mode 100644 index a7c7c335f..000000000 --- 
a/agent/src/util/slot_reconciliation.rs +++ /dev/null @@ -1,769 +0,0 @@ -use super::{constants::SLOT_RECONCILIATION_CHECK_DELAY_SECS, crictl_containers}; -use akri_shared::akri::instance::device_usage::NodeUsage; -use akri_shared::{akri::instance::InstanceSpec, k8s::KubeInterface}; -use async_trait::async_trait; -use k8s_openapi::api::core::v1::PodStatus; -#[cfg(test)] -use mockall::{automock, predicate::*}; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::{Arc, Mutex}, - time::{Duration, Instant}, -}; -use tokio::process::Command; - -type SlotQueryResult = - Result, Box>; - -#[cfg_attr(test, automock)] -#[async_trait] -pub trait SlotQuery { - async fn get_node_slots(&self) -> SlotQueryResult; -} - -/// Discovers which of an instance's usage slots are actively used by containers on this node -pub struct CriCtlSlotQuery { - pub crictl_path: String, - pub runtime_endpoint: String, - pub image_endpoint: String, -} - -#[async_trait] -impl SlotQuery for CriCtlSlotQuery { - /// Calls crictl to query container runtime in search of active containers and extracts their usage slots. - async fn get_node_slots(&self) -> SlotQueryResult { - match Command::new(&self.crictl_path) - .args([ - "--runtime-endpoint", - &self.runtime_endpoint, - "--image-endpoint", - &self.image_endpoint, - "ps", - "-v", - "--output", - "json", - ]) - .output() - .await - { - Ok(output) => { - if output.status.success() { - trace!("get_node_slots - crictl called successfully"); - let output_string = String::from_utf8_lossy(&output.stdout); - Ok(crictl_containers::get_container_slot_usage(&output_string)) - } else { - let output_string = String::from_utf8_lossy(&output.stderr); - Err(None.ok_or(format!( - "get_node_slots - Failed to call crictl: {:?}", - output_string - ))?) - } - } - Err(e) => { - trace!("get_node_slots - Command failed to call crictl: {:?}", e); - Err(e.into()) - } - } - } -} - -/// Makes sure Instance's `device_usage` accurately reflects actual usage. 
-pub struct DevicePluginSlotReconciler { - pub removal_slot_map: Arc>>, -} - -impl DevicePluginSlotReconciler { - pub async fn reconcile( - &self, - node_name: &str, - slot_grace_period: Duration, - slot_query: &impl SlotQuery, - kube_interface: &impl KubeInterface, - ) { - trace!( - "reconcile - thread iteration start [{:?}]", - self.removal_slot_map - ); - - let node_slot_usage = match slot_query.get_node_slots().await { - Ok(usage) => usage, - Err(e) => { - trace!("reconcile - get_node_slots failed: {:?}", e); - // If an error occurs in the crictl call, return early - // to avoid treating this error like crictl found no - // active containers. Currently, reconcile is a best - // effort approach. - return; - } - }; - trace!( - "reconcile - slots currently in use on this node: {:?}", - node_slot_usage - ); - - // Any slot found in use should be scrubbed from our list - { - let mut removal_slot_map_guard = self.removal_slot_map.lock().unwrap(); - node_slot_usage.iter().for_each(|(slot, _)| { - trace!("reconcile - remove slot from tracked slots: {:?}", slot); - removal_slot_map_guard.remove(slot); - }); - } - trace!( - "reconcile - removal_slot_map after removing node_slot_usage: {:?}", - self.removal_slot_map - ); - - let instances = match kube_interface.get_instances().await { - Ok(instances) => instances, - Err(e) => { - trace!("reconcile - Failed to get instances: {:?}", e); - return; - } - }; - - let pods = match kube_interface - .find_pods_with_field(&format!("{}={}", "spec.nodeName", &node_name,)) - .await - { - Ok(pods) => { - trace!("reconcile - found {} pods on this node", pods.items.len()); - pods - } - Err(e) => { - trace!("reconcile - error finding pending pods: {}", e); - return; - } - }; - - // Check to see if there are any Pods on this Node that have - // Containers that are not ready. 
If there are any, we should - // wait for the Containers to be ready before cleaning any - // Instance device_usage - let any_unready_pods = pods.items.iter().any(|pod| { - pod.status - .as_ref() - .unwrap_or(&PodStatus::default()) - .conditions - .as_ref() - .unwrap_or(&Vec::new()) - .iter() - .any(|condition| { - condition.type_ == "ContainersReady" - && condition.status != "True" - && condition.reason != Some("PodCompleted".to_string()) - }) - }); - if any_unready_pods { - trace!("reconcile - Pods with unready Containers exist on this node, we can't clean the slots yet"); - return; - } - - for instance in instances { - // Check Instance against list of slots that are being used by this node's - // current pods. If we find any missing, we should update the Instance for - // the actual slot usage. - let slots_missing_this_node_name = instance - .spec - .device_usage - .iter() - .filter_map(|(k, v)| { - let same_node_name = match NodeUsage::from_str(v) { - Ok(node_usage) => node_usage.is_same_node(node_name), - Err(_) => false, - }; - if !same_node_name { - // We need to add node_name to this slot IF - // the slot is not labeled with node_name AND - // there is a container using that slot on this node - node_slot_usage - .get_key_value(k) - .map(|(slot, node_usage)| (slot.to_string(), node_usage.clone())) - } else { - None - } - }) - .collect::>(); - - // Check Instance to find slots that are registered to this node, but - // there is no actual pod using the slot. We should update the Instance - // to clear the false usage. - // - // For slots that need to be cleaned, we should wait for a "grace - // period" prior to updating the Instance. 
- let slots_to_clean = instance - .spec - .device_usage - .iter() - .filter_map(|(k, v)| { - let same_node_name = match NodeUsage::from_str(v) { - Ok(usage) => usage.is_same_node(node_name), - Err(_) => false, - }; - if same_node_name && !node_slot_usage.contains_key(k) { - // We need to clean this slot IF - // this slot is handled by this node AND - // there are no containers using that slot on this node - Some(k.to_string()) - } else { - None - } - }) - .filter(|slot_string| { - let mut local_slot_map = self.removal_slot_map.lock().unwrap(); - if let Some(time) = local_slot_map.get(slot_string) { - let now = Instant::now(); - match now.checked_duration_since(*time) { - Some(duration) => { - if duration > slot_grace_period { - trace!("reconcile - slot expired: [{:?}]", duration); - true // slot has been unoccupied beyond the grace period - } else { - false // still in grace period - } - } - None => { - false // still in grace period - } - } - } else { - trace!("reconcile - slot added to list: [Now]"); - local_slot_map.insert(slot_string.to_string(), Instant::now()); - false // do not remove this node just yet - } - }) - .collect::>(); - trace!( - "reconcile - these slots have no pods according to crictl AND have expired: {:?}", - &slots_to_clean - ); - - if !slots_to_clean.is_empty() || !slots_missing_this_node_name.is_empty() { - trace!( - "reconcile - update Instance slots_to_clean: {:?} slots_missing_this_node_name: {:?}", - slots_to_clean, - slots_missing_this_node_name - ); - let modified_device_usage = instance - .spec - .device_usage - .iter() - .map(|(slot, usage)| { - ( - slot.to_string(), - if slots_missing_this_node_name.contains_key(slot) { - // Restore usage because there have been - // cases where a Pod is running (which corresponds - // to an Allocate call, but the Instance slot is empty. 
- slots_missing_this_node_name.get(slot).unwrap().to_string() - } else if slots_to_clean.contains(slot) { - // Set usage to free because there is no - // Deallocate message from kubelet for us to know - // when a slot is no longer in use - NodeUsage::default().to_string() - } else { - // This slot remains unchanged. - usage.into() - }, - ) - }) - .collect::>(); - let modified_instance = InstanceSpec { - configuration_name: instance.spec.configuration_name.clone(), - broker_properties: instance.spec.broker_properties.clone(), - shared: instance.spec.shared, - device_usage: modified_device_usage, - nodes: instance.spec.nodes.clone(), - }; - trace!("reconcile - update Instance from: {:?}", &instance.spec); - trace!("reconcile - update Instance to: {:?}", &modified_instance); - match kube_interface - .update_instance( - &modified_instance, - &instance.metadata.name.unwrap(), - &instance.metadata.namespace.unwrap(), - ) - .await - { - Ok(()) => { - slots_to_clean.iter().for_each(|slot| { - trace!("reconcile - remove {} from removal_slot_map", slot); - self.removal_slot_map.lock().unwrap().remove(slot); - }); - } - Err(e) => { - // If update fails, let the next iteration update the Instance. We - // may want to revisit this decision and add some retry logic - // here. - trace!("reconcile - update Instance failed: {:?}", e); - } - } - } - } - - trace!("reconcile - thread iteration end"); - } -} - -/// This periodically checks to make sure that all Instances' device_usage -/// accurately reflects the actual usage. -/// -/// The Kubernetes Device-Plugin implementation has no notifications for -/// when a Pod disappears (which should, in turn, free up a slot). Because -/// of this, if a Pod disappears, there will be a slot that Akri (and the -/// Kubernetes scheduler) falsely thinks is in use. -/// -/// To work around this, we have done 2 things: -/// 1. Each of Agent's device plugins add slot information to the Annotations -/// section of the Allocate response. -/// 2. 
periodic_slot_reconciliation will periodically call crictl to query the -/// container runtime in search of active Containers that have our slot -/// Annotations. This function will make sure that our Instance device_usage -/// accurately reflects the actual usage. -/// -/// It has rarely been seen, perhaps due to connectivity issues, that active -/// Containers with our Annotation are no longer in our Instance. This is a bug that -/// we are aware of, but haven't found yet. To address this, until a fix is found, -/// we will also make sure that any Container that exists with our Annotation will -/// be shown in our Instance device_usage. -pub async fn periodic_slot_reconciliation( - slot_grace_period: std::time::Duration, -) -> Result<(), Box> { - trace!("periodic_slot_reconciliation - start"); - let kube_interface = akri_shared::k8s::KubeImpl::new().await?; - let node_name = std::env::var("AGENT_NODE_NAME").unwrap(); - let crictl_path = std::env::var("HOST_CRICTL_PATH").unwrap(); - let runtime_endpoint = std::env::var("HOST_RUNTIME_ENDPOINT").unwrap(); - let image_endpoint = std::env::var("HOST_IMAGE_ENDPOINT").unwrap(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(std::sync::Mutex::new(HashMap::new())), - }; - let slot_query = CriCtlSlotQuery { - crictl_path, - runtime_endpoint, - image_endpoint, - }; - - loop { - trace!("periodic_slot_reconciliation - iteration pre sleep"); - tokio::time::sleep(std::time::Duration::from_secs( - SLOT_RECONCILIATION_CHECK_DELAY_SECS, - )) - .await; - - trace!("periodic_slot_reconciliation - iteration call reconiler.reconcile"); - reconciler - .reconcile(&node_name, slot_grace_period, &slot_query, &kube_interface) - .await; - - trace!("periodic_slot_reconciliation - iteration end"); - } -} - -#[cfg(test)] -mod reconcile_tests { - use super::*; - use akri_shared::akri::instance::device_usage::DeviceUsageKind; - use akri_shared::{akri::instance::InstanceList, k8s::MockKubeInterface, os::file}; - use 
k8s_openapi::api::core::v1::Pod; - use kube::api::ObjectList; - - fn configure_get_node_slots( - mock: &mut MockSlotQuery, - result: HashMap, - error: bool, - ) { - mock.expect_get_node_slots().times(1).returning(move || { - if !error { - Ok(result.clone()) - } else { - Err(None.ok_or("failure")?) - } - }); - } - - fn configure_get_instances(mock: &mut MockKubeInterface, result_file: &'static str) { - mock.expect_get_instances().times(1).returning(move || { - let instance_list_json = file::read_file_to_string(result_file); - let instance_list: InstanceList = serde_json::from_str(&instance_list_json).unwrap(); - Ok(instance_list) - }); - } - - fn configure_find_pods_with_field( - mock: &mut MockKubeInterface, - selector: &'static str, - result_file: &'static str, - ) { - mock.expect_find_pods_with_field() - .times(1) - .withf(move |s| s == selector) - .returning(move |_| { - let pods_json = file::read_file_to_string(result_file); - let pods: ObjectList = serde_json::from_str(&pods_json).unwrap(); - Ok(pods) - }); - } - - struct NodeSlots { - node_slots: HashMap, - node_slots_error: bool, - } - - struct UpdateInstance { - expected_slot_1_node: &'static str, - expected_slot_5_node: &'static str, - } - - async fn configure_scnenario( - node_slots: NodeSlots, - instances_result_file: &'static str, - update_instance: Option, - grace_period: Duration, - reconciler: &DevicePluginSlotReconciler, - ) { - let mut slot_query = MockSlotQuery::new(); - // slot_query to identify one slot used by this node - configure_get_node_slots( - &mut slot_query, - node_slots.node_slots, - node_slots.node_slots_error, - ); - - let mut kube_interface = MockKubeInterface::new(); - if !node_slots.node_slots_error { - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - configure_get_instances(&mut kube_interface, instances_result_file); - // kube_interface to find no pods with unready containers - configure_find_pods_with_field( - &mut 
kube_interface, - "spec.nodeName=node-a", - "../test/json/running-pod-list-for-config-a-shared.json", - ); - if let Some(update_instance_) = update_instance { - trace!( - "expect_update_instance - slot1: {}, slot5: {}", - update_instance_.expected_slot_1_node, - update_instance_.expected_slot_5_node - ); - // kube_interface to update Instance - kube_interface - .expect_update_instance() - .times(1) - .withf(move |instance, name, namespace| { - name == "config-a-359973" - && namespace == "config-a-namespace" - && instance.nodes.len() == 3 - && instance.nodes.contains(&"node-a".to_string()) - && instance.nodes.contains(&"node-b".to_string()) - && instance.nodes.contains(&"node-c".to_string()) - && instance.device_usage["config-a-359973-0"] == "node-b" - && instance.device_usage["config-a-359973-1"] - == update_instance_.expected_slot_1_node - && instance.device_usage["config-a-359973-2"] == "node-b" - && instance.device_usage["config-a-359973-3"] == "node-a" - && instance.device_usage["config-a-359973-4"] == "node-c" - && instance.device_usage["config-a-359973-5"] - == update_instance_.expected_slot_5_node - }) - .returning(move |_, _, _| Ok(())); - } - } - - reconciler - .reconcile("node-a", grace_period, &slot_query, &kube_interface) - .await; - } - - #[tokio::test] - async fn test_reconcile_no_slots_to_reconcile() { - let _ = env_logger::builder().is_test(true).try_init(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(Mutex::new(HashMap::new())), - }; - configure_scnenario( - NodeSlots { - node_slots: HashMap::new(), - node_slots_error: false, - }, - "../test/json/shared-instance-list.json", - None, - Duration::from_secs(10), - &reconciler, - ) - .await; - } - - #[tokio::test] - async fn test_reconcile_get_slots_error() { - let _ = env_logger::builder().is_test(true).try_init(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(Mutex::new(HashMap::new())), - }; - configure_scnenario( - NodeSlots { - 
node_slots: HashMap::new(), - node_slots_error: true, - }, - "", - None, - Duration::from_secs(10), - &reconciler, - ) - .await; - } - - #[tokio::test] - async fn test_reconcile_slots_to_add() { - let _ = env_logger::builder().is_test(true).try_init(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(Mutex::new(HashMap::new())), - }; - - let grace_period = Duration::from_millis(100); - let mut node_slots = HashMap::new(); - node_slots.insert( - "config-a-359973-3".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - node_slots.insert( - "config-a-359973-5".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - configure_scnenario( - // slot_query to identify one slot used by this node - NodeSlots { - node_slots, - node_slots_error: false, - }, - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - Some(UpdateInstance { - expected_slot_1_node: "node-a", - expected_slot_5_node: "node-a", - }), - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().len() == 1); - assert!(reconciler - .removal_slot_map - .lock() - .unwrap() - .contains_key("config-a-359973-1")); - } - - #[tokio::test] - async fn test_reconcile_slots_to_delete() { - let _ = env_logger::builder().is_test(true).try_init(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(Mutex::new(HashMap::new())), - }; - - let grace_period = Duration::from_millis(100); - let mut node_slots = HashMap::new(); - node_slots.insert( - "config-a-359973-3".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - configure_scnenario( - // slot_query to identify one slot used by this node - NodeSlots { - node_slots: node_slots.clone(), - 
node_slots_error: false, - }, - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - None, - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().len() == 1); - assert!(reconciler - .removal_slot_map - .lock() - .unwrap() - .contains_key("config-a-359973-1")); - - // Wait for more than the grace period ... it short, so, just wait twice :) - std::thread::sleep(grace_period); - std::thread::sleep(grace_period); - - configure_scnenario( - // slot_query to identify one slot used by this node - NodeSlots { - node_slots: node_slots.clone(), - node_slots_error: false, - }, - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - Some(UpdateInstance { - expected_slot_1_node: "", - expected_slot_5_node: "", - }), - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().is_empty()); - } - - #[tokio::test] - async fn test_reconcile_slots_to_delete_and_add() { - let _ = env_logger::builder().is_test(true).try_init(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(Mutex::new(HashMap::new())), - }; - - let grace_period = Duration::from_millis(100); - let mut node_slots = HashMap::new(); - node_slots.insert( - "config-a-359973-3".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - configure_scnenario( - // slot_query to identify one slot used by this node - NodeSlots { - node_slots, - node_slots_error: false, - }, - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - 
None, - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().len() == 1); - assert!(reconciler - .removal_slot_map - .lock() - .unwrap() - .contains_key("config-a-359973-1")); - - // Wait for more than the grace period ... it short, so, just wait twice :) - std::thread::sleep(grace_period); - std::thread::sleep(grace_period); - - let mut node_slots_added = HashMap::new(); - node_slots_added.insert( - "config-a-359973-3".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - node_slots_added.insert( - "config-a-359973-5".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - configure_scnenario( - // slot_query to identify one slot used by this node - NodeSlots { - node_slots: node_slots_added, - node_slots_error: false, - }, - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - Some(UpdateInstance { - expected_slot_1_node: "", - expected_slot_5_node: "node-a", - }), - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().is_empty()); - } - - #[tokio::test] - async fn test_reconcile_slots_to_delete_only_temporarily() { - let _ = env_logger::builder().is_test(true).try_init(); - - let reconciler = DevicePluginSlotReconciler { - removal_slot_map: Arc::new(Mutex::new(HashMap::new())), - }; - - let grace_period = Duration::from_millis(100); - let mut node_slots = HashMap::new(); - node_slots.insert( - "config-a-359973-3".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - configure_scnenario( - // slot_query to identify one slot used by this node - NodeSlots { - node_slots, - node_slots_error: false, - }, - // 
kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - None, - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().len() == 1); - assert!(reconciler - .removal_slot_map - .lock() - .unwrap() - .contains_key("config-a-359973-1")); - - // Wait for more than the grace period ... it short, so, just wait twice :) - std::thread::sleep(grace_period); - std::thread::sleep(grace_period); - - let mut node_slots_added = HashMap::new(); - node_slots_added.insert( - "config-a-359973-1".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - node_slots_added.insert( - "config-a-359973-3".to_string(), - NodeUsage::create(&DeviceUsageKind::Instance, "node-a").unwrap(), - ); - configure_scnenario( - // slot_query to identify two slots used by this node - NodeSlots { - node_slots: node_slots_added, - node_slots_error: false, - }, - // kube_interface to find Instance with node-a using slots: - // config-a-359973-1 & config-a-359973-3 - "../test/json/shared-instance-list-slots.json", - None, - grace_period, - &reconciler, - ) - .await; - - // Validate that the slot has been added to the list of "to be removed slots" - assert!(reconciler.removal_slot_map.lock().unwrap().is_empty()); - } -} diff --git a/agent/src/util/stopper.rs b/agent/src/util/stopper.rs new file mode 100644 index 000000000..0649cb6a5 --- /dev/null +++ b/agent/src/util/stopper.rs @@ -0,0 +1,99 @@ +use std::sync::Arc; + +use futures::stream::{AbortHandle, Abortable}; +use tokio::{signal::unix::SignalKind, sync::watch}; + +#[derive(Clone)] +pub struct Stopper { + state: Arc>, +} + +impl Stopper { + pub fn new() -> Self { + let (state, _) = watch::channel(false); + let s = Self { + state: Arc::new(state), + }; + let local_s = s.clone(); + tokio::spawn(async move { + let 
mut signal = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + tokio::select! { + _ = local_s.stopped() => {}, + _ = signal.recv() => local_s.stop() + } + }); + s + } + + pub fn stop(&self) { + self.state.send_replace(true); + } + + pub fn is_stopped(&self) -> bool { + *self.state.borrow() + } + + pub async fn stopped(&self) { + let mut r = self.state.subscribe(); + if !*r.borrow_and_update() { + let _ = r.changed().await; + } + } + + pub fn make_abortable(&self, inner: T) -> Abortable { + let (handle, reg) = AbortHandle::new_pair(); + let local_self = self.clone(); + tokio::spawn(async move { + local_self.stopped().await; + handle.abort(); + }); + Abortable::new(inner, reg) + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[tokio::test] + async fn test_stopper() { + let stopper = Stopper::new(); + assert!(!stopper.is_stopped()); + assert!( + tokio::time::timeout(Duration::from_secs(2), stopper.stopped()) + .await + .is_err() + ); + let local_stopper = stopper.clone(); + tokio::spawn(async move { + tokio::time::sleep(Duration::from_secs(1)).await; + local_stopper.stop() + }); + assert!( + tokio::time::timeout(Duration::from_secs(2), stopper.stopped()) + .await + .is_ok() + ); + assert!(stopper.is_stopped()); + } + + #[tokio::test] + async fn test_make_abortable() { + let stopper = Stopper::new(); + let abortable = stopper.make_abortable(async { + tokio::time::sleep(Duration::from_millis(50)).await; + true + }); + assert!(!abortable.is_aborted()); + assert_eq!(abortable.await, Ok(true)); + + let abortable = stopper.make_abortable(async { + tokio::time::sleep(Duration::from_millis(50)).await; + true + }); + stopper.stop(); + assert!(abortable.await.is_err()); + } +} diff --git a/agent/src/util/streaming_extension.rs b/agent/src/util/streaming_extension.rs deleted file mode 100644 index 8bc15fce5..000000000 --- a/agent/src/util/streaming_extension.rs +++ /dev/null @@ -1,31 +0,0 @@ -use 
akri_discovery_utils::discovery::v0::DiscoverResponse; -use async_trait::async_trait; -use tokio::sync::mpsc; -use tonic::{Code, Status}; - -/// An extension trait that is used to get the latest message from both embedded and -/// external Discovery Handlers' streams. -#[async_trait] -pub trait StreamingExt: Send { - async fn get_message(&mut self) -> Result, Status>; -} - -#[async_trait] -impl StreamingExt for mpsc::Receiver> { - async fn get_message(&mut self) -> Result, Status> { - match self.recv().await { - Some(result) => match result { - Ok(res) => Ok(Some(res)), - Err(e) => Err(e), - }, - None => Err(Status::new(Code::Unavailable, "broken pipe")), - } - } -} - -#[async_trait] -impl StreamingExt for tonic::codec::Streaming { - async fn get_message(&mut self) -> Result, Status> { - self.message().await - } -} diff --git a/build/containers/Dockerfile.rust b/build/containers/Dockerfile.rust index fa70b5879..440d98943 100644 --- a/build/containers/Dockerfile.rust +++ b/build/containers/Dockerfile.rust @@ -1,6 +1,6 @@ FROM --platform=$BUILDPLATFORM tonistiigi/xx:master AS xx -FROM --platform=$BUILDPLATFORM rust:1.73-slim-bookworm AS build +FROM --platform=$BUILDPLATFORM rust:1.74-slim-bookworm AS build RUN rustup component add rustfmt RUN apt-get update && apt-get install -y clang lld protobuf-compiler pkg-config mmdebstrap wget COPY --from=xx / / diff --git a/controller/Cargo.toml b/controller/Cargo.toml index b52716eec..ed3201797 100644 --- a/controller/Cargo.toml +++ b/controller/Cargo.toml @@ -15,9 +15,9 @@ async-std = "1.5.0" chrono = "0.4.10" env_logger = "0.10.0" futures = "0.3.1" -k8s-openapi = { version = "0.17.0", default-features = false, features = ["schemars", "v1_23"] } -kube = { version = "0.80.0", features = ["derive"] } -kube-runtime = "0.80.0" +k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] } +kube = { version = "0.87.1", features = ["derive"] } +kube-runtime = "0.87.1" lazy_static = "1.4" log = 
"0.4" prometheus = { version = "0.12.0", features = ["process"] } diff --git a/controller/src/util/instance_action.rs b/controller/src/util/instance_action.rs index 9a13a8be2..412769c9a 100644 --- a/controller/src/util/instance_action.rs +++ b/controller/src/util/instance_action.rs @@ -12,8 +12,8 @@ use async_std::sync::Mutex; use futures::{StreamExt, TryStreamExt}; use k8s_openapi::api::batch::v1::JobSpec; use k8s_openapi::api::core::v1::{Pod, PodSpec}; -use kube::api::{Api, ListParams}; -use kube_runtime::watcher::{default_backoff, watcher, Event}; +use kube::api::Api; +use kube_runtime::watcher::{watcher, Config, Event}; use kube_runtime::WatchStreamExt; use log::{error, info, trace}; use std::collections::HashMap; @@ -93,7 +93,7 @@ async fn internal_do_instance_watch( ) -> Result<(), Box> { trace!("internal_do_instance_watch - enter"); let resource = Api::::all(kube_interface.get_kube_client()); - let watcher = watcher(resource, ListParams::default()).backoff(default_backoff()); + let watcher = watcher(resource, Config::default()).default_backoff(); let mut informer = watcher.boxed(); let mut first_event = true; // Currently, this does not handle None except to break the loop. 
diff --git a/controller/src/util/node_watcher.rs b/controller/src/util/node_watcher.rs index 0b669d089..93f53baa5 100644 --- a/controller/src/util/node_watcher.rs +++ b/controller/src/util/node_watcher.rs @@ -9,8 +9,8 @@ use akri_shared::{ }; use futures::{StreamExt, TryStreamExt}; use k8s_openapi::api::core::v1::{Node, NodeStatus}; -use kube::api::{Api, ListParams}; -use kube_runtime::watcher::{default_backoff, watcher, Event}; +use kube::api::Api; +use kube_runtime::watcher::{watcher, Config, Event}; use kube_runtime::WatchStreamExt; use log::{error, info, trace}; use std::collections::HashMap; @@ -58,7 +58,7 @@ impl NodeWatcher { trace!("watch - enter"); let kube_interface = k8s::KubeImpl::new().await?; let resource = Api::::all(kube_interface.get_kube_client()); - let watcher = watcher(resource, ListParams::default()).backoff(default_backoff()); + let watcher = watcher(resource, Config::default()).default_backoff(); let mut informer = watcher.boxed(); let mut first_event = true; @@ -316,6 +316,8 @@ impl NodeWatcher { // Save the instance let modified_instance = InstanceSpec { + cdi_name: instance.spec.cdi_name.clone(), + capacity: instance.spec.capacity, configuration_name: instance.spec.configuration_name.clone(), broker_properties: instance.spec.broker_properties.clone(), shared: instance.spec.shared, diff --git a/controller/src/util/pod_watcher.rs b/controller/src/util/pod_watcher.rs index 829257733..b71b3fc4e 100644 --- a/controller/src/util/pod_watcher.rs +++ b/controller/src/util/pod_watcher.rs @@ -12,8 +12,8 @@ use akri_shared::{ use async_std::sync::Mutex; use futures::{StreamExt, TryStreamExt}; use k8s_openapi::api::core::v1::{Pod, ServiceSpec}; -use kube::api::{Api, ListParams}; -use kube_runtime::watcher::{default_backoff, watcher, Event}; +use kube::api::Api; +use kube_runtime::watcher::{watcher, Config, Event}; use kube_runtime::WatchStreamExt; use log::{error, info, trace}; use std::{collections::HashMap, sync::Arc}; @@ -129,9 +129,9 @@ impl 
BrokerPodWatcher { let resource = Api::::all(kube_interface.get_kube_client()); let watcher = watcher( resource, - ListParams::default().labels(AKRI_CONFIGURATION_LABEL_NAME), + Config::default().labels(AKRI_CONFIGURATION_LABEL_NAME), ) - .backoff(default_backoff()); + .default_backoff(); let mut informer = watcher.boxed(); let synchronization = Arc::new(Mutex::new(())); let mut first_event = true; diff --git a/deployment/helm/crds/akri-instance-crd.yaml b/deployment/helm/crds/akri-instance-crd.yaml index 61873e7d8..ccac64c0f 100644 --- a/deployment/helm/crds/akri-instance-crd.yaml +++ b/deployment/helm/crds/akri-instance-crd.yaml @@ -1,56 +1,88 @@ +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: instances.akri.sh spec: group: akri.sh + names: + categories: [] + kind: Instance + plural: instances + shortNames: + - akrii + singular: instance + scope: Namespaced versions: - - name: v0 - served: true - storage: true + - additionalPrinterColumns: + - description: The Configuration this Instance belongs to + jsonPath: ".spec.configurationName" + name: Config + type: string + - description: Describes whether this Instance is shared + jsonPath: ".spec.shared" + name: Shared + type: boolean + - description: Nodes that expose this Instance + jsonPath: ".spec.nodes" + name: Nodes + type: string + - jsonPath: ".metadata.creationTimestamp" + name: Age + type: date + name: v0 schema: openAPIV3Schema: - type: object + description: "Auto-generated derived type for InstanceSpec via `CustomResource`" properties: spec: - type: object + description: "Defines the information in the Instance CRD\n\nAn Instance is a specific instance described by a Configuration. For example, a Configuration may describe many cameras, each camera will be represented by a Instance." 
properties: + brokerProperties: + additionalProperties: + type: string + default: {} + description: "This defines some properties that will be set as environment variables in broker Pods that request the resource this Instance represents. It contains the `Configuration.broker_properties` from this Instance's Configuration and the `Device.properties` set by the Discovery Handler that discovered the resource this Instance represents." + type: object + capacity: + description: This contains the number of slots for the Instance + format: uint + minimum: 0.0 + type: integer + cdiName: + description: This contains the CDI fully qualified name of the device linked to the Instance + type: string configurationName: + description: This contains the name of the corresponding Configuration type: string - brokerProperties: + deviceUsage: additionalProperties: type: string + default: {} + description: This contains a map of capability slots to node names. The number of slots corresponds to the associated Configuration.capacity field. 
Each slot will either map to an empty string (if the slot has not been claimed) or to a node name (corresponding to the node that has claimed the slot) type: object - shared: - type: boolean + x-kubernetes-map-type: granular nodes: - type: array + default: [] + description: This contains a list of the nodes that can access this capability instance items: type: string - deviceUsage: # map - additionalProperties: - type: string - type: object - additionalPrinterColumns: - - name: Config - type: string - description: The Configuration this Instance belongs to - jsonPath: .spec.configurationName - - name: Shared - type: boolean - description: Describes whether this Instance is shared - jsonPath: .spec.shared - - name: Nodes - type: string - description: Nodes that expose this Instance - jsonPath: .spec.nodes - - name: Age - type: date - jsonPath: .metadata.creationTimestamp - scope: Namespaced - names: - plural: instances - singular: instance - kind: Instance - shortNames: - - akrii + type: array + x-kubernetes-list-type: set + shared: + default: false + description: This defines whether the capability is to be shared by multiple nodes + type: boolean + required: + - capacity + - cdiName + - configurationName + type: object + required: + - spec + title: Instance + type: object + served: true + storage: true + subresources: {} + diff --git a/deployment/helm/templates/agent.yaml b/deployment/helm/templates/agent.yaml index 0fa2a601a..113abf1a3 100644 --- a/deployment/helm/templates/agent.yaml +++ b/deployment/helm/templates/agent.yaml @@ -84,6 +84,8 @@ spec: mountPath: /var/lib/akri - name: device-plugin mountPath: /var/lib/kubelet/device-plugins + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources - name: var-run-dockershim mountPath: /host/run/containerd/containerd.sock {{- if .Values.agent.host.udev }} @@ -106,6 +108,9 @@ spec: - name: device-plugin hostPath: path: "{{ .Values.agent.host.kubeletDevicePlugins }}" + - name: pod-resources + hostPath: + 
path: "{{ .Values.agent.host.kubeletPodResources }}" - name: var-run-dockershim hostPath: {{- if ne "" .Values.agent.host.containerRuntimeSocket }} diff --git a/deployment/helm/templates/rbac.yaml b/deployment/helm/templates/rbac.yaml index 0acac0008..80ac088a2 100644 --- a/deployment/helm/templates/rbac.yaml +++ b/deployment/helm/templates/rbac.yaml @@ -55,7 +55,7 @@ rules: verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: [{{ .Values.crds.group | quote }}] resources: ["configurations"] - verbs: ["get", "list", "watch"] + verbs: ["get", "list", "watch", "patch"] --- apiVersion: 'rbac.authorization.k8s.io/v1' kind: 'ClusterRoleBinding' diff --git a/deployment/helm/values.yaml b/deployment/helm/values.yaml index ca462c08d..d4dc45576 100644 --- a/deployment/helm/values.yaml +++ b/deployment/helm/values.yaml @@ -107,6 +107,8 @@ agent: discoveryHandlers: /var/lib/akri # kubeletDevicePlugins is the location of the kubelet device-plugin sockets kubeletDevicePlugins: /var/lib/kubelet/device-plugins + # kubeletPodResources is the location of the kubelet pod-resources socket + kubeletPodResources: /var/lib/kubelet/pod-resources # containerRuntimeSocket is the default node path of the container runtime socket. 
# For MicroK8s, set to "/var/snap/microk8s/common/run/containerd.sock" # For K3s, set to "/run/k3s/containerd/containerd.sock" diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 14335e44b..08650dd12 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -12,8 +12,8 @@ rust-version = "1.73.0" anyhow = "1.0.38" async-trait = "0.1.0" either = '*' -k8s-openapi = { version = "0.17.0", default-features = false, features = ["schemars", "v1_23"] } -kube = { version = "0.80.0", features = ["derive"] } +k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] } +kube = { version = "0.87.1", features = ["derive"] } log = "0.4" mockall = "0.12" prometheus = { version = "0.12.0", features = ["process"] } @@ -31,3 +31,6 @@ warp = "0.3.6" [dev-dependencies] env_logger = "0.10.0" +[[bin]] +name="gen_crds" +path="src/gen_crds.rs" \ No newline at end of file diff --git a/shared/src/akri/configuration.rs b/shared/src/akri/configuration.rs index b2ebc7907..3406361d8 100644 --- a/shared/src/akri/configuration.rs +++ b/shared/src/akri/configuration.rs @@ -108,12 +108,13 @@ pub enum BrokerSpec { pub struct ConfigurationSpec { /// This defines the `DiscoveryHandler` that should be used to /// discover the capability and any information needed by the `DiscoveryHandler`. 
+ #[schemars(schema_with = "immutable_dh_info")] pub discovery_handler: DiscoveryHandlerInfo, /// This defines the number of nodes that can schedule workloads for /// any given capability that is found #[serde(default = "default_capacity")] - pub capacity: i32, + pub capacity: usize, /// This defines a workload that should be scheduled to any /// node that can access any capability described by this @@ -145,6 +146,17 @@ pub struct ConfigurationSpec { pub broker_properties: HashMap, } +fn immutable_dh_info(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema { + let mut schema: schemars::schema::SchemaObject = + ::json_schema(gen).into(); + schema.extensions.insert( + "x-kubernetes-validations".to_owned(), + serde_json::from_str(r#"[{"message": "Value is immutable", "rule": "self == oldSelf"}]"#) + .unwrap(), + ); + schema.into() +} + /// Get Configurations for a given namespace /// /// Example: @@ -230,7 +242,7 @@ pub async fn find_configuration( }, } } -fn default_capacity() -> i32 { +fn default_capacity() -> usize { 1 } diff --git a/shared/src/akri/instance.rs b/shared/src/akri/instance.rs index f50bd6aeb..22a1eb485 100644 --- a/shared/src/akri/instance.rs +++ b/shared/src/akri/instance.rs @@ -16,14 +16,50 @@ pub type InstanceList = ObjectList; /// a Configuration. For example, a Configuration /// may describe many cameras, each camera will be represented by a /// Instance. 
-#[derive(CustomResource, Deserialize, Serialize, Clone, Debug, JsonSchema)] +#[derive(CustomResource, Deserialize, Serialize, Clone, Debug, JsonSchema, PartialEq)] #[serde(rename_all = "camelCase")] // group = API_NAMESPACE and version = API_VERSION -#[kube(group = "akri.sh", version = "v0", kind = "Instance", namespaced)] +#[kube( + group = "akri.sh", + version = "v0", + kind = "Instance", + namespaced, + shortname = "akrii", + printcolumn = r#"{ + "name": "Config", + "type": "string", + "jsonPath": ".spec.configurationName", + "description": "The Configuration this Instance belongs to" + }"#, + printcolumn = r#"{ + "name": "Shared", + "type": "boolean", + "jsonPath": ".spec.shared", + "description": "Describes whether this Instance is shared" + }"#, + printcolumn = r#"{ + "name": "Nodes", + "type": "string", + "jsonPath": ".spec.nodes", + "description": "Nodes that expose this Instance" + }"#, + printcolumn = r#"{ + "name": "Age", + "type": "date", + "jsonPath": ".metadata.creationTimestamp" + }"#, + derive = "PartialEq" +)] pub struct InstanceSpec { /// This contains the name of the corresponding Configuration pub configuration_name: String, + /// This contains the CDI fully qualified name of the device linked to the Instance + pub cdi_name: String, + + /// This contains the number of slots for the Instance + pub capacity: usize, + /// This defines some properties that will be set as /// environment variables in broker Pods that request /// the resource this Instance represents. @@ -40,6 +76,7 @@ pub struct InstanceSpec { /// This contains a list of the nodes that can access this capability instance #[serde(default)] + #[schemars(schema_with = "ssa_nodes_set")] pub nodes: Vec, /// This contains a map of capability slots to node names. 
The number of @@ -48,9 +85,29 @@ pub struct InstanceSpec { /// been claimed) or to a node name (corresponding to the node that has claimed /// the slot) #[serde(default)] + #[schemars(schema_with = "ssa_usage_granular")] pub device_usage: HashMap, } +fn ssa_nodes_set(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema { + let mut schema: schemars::schema::SchemaObject = >::json_schema(gen).into(); + schema.extensions.insert( + "x-kubernetes-list-type".to_owned(), + serde_json::Value::String("set".to_owned()), + ); + schema.into() +} + +fn ssa_usage_granular(gen: &mut schemars::gen::SchemaGenerator) -> schemars::schema::Schema { + let mut schema: schemars::schema::SchemaObject = + >::json_schema(gen).into(); + schema.extensions.insert( + "x-kubernetes-map-type".to_owned(), + serde_json::Value::String("granular".to_owned()), + ); + schema.into() +} + /// Get Instances for a given namespace /// /// Example: @@ -154,6 +211,8 @@ pub async fn find_instance( /// let instance = instance::create_instance( /// &InstanceSpec { /// configuration_name: "capability_configuration_name".to_string(), +/// cdi_name: "akri.sh/config-1=instance-1".to_string(), +/// capacity: 1, /// shared: true, /// nodes: Vec::new(), /// device_usage: std::collections::HashMap::new(), @@ -274,6 +333,8 @@ pub async fn delete_instance( /// let instance = instance::update_instance( /// &InstanceSpec { /// configuration_name: "capability_configuration_name".to_string(), +/// cdi_name: "akri.sh/config-1=instance-1".to_string(), +/// capacity: 1, /// shared: true, /// nodes: Vec::new(), /// device_usage: std::collections::HashMap::new(), @@ -443,7 +504,7 @@ mod crd_serializeation_tests { fn test_instance_defaults_with_json_serialization() { let _ = env_logger::builder().is_test(true).try_init(); - let json = r#"{"configurationName": "foo"}"#; + let json = r#"{"configurationName": "foo", "cdiName": "akri.sh/foo=bar", "capacity": 1}"#; let deserialized: InstanceSpec = 
serde_json::from_str(json).unwrap(); assert_eq!("foo".to_string(), deserialized.configuration_name); assert_eq!(0, deserialized.broker_properties.len()); @@ -452,7 +513,7 @@ mod crd_serializeation_tests { assert_eq!(0, deserialized.device_usage.len()); let serialized = serde_json::to_string(&deserialized).unwrap(); - let expected_deserialized = r#"{"configurationName":"foo","brokerProperties":{},"shared":false,"nodes":[],"deviceUsage":{}}"#; + let expected_deserialized = r#"{"configurationName":"foo","cdiName":"akri.sh/foo=bar","capacity":1,"brokerProperties":{},"shared":false,"nodes":[],"deviceUsage":{}}"#; assert_eq!(expected_deserialized, serialized); } @@ -462,6 +523,8 @@ mod crd_serializeation_tests { let json = r#" configurationName: foo + cdiName: akri.sh/foo=bar + capacity: 1 "#; let deserialized: InstanceSpec = serde_yaml::from_str(json).unwrap(); assert_eq!("foo".to_string(), deserialized.configuration_name); @@ -471,7 +534,7 @@ mod crd_serializeation_tests { assert_eq!(0, deserialized.device_usage.len()); let serialized = serde_json::to_string(&deserialized).unwrap(); - let expected_deserialized = r#"{"configurationName":"foo","brokerProperties":{},"shared":false,"nodes":[],"deviceUsage":{}}"#; + let expected_deserialized = r#"{"configurationName":"foo","cdiName":"akri.sh/foo=bar","capacity":1,"brokerProperties":{},"shared":false,"nodes":[],"deviceUsage":{}}"#; assert_eq!(expected_deserialized, serialized); } @@ -479,7 +542,7 @@ mod crd_serializeation_tests { fn test_instance_serialization() { let _ = env_logger::builder().is_test(true).try_init(); - let json = r#"{"configurationName":"blah","brokerProperties":{"a":"two"},"shared":true,"nodes":["n1","n2"],"deviceUsage":{"0":"","1":"n1"}}"#; + let json = r#"{"configurationName":"blah","cdiName": "akri.sh/foo=bar", "capacity": 1, "brokerProperties":{"a":"two"},"shared":true,"nodes":["n1","n2"],"deviceUsage":{"0":"","1":"n1"}}"#; let deserialized: InstanceSpec = serde_json::from_str(json).unwrap(); 
assert_eq!("blah".to_string(), deserialized.configuration_name); assert_eq!(1, deserialized.broker_properties.len()); diff --git a/shared/src/gen_crds.rs b/shared/src/gen_crds.rs new file mode 100644 index 000000000..060fad185 --- /dev/null +++ b/shared/src/gen_crds.rs @@ -0,0 +1,9 @@ +use akri_shared::akri::instance; +use kube::CustomResourceExt; + +pub fn main() { + println!( + "{}", + serde_yaml::to_string(&instance::Instance::crd()).unwrap() + ); +} diff --git a/shared/src/k8s/crud.rs b/shared/src/k8s/crud.rs new file mode 100644 index 000000000..a4ac10286 --- /dev/null +++ b/shared/src/k8s/crud.rs @@ -0,0 +1,147 @@ +use std::fmt::Debug; + +use async_trait::async_trait; +use either::Either; +use kube::{ + api::{Patch, PatchParams}, + core::{ObjectList, ObjectMeta, PartialObjectMetaExt, Status}, + Error, Resource, ResourceExt, +}; +use mockall::automock; +use serde::de::DeserializeOwned; +use serde_json::Value; + +use super::KubeImpl; + +#[automock] +#[async_trait] +pub trait Api: Send + Sync { + fn as_inner(&self) -> kube::Api; + async fn apply(&self, obj: T, field_manager: &str) -> Result; + async fn raw_patch( + &self, + name: &str, + patch: &Patch, + pp: &PatchParams, + ) -> Result; + async fn delete(&self, name: &str) -> Result, Error>; + async fn get(&self, name: &str) -> Result, Error>; + async fn list(&self) -> Result, Error>; + async fn add_finalizer(&self, obj: &T, finalizer: &str) -> Result<(), Error> { + self.set_finalizers( + &obj.name_any(), + Some(vec![finalizer.to_string()]), + &format!("{}-fin", finalizer), + ) + .await + } + async fn remove_finalizer(&self, obj: &T, finalizer: &str) -> Result<(), Error> { + self.set_finalizers(&obj.name_any(), None, &format!("{}-fin", finalizer)) + .await + } + async fn set_finalizers( + &self, + name: &str, + finalizers: Option>, + field_manager: &str, + ) -> Result<(), Error>; +} + +#[async_trait] +impl Api for kube::Api +where + T: Clone + + DeserializeOwned + + Debug + + Resource + + serde::Serialize + + 
Send + + Sync, +{ + fn as_inner(&self) -> kube::Api { + self.to_owned() + } + async fn apply(&self, obj: T, field_manager: &str) -> Result { + let name = obj.name_any(); + let pp = PatchParams::apply(field_manager); + let patch = kube::api::Patch::Apply(obj); + self.patch(&name, &pp, &patch).await + } + async fn raw_patch( + &self, + name: &str, + patch: &Patch, + pp: &PatchParams, + ) -> Result { + self.patch(name, pp, patch).await + } + async fn delete(&self, name: &str) -> Result, Error> { + self.delete(name, &Default::default()).await + } + async fn get(&self, name: &str) -> Result, Error> { + self.get_opt(name).await + } + async fn list(&self) -> Result, Error> { + self.list(&Default::default()).await + } + async fn set_finalizers( + &self, + name: &str, + finalizers: Option>, + field_manager: &str, + ) -> Result<(), Error> { + let metadata = ObjectMeta { + finalizers, + ..Default::default() + } + .into_request_partial::(); + self.patch_metadata( + name, + &PatchParams::apply(field_manager), + &Patch::Apply(&metadata), + ) + .await?; + Ok(()) + } +} + +#[automock] +pub trait IntoApi: Send + Sync { + fn all(&self) -> Box>; + fn namespaced(&self, namespace: &str) -> Box> + where + T: Resource; + fn default_namespaced(&self) -> Box> + where + T: Resource; +} + +impl IntoApi for KubeImpl +where + T: Resource + + Clone + + DeserializeOwned + + Debug + + serde::Serialize + + Send + + Sync + + 'static, +{ + fn all(&self) -> Box> { + Box::new(kube::Api::all(self.client.clone())) + } + + fn namespaced(&self, namespace: &str) -> Box> + where + T: Resource, + { + Box::new(kube::Api::namespaced(self.client.clone(), namespace)) + } + + fn default_namespaced(&self) -> Box> + where + T: Resource, + { + Box::new(kube::Api::default_namespaced(self.client.clone())) + } +} diff --git a/shared/src/k8s/job.rs b/shared/src/k8s/job.rs index ed9a071d3..58baf37fa 100644 --- a/shared/src/k8s/job.rs +++ b/shared/src/k8s/job.rs @@ -90,6 +90,8 @@ pub async fn find_jobs_with_selector( /// 
let api_client = Client::try_default().await.unwrap(); /// let instance_spec = InstanceSpec { /// configuration_name: "configuration_name".to_string(), +/// cdi_name: "akri.sh/configuration_name=instance_name".to_string(), +/// capacity: 1, /// shared: true, /// nodes: Vec::new(), /// device_usage: std::collections::HashMap::new(), diff --git a/shared/src/k8s/mod.rs b/shared/src/k8s/mod.rs index 757abe0b6..7fc7aa79d 100644 --- a/shared/src/k8s/mod.rs +++ b/shared/src/k8s/mod.rs @@ -12,6 +12,7 @@ use k8s_openapi::api::core::v1::{Node, Pod, Service}; use kube::{api::ObjectList, client::Client}; use mockall::{automock, predicate::*}; +pub mod crud; pub mod job; pub mod node; pub mod pod; @@ -509,6 +510,8 @@ impl KubeInterface for KubeImpl { /// kube.create_instance( /// &InstanceSpec{ /// configuration_name: "capability_configuration_name".to_string(), + /// cdi_name: "akri.sh/config-1=instance-1".to_string(), + /// capacity: 1, /// shared: true, /// nodes: Vec::new(), /// device_usage: std::collections::HashMap::new(), @@ -574,6 +577,8 @@ impl KubeInterface for KubeImpl { /// kube.update_instance( /// &InstanceSpec{ /// configuration_name: "capability_configuration_name".to_string(), + /// cdi_name: "akri.sh/capability_configuration_name=instance-1".to_string(), + /// capacity: 1, /// shared: true, /// nodes: Vec::new(), /// device_usage: std::collections::HashMap::new(), diff --git a/shared/src/os/env_var.rs b/shared/src/os/env_var.rs index 384eee2d0..f09c5de79 100644 --- a/shared/src/os/env_var.rs +++ b/shared/src/os/env_var.rs @@ -3,7 +3,7 @@ use std::{env, env::VarError}; /// This provides a mockable way to query an env var. 
#[automock] -pub trait EnvVarQuery { +pub trait EnvVarQuery: Send + Sync { fn get_env_var(&self, name: &'static str) -> Result; fn get_env_vars(&self) -> Vec<(String, String)>; } diff --git a/test/e2e/test_core.py b/test/e2e/test_core.py index 21a8eb7da..a3b0e72fc 100644 --- a/test/e2e/test_core.py +++ b/test/e2e/test_core.py @@ -126,9 +126,10 @@ def test_cleanup(akri_version, faker): def test_slot_reconciliation(): + #TODO: Add some real testing of this feature agent_logs = get_agent_logs(since=20) for logs in agent_logs.values(): - assert "get_node_slots - crictl called successfully" in logs + assert "reclaiming unused slots - start" in logs def test_broker_recreated_if_deleted(basic_config): diff --git a/test/json/local-instance-list.json b/test/json/local-instance-list.json index 381397ca9..47d6d1572 100644 --- a/test/json/local-instance-list.json +++ b/test/json/local-instance-list.json @@ -11,6 +11,8 @@ }, "spec": { "configurationName": "config-b", + "capacity": 5, + "cdiName": "akri.sh/config-b=b494b6", "nodes": [ "node-a" ], "shared": false, "deviceUsage": { diff --git a/test/json/local-instance.json b/test/json/local-instance.json index 34105187b..c3b29fd94 100644 --- a/test/json/local-instance.json +++ b/test/json/local-instance.json @@ -8,6 +8,8 @@ }, "spec": { "configurationName": "config-a", + "capacity": 5, + "cdiName": "akri.sh/config-a=b494b6", "nodes": [ "node-a" ], "shared": false, "deviceUsage": { diff --git a/test/json/shared-instance-list-slots.json b/test/json/shared-instance-list-slots.json index f3c571458..5e053a2c9 100644 --- a/test/json/shared-instance-list-slots.json +++ b/test/json/shared-instance-list-slots.json @@ -11,6 +11,8 @@ }, "spec": { "configurationName": "config-a", + "capacity": 5, + "cdiName": "akri.sh/config-a=359973", "deviceUsage": { "config-a-359973-0": "node-b", "config-a-359973-1": "node-a", diff --git a/test/json/shared-instance-list.json b/test/json/shared-instance-list.json index 1b9914dd1..a237554a0 100644 --- 
a/test/json/shared-instance-list.json +++ b/test/json/shared-instance-list.json @@ -11,6 +11,8 @@ }, "spec": { "configurationName": "config-a", + "capacity": 5, + "cdiName": "akri.sh/config-a=359973", "nodes": [ "node-a" ], "shared": true } diff --git a/test/json/shared-instance-update.json b/test/json/shared-instance-update.json index 81d4b5168..09c68d526 100644 --- a/test/json/shared-instance-update.json +++ b/test/json/shared-instance-update.json @@ -8,6 +8,8 @@ }, "spec": { "configurationName": "config-a", + "capacity": 5, + "cdiName": "akri.sh/config-a=359973", "nodes": [ "node-b" ], "deviceUsage": { "config-a-359973-0": "", diff --git a/test/json/shared-instance.json b/test/json/shared-instance.json index 120d0d169..666937fae 100644 --- a/test/json/shared-instance.json +++ b/test/json/shared-instance.json @@ -8,6 +8,8 @@ }, "spec": { "configurationName": "config-a", + "capacity": 5, + "cdiName": "akri.sh/config-a=359973", "nodes": [ "node-a" ], "shared": true } diff --git a/test/yaml/akri-instance-onvif-camera.yaml b/test/yaml/akri-instance-onvif-camera.yaml index 2aa928801..fafaa6db9 100644 --- a/test/yaml/akri-instance-onvif-camera.yaml +++ b/test/yaml/akri-instance-onvif-camera.yaml @@ -4,6 +4,8 @@ metadata: name: onvif-camera-ffffff spec: configurationName: onvif-camera + capacity: 5 + cdiName: akri.sh/onvif-camera=ffffff shared: true nodes: - "linux-dev3" diff --git a/test/yaml/akri-instance-usb-camera.yaml b/test/yaml/akri-instance-usb-camera.yaml index 97bdc7224..508abed6a 100644 --- a/test/yaml/akri-instance-usb-camera.yaml +++ b/test/yaml/akri-instance-usb-camera.yaml @@ -4,6 +4,8 @@ metadata: name: usb-dev-video0-ffffff spec: configurationName: usb-dev-video0 + capacity: 5 + cdiName: akri.sh/usb-dev-video0=ffffff shared: false nodes: - "linux-dev3" From 39d346b87b24f0d1171b919ee2b0d49a1049a42c Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Fri, 26 Apr 2024 17:39:49 +0200 Subject: [PATCH 2/8] Apply suggestions Signed-off-by: Nicolas Belouin --- 
agent/build.rs | 5 - agent/proto/pluginregistration.proto | 50 ---- .../discovery_property_solver.rs | 2 +- agent/src/discovery_handler_manager/mod.rs | 4 +- agent/src/main.rs | 6 +- .../device_plugin_instance_controller.rs | 31 ++- .../device_plugin_slot_reclaimer.rs | 30 +- agent/src/plugin_manager/mod.rs | 1 - .../src/plugin_manager/pluginregistration.rs | 260 ------------------ .../discovery_configuration_controller.rs | 38 +-- shared/src/k8s/{crud.rs => api.rs} | 0 shared/src/k8s/mod.rs | 2 +- 12 files changed, 72 insertions(+), 357 deletions(-) delete mode 100644 agent/proto/pluginregistration.proto delete mode 100644 agent/src/plugin_manager/pluginregistration.rs rename shared/src/k8s/{crud.rs => api.rs} (100%) diff --git a/agent/build.rs b/agent/build.rs index 34a7e1c1a..b4ee4152b 100644 --- a/agent/build.rs +++ b/agent/build.rs @@ -8,9 +8,4 @@ fn main() { &["./proto"], ) .expect("failed to compile protos"); - tonic_build::configure() - .build_client(false) - .out_dir("./src/plugin_manager") - .compile(&["./proto/pluginregistration.proto"], &["./proto"]) - .expect("failed to compile protos"); } diff --git a/agent/proto/pluginregistration.proto b/agent/proto/pluginregistration.proto deleted file mode 100644 index 6e3be0d78..000000000 --- a/agent/proto/pluginregistration.proto +++ /dev/null @@ -1,50 +0,0 @@ -// To regenerate api.pb.go run `hack/update-codegen.sh protobindings` -syntax = "proto3"; - -package pluginregistration; // This should have been v1. - -// PluginInfo is the message sent from a plugin to the Kubelet pluginwatcher for plugin registration -message PluginInfo { - // Type of the Plugin. CSIPlugin or DevicePlugin - string type = 1; - // Plugin name that uniquely identifies the plugin for the given plugin type. - // For DevicePlugin, this is the resource name that the plugin manages and - // should follow the extended resource name convention. - // For CSI, this is the CSI driver registrar name. 
- string name = 2; - // Optional endpoint location. If found set by Kubelet component, - // Kubelet component will use this endpoint for specific requests. - // This allows the plugin to register using one endpoint and possibly use - // a different socket for control operations. CSI uses this model to delegate - // its registration external from the plugin. - string endpoint = 3; - // Plugin service API versions the plugin supports. - // For DevicePlugin, this maps to the deviceplugin API versions the - // plugin supports at the given socket. - // The Kubelet component communicating with the plugin should be able - // to choose any preferred version from this list, or returns an error - // if none of the listed versions is supported. - repeated string supported_versions = 4; -} - -// RegistrationStatus is the message sent from Kubelet pluginwatcher to the plugin for notification on registration status -message RegistrationStatus { - // True if plugin gets registered successfully at Kubelet - bool plugin_registered = 1; - // Error message in case plugin fails to register, empty string otherwise - string error = 2; -} - -// RegistrationStatusResponse is sent by plugin to kubelet in response to RegistrationStatus RPC -message RegistrationStatusResponse { -} - -// InfoRequest is the empty request message from Kubelet -message InfoRequest { -} - -// Registration is the service advertised by the Plugins. 
-service Registration { - rpc GetInfo(InfoRequest) returns (PluginInfo) {} - rpc NotifyRegistrationStatus(RegistrationStatus) returns (RegistrationStatusResponse) {} -} diff --git a/agent/src/discovery_handler_manager/discovery_property_solver.rs b/agent/src/discovery_handler_manager/discovery_property_solver.rs index c382042be..6c06dd7cb 100644 --- a/agent/src/discovery_handler_manager/discovery_property_solver.rs +++ b/agent/src/discovery_handler_manager/discovery_property_solver.rs @@ -127,7 +127,7 @@ async fn solve_value_from_secret( mod tests { use std::collections::BTreeMap; - use akri_shared::k8s::crud::MockApi; + use akri_shared::k8s::api::MockApi; use k8s_openapi::ByteString; use crate::discovery_handler_manager::mock::MockDiscoveryManagerKubeInterface; diff --git a/agent/src/discovery_handler_manager/mod.rs b/agent/src/discovery_handler_manager/mod.rs index 183881539..6a4c1434d 100644 --- a/agent/src/discovery_handler_manager/mod.rs +++ b/agent/src/discovery_handler_manager/mod.rs @@ -6,7 +6,7 @@ mod registration_socket; use std::{collections::HashMap, sync::Arc}; -use akri_shared::{akri::configuration::Configuration, k8s::crud::IntoApi}; +use akri_shared::{akri::configuration::Configuration, k8s::api::IntoApi}; use k8s_openapi::api::core::v1::{ConfigMap, Secret}; use kube_runtime::reflector::ObjectRef; @@ -58,7 +58,7 @@ impl + IntoApi> DiscoveryManagerKubeInterface for #[cfg(test)] mod mock { - use akri_shared::k8s::crud::{Api, IntoApi, MockIntoApi}; + use akri_shared::k8s::api::{Api, IntoApi, MockIntoApi}; use k8s_openapi::api::core::v1::{ConfigMap, Secret}; #[derive(Default)] pub struct MockDiscoveryManagerKubeInterface { diff --git a/agent/src/main.rs b/agent/src/main.rs index efe1f33f5..3ebc6f9bd 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -44,7 +44,7 @@ async fn main() -> Result<(), Box run_metrics_server().await.unwrap(); })); - let (device_notifier, discovery_handler_registry, conf_notifier) = + let (device_notifier, 
discovery_handler_registry, config_notifier) = discovery_handler_manager::new_registry(kube_client.clone()); let dh_registry = Arc::new(discovery_handler_registry); @@ -85,7 +85,7 @@ async fn main() -> Result<(), Box instances_cache, dh_registry, client: kube_client.clone(), - agent_instance_name: node_name.clone(), + agent_identifier: node_name.clone(), error_backoffs: Mutex::new(HashMap::new()), }, ); @@ -93,7 +93,7 @@ async fn main() -> Result<(), Box tasks.push(tokio::spawn(async { util::discovery_configuration_controller::start_controller( config_controller_context, - conf_notifier, + config_notifier, ) .await; })); diff --git a/agent/src/plugin_manager/device_plugin_instance_controller.rs b/agent/src/plugin_manager/device_plugin_instance_controller.rs index 4ceaa24db..30368637f 100644 --- a/agent/src/plugin_manager/device_plugin_instance_controller.rs +++ b/agent/src/plugin_manager/device_plugin_instance_controller.rs @@ -3,7 +3,7 @@ use std::fmt::Display; use std::str::FromStr; use std::{collections::HashMap, sync::Arc, time::Duration}; -use akri_shared::{akri::instance::Instance, k8s::crud::IntoApi}; +use akri_shared::{akri::instance::Instance, k8s::api::IntoApi}; use async_trait::async_trait; use futures::StreamExt; use itertools::Itertools; @@ -718,8 +718,11 @@ pub struct DevicePluginManager { node_name: String, kube_client: Arc>, device_manager: Arc, + error_backoffs: std::sync::Mutex>, } +const SUCCESS_REQUEUE: Duration = Duration::from_secs(600); + impl DevicePluginManager { pub fn new( node_name: String, @@ -732,6 +735,7 @@ impl DevicePluginManager { node_name, kube_client, device_manager, + error_backoffs: std::sync::Mutex::new(HashMap::default()), } } @@ -885,20 +889,33 @@ pub async fn reconcile( .add_plugin(instance.name_any(), instance_plugin) .await; } - Ok(Action::requeue(Duration::from_secs(300))) + ctx.error_backoffs + .lock() + .unwrap() + .remove(&instance.name_any()); + Ok(Action::requeue(SUCCESS_REQUEUE)) } pub fn error_policy( dc: Arc, 
error: &DevicePluginError, - _ctx: Arc, + ctx: Arc, ) -> Action { - error!( - "Error during reconciliation of Instance {}: {:?}", + let mut error_backoffs = ctx.error_backoffs.lock().unwrap(); + let previous_duration = error_backoffs + .get(&dc.name_any()) + .cloned() + .unwrap_or(Duration::from_millis(500)); + let next_duration = previous_duration * 2; + warn!( + "Error during reconciliation of Instance {:?}::{}, retrying in {}s: {:?}", + dc.namespace(), dc.name_any(), + next_duration.as_secs_f32(), error ); - Action::requeue(Duration::from_secs(60)) + error_backoffs.insert(dc.name_any(), next_duration); + Action::requeue(next_duration) } #[cfg(test)] @@ -907,7 +924,7 @@ mod tests { use akri_shared::{ akri::instance::InstanceSpec, - k8s::crud::{MockApi, MockIntoApi}, + k8s::api::{MockApi, MockIntoApi}, }; use tokio_stream::StreamExt; diff --git a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs index e5f491817..6a0e04a0f 100644 --- a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs +++ b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs @@ -18,6 +18,8 @@ use super::{ /// Path of the Kubelet registry socket pub const KUBELET_SOCKET: &str = "/var/lib/kubelet/pod-resources/kubelet.sock"; +const SLOT_GRACE_PERIOD: Duration = Duration::from_secs(20); +const SLOT_RECLAIM_INTERVAL: Duration = Duration::from_secs(10); async fn get_used_slots() -> Result, anyhow::Error> { // We will ignore this dummy uri because UDS does not use it. 
@@ -73,25 +75,33 @@ pub async fn start_reclaimer(dp_manager: Arc) { let theoretical_slots = dp_manager.get_used_slots().await; trace!("theoretical slots: {:?}", theoretical_slots); let mut new_stalled_slots: HashMap = HashMap::new(); - let now = Instant::now(); - for slot in theoretical_slots.difference(&used_slots) { - if let Some(at) = stalled_slots.get(slot) { - if now.saturating_duration_since(*at) >= Duration::from_secs(20) { - trace!("freeing slot: {}", slot); - if dp_manager.free_slot(slot.to_string()).await.is_err() { - new_stalled_slots.insert(slot.to_string(), at.to_owned()); + let reclaim_iteration_start = Instant::now(); + for slot_to_reclaim in theoretical_slots.difference(&used_slots) { + // See if slot was already stalled at previous iteration + if let Some(at) = stalled_slots.get(slot_to_reclaim) { + if reclaim_iteration_start.saturating_duration_since(*at) >= SLOT_GRACE_PERIOD { + // Slot is stalled for more than grace period, free it + trace!("freeing slot: {}", slot_to_reclaim); + if dp_manager + .free_slot(slot_to_reclaim.to_string()) + .await + .is_err() + { + new_stalled_slots.insert(slot_to_reclaim.to_string(), at.to_owned()); }; } else { - new_stalled_slots.insert(slot.to_string(), at.to_owned()); + // Keep slot as stall + new_stalled_slots.insert(slot_to_reclaim.to_string(), at.to_owned()); } } else { - new_stalled_slots.insert(slot.to_string(), now); + // Mark slot as stall + new_stalled_slots.insert(slot_to_reclaim.to_string(), reclaim_iteration_start); } } stalled_slots = new_stalled_slots; } tokio::select! 
{ - _ = tokio::time::sleep(Duration::from_secs(10)) => {}, + _ = tokio::time::sleep(SLOT_RECLAIM_INTERVAL) => {}, _ = signal.recv() => return, }; } diff --git a/agent/src/plugin_manager/mod.rs b/agent/src/plugin_manager/mod.rs index 5300520f1..527ebf48b 100644 --- a/agent/src/plugin_manager/mod.rs +++ b/agent/src/plugin_manager/mod.rs @@ -1,4 +1,3 @@ -pub mod pluginregistration; // Pros generated pluginregistration module pub mod v1; // Prost generated podresources module pub mod v1beta1; // Prost generated pluginapi module diff --git a/agent/src/plugin_manager/pluginregistration.rs b/agent/src/plugin_manager/pluginregistration.rs deleted file mode 100644 index 8fdd546e4..000000000 --- a/agent/src/plugin_manager/pluginregistration.rs +++ /dev/null @@ -1,260 +0,0 @@ -/// PluginInfo is the message sent from a plugin to the Kubelet pluginwatcher for plugin registration -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct PluginInfo { - /// Type of the Plugin. CSIPlugin or DevicePlugin - #[prost(string, tag = "1")] - pub r#type: ::prost::alloc::string::String, - /// Plugin name that uniquely identifies the plugin for the given plugin type. - /// For DevicePlugin, this is the resource name that the plugin manages and - /// should follow the extended resource name convention. - /// For CSI, this is the CSI driver registrar name. - #[prost(string, tag = "2")] - pub name: ::prost::alloc::string::String, - /// Optional endpoint location. If found set by Kubelet component, - /// Kubelet component will use this endpoint for specific requests. - /// This allows the plugin to register using one endpoint and possibly use - /// a different socket for control operations. CSI uses this model to delegate - /// its registration external from the plugin. - #[prost(string, tag = "3")] - pub endpoint: ::prost::alloc::string::String, - /// Plugin service API versions the plugin supports. 
- /// For DevicePlugin, this maps to the deviceplugin API versions the - /// plugin supports at the given socket. - /// The Kubelet component communicating with the plugin should be able - /// to choose any preferred version from this list, or returns an error - /// if none of the listed versions is supported. - #[prost(string, repeated, tag = "4")] - pub supported_versions: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, -} -/// RegistrationStatus is the message sent from Kubelet pluginwatcher to the plugin for notification on registration status -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct RegistrationStatus { - /// True if plugin gets registered successfully at Kubelet - #[prost(bool, tag = "1")] - pub plugin_registered: bool, - /// Error message in case plugin fails to register, empty string otherwise - #[prost(string, tag = "2")] - pub error: ::prost::alloc::string::String, -} -/// RegistrationStatusResponse is sent by plugin to kubelet in response to RegistrationStatus RPC -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct RegistrationStatusResponse {} -/// InfoRequest is the empty request message from Kubelet -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct InfoRequest {} -/// Generated server implementations. -pub mod registration_server { - #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::*; - /// Generated trait containing gRPC methods that should be implemented for use with RegistrationServer. 
- #[async_trait] - pub trait Registration: Send + Sync + 'static { - async fn get_info( - &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status>; - async fn notify_registration_status( - &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status>; - } - /// Registration is the service advertised by the Plugins. - #[derive(Debug)] - pub struct RegistrationServer { - inner: _Inner, - accept_compression_encodings: EnabledCompressionEncodings, - send_compression_encodings: EnabledCompressionEncodings, - max_decoding_message_size: Option, - max_encoding_message_size: Option, - } - struct _Inner(Arc); - impl RegistrationServer { - pub fn new(inner: T) -> Self { - Self::from_arc(Arc::new(inner)) - } - pub fn from_arc(inner: Arc) -> Self { - let inner = _Inner(inner); - Self { - inner, - accept_compression_encodings: Default::default(), - send_compression_encodings: Default::default(), - max_decoding_message_size: None, - max_encoding_message_size: None, - } - } - pub fn with_interceptor(inner: T, interceptor: F) -> InterceptedService - where - F: tonic::service::Interceptor, - { - InterceptedService::new(Self::new(inner), interceptor) - } - /// Enable decompressing requests with the given encoding. - #[must_use] - pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.accept_compression_encodings.enable(encoding); - self - } - /// Compress responses with the given encoding, if the client supports it. - #[must_use] - pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.send_compression_encodings.enable(encoding); - self - } - /// Limits the maximum size of a decoded message. - /// - /// Default: `4MB` - #[must_use] - pub fn max_decoding_message_size(mut self, limit: usize) -> Self { - self.max_decoding_message_size = Some(limit); - self - } - /// Limits the maximum size of an encoded message. 
- /// - /// Default: `usize::MAX` - #[must_use] - pub fn max_encoding_message_size(mut self, limit: usize) -> Self { - self.max_encoding_message_size = Some(limit); - self - } - } - impl tonic::codegen::Service> for RegistrationServer - where - T: Registration, - B: Body + Send + 'static, - B::Error: Into + Send + 'static, - { - type Response = http::Response; - type Error = std::convert::Infallible; - type Future = BoxFuture; - fn poll_ready( - &mut self, - _cx: &mut Context<'_>, - ) -> Poll> { - Poll::Ready(Ok(())) - } - fn call(&mut self, req: http::Request) -> Self::Future { - let inner = self.inner.clone(); - match req.uri().path() { - "/pluginregistration.Registration/GetInfo" => { - #[allow(non_camel_case_types)] - struct GetInfoSvc(pub Arc); - impl tonic::server::UnaryService for GetInfoSvc { - type Response = super::PluginInfo; - type Future = BoxFuture, tonic::Status>; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = Arc::clone(&self.0); - let fut = - async move { ::get_info(&inner, request).await }; - Box::pin(fut) - } - } - let accept_compression_encodings = self.accept_compression_encodings; - let send_compression_encodings = self.send_compression_encodings; - let max_decoding_message_size = self.max_decoding_message_size; - let max_encoding_message_size = self.max_encoding_message_size; - let inner = self.inner.clone(); - let fut = async move { - let inner = inner.0; - let method = GetInfoSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = tonic::server::Grpc::new(codec) - .apply_compression_config( - accept_compression_encodings, - send_compression_encodings, - ) - .apply_max_message_size_config( - max_decoding_message_size, - max_encoding_message_size, - ); - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/pluginregistration.Registration/NotifyRegistrationStatus" => { - #[allow(non_camel_case_types)] - struct NotifyRegistrationStatusSvc(pub Arc); - impl 
tonic::server::UnaryService - for NotifyRegistrationStatusSvc - { - type Response = super::RegistrationStatusResponse; - type Future = BoxFuture, tonic::Status>; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = Arc::clone(&self.0); - let fut = async move { - ::notify_registration_status(&inner, request) - .await - }; - Box::pin(fut) - } - } - let accept_compression_encodings = self.accept_compression_encodings; - let send_compression_encodings = self.send_compression_encodings; - let max_decoding_message_size = self.max_decoding_message_size; - let max_encoding_message_size = self.max_encoding_message_size; - let inner = self.inner.clone(); - let fut = async move { - let inner = inner.0; - let method = NotifyRegistrationStatusSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = tonic::server::Grpc::new(codec) - .apply_compression_config( - accept_compression_encodings, - send_compression_encodings, - ) - .apply_max_message_size_config( - max_decoding_message_size, - max_encoding_message_size, - ); - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - _ => Box::pin(async move { - Ok(http::Response::builder() - .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") - .body(empty_body()) - .unwrap()) - }), - } - } - } - impl Clone for RegistrationServer { - fn clone(&self) -> Self { - let inner = self.inner.clone(); - Self { - inner, - accept_compression_encodings: self.accept_compression_encodings, - send_compression_encodings: self.send_compression_encodings, - max_decoding_message_size: self.max_decoding_message_size, - max_encoding_message_size: self.max_encoding_message_size, - } - } - } - impl Clone for _Inner { - fn clone(&self) -> Self { - Self(Arc::clone(&self.0)) - } - } - impl std::fmt::Debug for _Inner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0) - } - } - impl 
tonic::server::NamedService for RegistrationServer { - const NAME: &'static str = "pluginregistration.Registration"; - } -} diff --git a/agent/src/util/discovery_configuration_controller.rs b/agent/src/util/discovery_configuration_controller.rs index 9f0edf78a..723d615dd 100644 --- a/agent/src/util/discovery_configuration_controller.rs +++ b/agent/src/util/discovery_configuration_controller.rs @@ -9,7 +9,7 @@ use akri_shared::{ configuration::{Configuration, DiscoveryProperty}, instance::Instance, }, - k8s::crud::IntoApi, + k8s::api::IntoApi, }; use futures::StreamExt; use tokio::sync::mpsc; @@ -34,6 +34,8 @@ pub enum Error { Other(#[from] anyhow::Error), } +const SUCCESS_REQUEUE: Duration = Duration::from_secs(600); + pub trait DiscoveryConfigurationKubeClient: IntoApi + IntoApi {} impl + IntoApi> DiscoveryConfigurationKubeClient for T {} @@ -42,10 +44,12 @@ pub struct ControllerContext { pub instances_cache: Store, pub dh_registry: Arc, pub client: Arc, - pub agent_instance_name: String, + pub agent_identifier: String, pub error_backoffs: Mutex>, } +/// This function starts the reconciling loop for the Configuration controller. +/// It is expected to run this as a task. 
pub async fn start_controller( ctx: Arc, rec: mpsc::Receiver>, @@ -59,6 +63,7 @@ pub async fn start_controller( tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); signal.recv().await; }) + // Reconcile the Configuration when the discovery handler manager signals a change .reconcile_on(tokio_stream::wrappers::ReceiverStream::new(rec)) .run(reconcile, error_policy, ctx) .for_each(|_| futures::future::ready(())) @@ -77,30 +82,29 @@ pub async fn reconcile( ctx.client .namespaced(&namespace) - .remove_finalizer(dc.as_ref(), &ctx.agent_instance_name) + .remove_finalizer(dc.as_ref(), &ctx.agent_identifier) .await .map_err(|e| Error::Other(e.into()))?; return Ok(Action::await_change()); } - if !dc.finalizers().contains(&ctx.agent_instance_name) { + if !dc.finalizers().contains(&ctx.agent_identifier) { ctx.client .namespaced(&namespace) - .add_finalizer(dc.as_ref(), &ctx.agent_instance_name) + .add_finalizer(dc.as_ref(), &ctx.agent_identifier) .await .map_err(|e| Error::Other(e.into()))? } let dh_name = &dc.spec.discovery_handler.name; let dh_details = &dc.spec.discovery_handler.discovery_details; - let empty_vec = vec![]; - let dh_properties: &Vec = dc + let dh_properties: &[DiscoveryProperty] = dc .spec .discovery_handler .discovery_properties - .as_ref() - .unwrap_or(&empty_vec); + .as_deref() + .unwrap_or_default(); let dh_extra_device_properties = dc.spec.broker_properties.clone(); let discovered_instances: Vec = @@ -110,7 +114,7 @@ pub async fn reconcile( .into_iter() .map(|mut instance| { // Add - instance.spec.nodes = vec![ctx.agent_instance_name.to_owned()]; + instance.spec.nodes = vec![ctx.agent_identifier.to_owned()]; instance.owner_references_mut().push(owner_ref.clone()); instance.spec.capacity = dc.spec.capacity; instance @@ -140,7 +144,7 @@ pub async fn reconcile( delete_instance( ctx.client.as_ref(), instance.as_ref(), - &ctx.agent_instance_name, + &ctx.agent_identifier, ) .await? 
} @@ -149,13 +153,13 @@ pub async fn reconcile( for instance in discovered_instances { ctx.client .namespaced(&namespace) - .apply(instance, &ctx.agent_instance_name) + .apply(instance, &ctx.agent_identifier) .await .map_err(|e| Error::Other(e.into()))?; } ctx.error_backoffs.lock().unwrap().remove(&dc.name_any()); - Ok(Action::requeue(Duration::from_secs(600))) + Ok(Action::requeue(SUCCESS_REQUEUE)) } pub fn error_policy(dc: Arc, error: &Error, ctx: Arc) -> Action { @@ -205,7 +209,7 @@ mod tests { configuration::{ConfigurationSpec, DiscoveryHandlerInfo}, instance::InstanceSpec, }, - k8s::crud::{Api, MockApi, MockIntoApi}, + k8s::api::{Api, MockApi, MockIntoApi}, }; use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference; use kube::core::{ObjectMeta, Status}; @@ -297,7 +301,7 @@ mod tests { instances_cache: store, dh_registry: Arc::new(MockDiscoveryHandlerRegistry::new()), client: Arc::new(MockDiscoveryConfigurationKubeClient::default()), - agent_instance_name: "node-a".to_string(), + agent_identifier: "node-a".to_string(), error_backoffs: Default::default(), }); @@ -473,7 +477,7 @@ mod tests { instances_cache: store, dh_registry: Arc::new(registry), client: Arc::new(client), - agent_instance_name: "node-a".to_string(), + agent_identifier: "node-a".to_string(), error_backoffs: Default::default(), }); @@ -608,7 +612,7 @@ mod tests { instances_cache: store, dh_registry: Arc::new(registry), client: Arc::new(client), - agent_instance_name: "node-a".to_string(), + agent_identifier: "node-a".to_string(), error_backoffs: Default::default(), }); diff --git a/shared/src/k8s/crud.rs b/shared/src/k8s/api.rs similarity index 100% rename from shared/src/k8s/crud.rs rename to shared/src/k8s/api.rs diff --git a/shared/src/k8s/mod.rs b/shared/src/k8s/mod.rs index 7fc7aa79d..e004a44c3 100644 --- a/shared/src/k8s/mod.rs +++ b/shared/src/k8s/mod.rs @@ -12,7 +12,7 @@ use k8s_openapi::api::core::v1::{Node, Pod, Service}; use kube::{api::ObjectList, client::Client}; use 
mockall::{automock, predicate::*}; -pub mod crud; +pub mod api; pub mod job; pub mod node; pub mod pod; From 006f78a633bc4d52c03d4d16621dea1a80ebacf2 Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Mon, 29 Apr 2024 15:43:31 +0200 Subject: [PATCH 3/8] Address more comments Signed-off-by: Nicolas Belouin --- agent/Cargo.toml | 2 +- .../discovery_handler_registry.rs | 4 --- .../embedded_handler.rs | 3 -- .../registration_socket.rs | 9 ++---- agent/src/main.rs | 9 +++--- .../device_plugin_instance_controller.rs | 28 ++++++++----------- .../device_plugin_slot_reclaimer.rs | 13 +++++---- .../discovery_configuration_controller.rs | 22 +++++++++------ agent/src/util/stopper.rs | 4 +-- 9 files changed, 42 insertions(+), 52 deletions(-) diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 1666e20c7..16efa261b 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -36,7 +36,7 @@ serde_derive = "1.0.104" serde_json = "1.0.45" serde_yaml = { version = "0.8.11", optional = true } thiserror = "1.0.50" -tokio = { version = "1.0", features = ["rt-multi-thread", "time", "fs", "macros", "net", "signal"] } +tokio = { version = "1.0", features = ["rt-multi-thread", "time", "fs", "macros", "net"] } tokio-stream = { version = "0.1", features = ["net", "sync"] } tonic = "0.10" tower = "0.4.8" diff --git a/agent/src/discovery_handler_manager/discovery_handler_registry.rs b/agent/src/discovery_handler_manager/discovery_handler_registry.rs index f12a24c06..d81c2bb3d 100644 --- a/agent/src/discovery_handler_manager/discovery_handler_registry.rs +++ b/agent/src/discovery_handler_manager/discovery_handler_registry.rs @@ -403,14 +403,10 @@ impl DiscoveryHandlerRegistry for DHRegistryImpl { let notifier_receiver = self.endpoint_notifier.subscribe(); let local_req = self.requests.clone(); tokio::spawn(async move { - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) - .unwrap(); select! 
{ _ = dh_req_ref .watch_devices(notifier_receiver) => {}, _ = terminated.notified() => {}, - _ = signal.recv() => {}, } local_req.write().await.remove(&local_key); }); diff --git a/agent/src/discovery_handler_manager/embedded_handler.rs b/agent/src/discovery_handler_manager/embedded_handler.rs index c82344320..6d0320892 100644 --- a/agent/src/discovery_handler_manager/embedded_handler.rs +++ b/agent/src/discovery_handler_manager/embedded_handler.rs @@ -36,12 +36,9 @@ impl EmbeddedHandlerEndpoint { sender: watch::Sender>>, mut stream: ReceiverStream>, ) { - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); loop { let msg = select! { _ = sender.closed() => return, - _ = signal.recv() => return, msg = stream.try_next() => match msg { Ok(Some(msg)) => msg, Ok(None) => { diff --git a/agent/src/discovery_handler_manager/registration_socket.rs b/agent/src/discovery_handler_manager/registration_socket.rs index ffdefae34..1bc67aadd 100644 --- a/agent/src/discovery_handler_manager/registration_socket.rs +++ b/agent/src/discovery_handler_manager/registration_socket.rs @@ -7,7 +7,7 @@ use akri_discovery_utils::discovery::v0::{ }; use akri_shared::uds::unix_stream; use async_trait::async_trait; -use futures::{FutureExt, Stream, StreamExt, TryFutureExt}; +use futures::{Stream, StreamExt, TryFutureExt}; use tokio::{select, sync::watch}; use tokio_stream::StreamExt as _; use tonic::{transport::Channel, Request, Response, Status}; @@ -67,15 +67,12 @@ impl NetworkEndpoint { sender: watch::Sender>>, mut stream: Pin> + Send>>, ) { - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); loop { let msg = select! { // This means all queries for this endpoint must end. 
_ = stopper.stopped() => return, // This means all receiver dropped (i.e no one cares about this query anymore) _ = sender.closed() => return, - _ = signal.recv() => return, msg = stream.try_next() => match msg { Ok(Some(msg)) => msg, Ok(None) => { @@ -216,8 +213,6 @@ pub async fn run_registration_server( } } }; - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); tonic::transport::Server::builder() .add_service( akri_discovery_utils::discovery::v0::registration_server::RegistrationServer::new( @@ -227,7 +222,7 @@ pub async fn run_registration_server( }, ), ) - .serve_with_incoming_shutdown(incoming, signal.recv().map(|_| ())) + .serve_with_incoming(incoming) .await?; trace!( "internal_run_registration_server - gracefully shutdown ... deleting socket {}", diff --git a/agent/src/main.rs b/agent/src/main.rs index 3ebc6f9bd..75e1851d6 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -71,10 +71,11 @@ async fn main() -> Result<(), Box ), ); - let (instances_cache, task) = plugin_manager::device_plugin_instance_controller::start_dpm( - device_plugin_manager.clone(), - ); - tasks.push(task); + let (instances_cache, device_plugin_controller_task) = + plugin_manager::device_plugin_instance_controller::start_dpm( + device_plugin_manager.clone(), + ); + tasks.push(device_plugin_controller_task); tasks.push(tokio::spawn( plugin_manager::device_plugin_slot_reclaimer::start_reclaimer(device_plugin_manager), diff --git a/agent/src/plugin_manager/device_plugin_instance_controller.rs b/agent/src/plugin_manager/device_plugin_instance_controller.rs index 30368637f..9ec4c8fc5 100644 --- a/agent/src/plugin_manager/device_plugin_instance_controller.rs +++ b/agent/src/plugin_manager/device_plugin_instance_controller.rs @@ -4,12 +4,13 @@ use std::str::FromStr; use std::{collections::HashMap, sync::Arc, time::Duration}; use akri_shared::{akri::instance::Instance, k8s::api::IntoApi}; +use anyhow::Context; use 
async_trait::async_trait; use futures::StreamExt; use itertools::Itertools; use kube::api::{Patch, PatchParams}; use kube::core::{NotUsed, Object, ObjectMeta, TypeMeta}; -use kube::ResourceExt; +use kube::{Resource, ResourceExt}; use kube_runtime::controller::Action; use kube_runtime::reflector::Store; use kube_runtime::Controller; @@ -27,6 +28,8 @@ use super::device_plugin_runner::{ }; use super::v1beta1::{AllocateRequest, AllocateResponse, ListAndWatchResponse}; +pub const DP_SLOT_PREFIX: &str = "akri.sh/"; + #[derive(Error, Debug)] pub enum DevicePluginError { #[error("Slot already in use")] @@ -223,8 +226,8 @@ impl InstanceDevicePlugin { let patch = Patch::Apply( serde_json::to_value(Object { types: Some(TypeMeta { - api_version: "akri.sh/v0".to_owned(), - kind: "Instance".to_owned(), + api_version: Instance::api_version(&()).to_string(), + kind: Instance::kind(&()).to_string(), }), status: None::, spec: PartialInstanceSlotUsage { device_usage }, @@ -233,7 +236,7 @@ impl InstanceDevicePlugin { ..Default::default() }, }) - .unwrap(), + .context("Could not create instance patch")?, ); api.raw_patch( &self.instance_name, @@ -280,8 +283,8 @@ impl InstanceDevicePlugin { let patch = Patch::Apply( serde_json::to_value(Object { types: Some(TypeMeta { - api_version: "akri.sh/v0".to_owned(), - kind: "Instance".to_owned(), + api_version: Instance::api_version(&()).to_string(), + kind: Instance::kind(&()).to_string(), }), status: None::, spec: PartialInstanceSlotUsage { device_usage }, @@ -290,7 +293,7 @@ impl InstanceDevicePlugin { ..Default::default() }, }) - .unwrap(), + .context("Could not create instance patch")?, ); api.raw_patch( &self.instance_name, @@ -487,8 +490,6 @@ impl ConfigurationDevicePlugin { let instance_name = plugin.instance_name.clone(); let mut receiver = plugin.slots_status.lock().await.subscribe(); tokio::spawn(async move { - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); loop { { let 
(has_free, used_config_slots) = { @@ -568,7 +569,6 @@ impl ConfigurationDevicePlugin { break; } }, - _ = signal.recv() => {break} } } slots_ref.write().await.send_modify(|slots| { @@ -779,7 +779,7 @@ impl DevicePluginManager { .enumerate() .filter_map(|(i, u)| match u { DeviceUsage::Node(n) if *n == self.node_name => { - Some(format!("akri.sh/{}-{}", instance, i)) + Some(format!("{}{}-{}", DP_SLOT_PREFIX, instance, i)) } DeviceUsage::Configuration { vdev, node } if *node == self.node_name => { Some(vdev.to_string()) @@ -798,12 +798,6 @@ pub fn start_dpm(dpm: Arc) -> (Store, JoinHandle< let store = controller.store(); let task = tokio::spawn(async { controller - .graceful_shutdown_on(async { - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) - .unwrap(); - signal.recv().await; - }) .run(reconcile, error_policy, dpm) .for_each(|_| futures::future::ready(())) .await diff --git a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs index 6a0e04a0f..1c9d41e34 100644 --- a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs +++ b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs @@ -9,7 +9,9 @@ use tokio::net::UnixStream; use tonic::transport::{Endpoint, Uri}; use tower::service_fn; -use crate::plugin_manager::v1::ListPodResourcesRequest; +use crate::plugin_manager::{ + device_plugin_instance_controller::DP_SLOT_PREFIX, v1::ListPodResourcesRequest, +}; use super::{ device_plugin_instance_controller::DevicePluginManager, @@ -21,6 +23,10 @@ pub const KUBELET_SOCKET: &str = "/var/lib/kubelet/pod-resources/kubelet.sock"; const SLOT_GRACE_PERIOD: Duration = Duration::from_secs(20); const SLOT_RECLAIM_INTERVAL: Duration = Duration::from_secs(10); +/// This function connects to kubelet's resource monitoring interface and extracts +/// the set of resources currently used by pods on the node. 
+/// It uses this Kubelet interface: +/// https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#grpc-endpoint-list async fn get_used_slots() -> Result, anyhow::Error> { // We will ignore this dummy uri because UDS does not use it. // Some servers will check the uri content so the uri needs to @@ -51,7 +57,7 @@ async fn get_used_slots() -> Result, anyhow::Error> { .flat_map(|pr| { pr.containers.into_iter().flat_map(|cr| { cr.devices.into_iter().flat_map(|cd| { - if cd.resource_name.starts_with("akri.sh/") { + if cd.resource_name.starts_with(DP_SLOT_PREFIX) { cd.device_ids } else { vec![] @@ -66,8 +72,6 @@ async fn get_used_slots() -> Result, anyhow::Error> { pub async fn start_reclaimer(dp_manager: Arc) { let mut stalled_slots: HashMap = HashMap::new(); - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); loop { trace!("reclaiming unused slots - start"); if let Ok(used_slots) = get_used_slots().await { @@ -102,7 +106,6 @@ pub async fn start_reclaimer(dp_manager: Arc) { } tokio::select! 
{ _ = tokio::time::sleep(SLOT_RECLAIM_INTERVAL) => {}, - _ = signal.recv() => return, }; } } diff --git a/agent/src/util/discovery_configuration_controller.rs b/agent/src/util/discovery_configuration_controller.rs index 723d615dd..496a783fa 100644 --- a/agent/src/util/discovery_configuration_controller.rs +++ b/agent/src/util/discovery_configuration_controller.rs @@ -58,11 +58,6 @@ pub async fn start_controller( let controller = Controller::new(api, Default::default()); controller - .graceful_shutdown_on(async { - let mut signal = - tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()).unwrap(); - signal.recv().await; - }) // Reconcile the Configuration when the discovery handler manager signals a change .reconcile_on(tokio_stream::wrappers::ReceiverStream::new(rec)) .run(reconcile, error_policy, ctx) @@ -70,6 +65,17 @@ pub async fn start_controller( .await; } +/// This function is the main Reconcile function for Configurations resources +/// This will get called every time a Configuration gets added or is changed, it will also be called +/// for every existing configuration on startup. 
+/// We also set-up discovery manager to trigger reconciliation upon discovery state change +/// +/// Here the function will (in order): +/// - Check if Configuration awaits deletion, and if so terminate pending discovery, remove finalizer and return early +/// - Add finalizer if not here already +/// - Start discovery if not already started +/// - Get discovery results (empty list if just started) +/// - Create/Delete Instances according to discovery results pub async fn reconcile( dc: Arc, ctx: Arc, @@ -515,7 +521,7 @@ mod tests { namespace: Some("namespace-a".to_string()), name: Some("instance-1".to_string()), owner_references: Some(vec![OwnerReference { - api_version: "akri.sh/v0".to_string(), + api_version: Instance::api_version(&()).to_string(), block_owner_deletion: None, controller: Some(true), kind: "Configuration".to_string(), @@ -539,7 +545,7 @@ mod tests { namespace: Some("namespace-a".to_string()), name: Some("instance-2".to_string()), owner_references: Some(vec![OwnerReference { - api_version: "akri.sh/v0".to_string(), + api_version: Instance::api_version(&()).to_string(), block_owner_deletion: None, controller: Some(true), kind: "Configuration".to_string(), @@ -563,7 +569,7 @@ mod tests { namespace: Some("namespace-a".to_string()), name: Some("instance-3".to_string()), owner_references: Some(vec![OwnerReference { - api_version: "akri.sh/v0".to_string(), + api_version: Instance::api_version(&()).to_string(), block_owner_deletion: None, controller: Some(true), kind: "Configuration".to_string(), diff --git a/agent/src/util/stopper.rs b/agent/src/util/stopper.rs index 0649cb6a5..386faa142 100644 --- a/agent/src/util/stopper.rs +++ b/agent/src/util/stopper.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use futures::stream::{AbortHandle, Abortable}; -use tokio::{signal::unix::SignalKind, sync::watch}; +use tokio::sync::watch; #[derive(Clone)] pub struct Stopper { @@ -16,10 +16,8 @@ impl Stopper { }; let local_s = s.clone(); tokio::spawn(async move { - let mut 
signal = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); tokio::select! { _ = local_s.stopped() => {}, - _ = signal.recv() => local_s.stop() } }); s From cef20546d9503a3a8e2e0e72a4cd9c118c5dd90a Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Tue, 30 Apr 2024 09:45:20 +0200 Subject: [PATCH 4/8] Fix documentation build Signed-off-by: Nicolas Belouin --- agent/src/plugin_manager/device_plugin_slot_reclaimer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs index 1c9d41e34..4843aa3fd 100644 --- a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs +++ b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs @@ -26,7 +26,7 @@ const SLOT_RECLAIM_INTERVAL: Duration = Duration::from_secs(10); /// This function connects to kubelet's resource monitoring interface and extracts /// the set of resources currently used by pods on the node. /// It uses this Kubelet interface: -/// https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#grpc-endpoint-list +/// <https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/#grpc-endpoint-list> async fn get_used_slots() -> Result, anyhow::Error> { // We will ignore this dummy uri because UDS does not use it. 
// Some servers will check the uri content so the uri needs to From 2e3c3a51b971eebb5666371a6540b98e07b4a22b Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Tue, 30 Apr 2024 16:04:05 +0200 Subject: [PATCH 5/8] Improve reclaimer logging Signed-off-by: Nicolas Belouin --- .../plugin_manager/device_plugin_instance_controller.rs | 1 + agent/src/plugin_manager/device_plugin_slot_reclaimer.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/agent/src/plugin_manager/device_plugin_instance_controller.rs b/agent/src/plugin_manager/device_plugin_instance_controller.rs index 9ec4c8fc5..6f2e7a46d 100644 --- a/agent/src/plugin_manager/device_plugin_instance_controller.rs +++ b/agent/src/plugin_manager/device_plugin_instance_controller.rs @@ -858,6 +858,7 @@ pub async fn reconcile( plugin } Some(plugin) => { + // TODO: Add a way to handle a change in the instance's capacity. plugin.update_slots(&instance.spec.device_usage).await?; plugin.clone() } diff --git a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs index 4843aa3fd..6d5ed39f6 100644 --- a/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs +++ b/agent/src/plugin_manager/device_plugin_slot_reclaimer.rs @@ -75,9 +75,7 @@ pub async fn start_reclaimer(dp_manager: Arc) { loop { trace!("reclaiming unused slots - start"); if let Ok(used_slots) = get_used_slots().await { - trace!("used slots: {:?}", used_slots); let theoretical_slots = dp_manager.get_used_slots().await; - trace!("theoretical slots: {:?}", theoretical_slots); let mut new_stalled_slots: HashMap = HashMap::new(); let reclaim_iteration_start = Instant::now(); for slot_to_reclaim in theoretical_slots.difference(&used_slots) { @@ -91,6 +89,12 @@ pub async fn start_reclaimer(dp_manager: Arc) { .await .is_err() { + warn!( + "Failed to free slot {}, will try again in {}s", + slot_to_reclaim, + SLOT_RECLAIM_INTERVAL.as_secs() + ); + // To try again we just keep the slot 
as stalled new_stalled_slots.insert(slot_to_reclaim.to_string(), at.to_owned()); }; } else { From c7998337f0879512c5c8fa051cde7ad0c99a3819 Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Thu, 2 May 2024 15:50:07 +0200 Subject: [PATCH 6/8] Directly use kube-rs Client structure in Agent Signed-off-by: Nicolas Belouin --- agent/src/main.rs | 2 +- shared/src/k8s/api.rs | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/agent/src/main.rs b/agent/src/main.rs index 75e1851d6..3416d0af6 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -37,7 +37,7 @@ async fn main() -> Result<(), Box let node_name = env::var("AGENT_NODE_NAME")?; { - let kube_client = Arc::new(akri_shared::k8s::KubeImpl::new().await?); + let kube_client = Arc::new(kube::Client::try_default().await?); // Start server for Prometheus metrics tasks.push(tokio::spawn(async move { diff --git a/shared/src/k8s/api.rs b/shared/src/k8s/api.rs index a4ac10286..dd90f43d7 100644 --- a/shared/src/k8s/api.rs +++ b/shared/src/k8s/api.rs @@ -145,3 +145,33 @@ where Box::new(kube::Api::default_namespaced(self.client.clone())) } } + +impl IntoApi for kube::Client +where + T: Resource + + Clone + + DeserializeOwned + + Debug + + serde::Serialize + + Send + + Sync + + 'static, +{ + fn all(&self) -> Box> { + Box::new(kube::Api::all(self.clone())) + } + + fn namespaced(&self, namespace: &str) -> Box> + where + T: Resource, + { + Box::new(kube::Api::namespaced(self.clone(), namespace)) + } + + fn default_namespaced(&self) -> Box> + where + T: Resource, + { + Box::new(kube::Api::default_namespaced(self.clone())) + } +} From 3145c7127f7482c7423cfbfb5b40df01a5e72a12 Mon Sep 17 00:00:00 2001 From: Nicolas Belouin Date: Fri, 28 Jun 2024 15:34:25 +0200 Subject: [PATCH 7/8] Improve documentation and clarity of code Signed-off-by: Nicolas Belouin --- agent/src/device_manager/in_memory.rs | 1 + .../discovery_handler_registry.rs | 152 ++++++++++-------- 
.../device_plugin_instance_controller.rs | 16 +- .../plugin_manager/device_plugin_runner.rs | 19 ++- 4 files changed, 108 insertions(+), 80 deletions(-) diff --git a/agent/src/device_manager/in_memory.rs b/agent/src/device_manager/in_memory.rs index 61a458413..54611c4e3 100644 --- a/agent/src/device_manager/in_memory.rs +++ b/agent/src/device_manager/in_memory.rs @@ -18,6 +18,7 @@ impl DeviceManager for InMemoryManager { /// It returns None if the device is not registered to the device manager /// If the device is registered, it resolves its properties by merging the device specific properties /// with the configuration (kind) level properties + /// Also change the name of the device in the returned structure to match the name used by Device Plugin fn get(&self, fqdn: &str) -> Option { let (kind, id) = fqdn.split_once('=').unwrap(); let state = self.state.borrow(); diff --git a/agent/src/discovery_handler_manager/discovery_handler_registry.rs b/agent/src/discovery_handler_manager/discovery_handler_registry.rs index d81c2bb3d..c8c558cda 100644 --- a/agent/src/discovery_handler_manager/discovery_handler_registry.rs +++ b/agent/src/discovery_handler_manager/discovery_handler_registry.rs @@ -6,6 +6,7 @@ use akri_shared::akri::configuration::{Configuration, DiscoveryProperty}; use akri_shared::akri::instance::Instance; use akri_shared::akri::instance::InstanceSpec; +use akri_shared::akri::AKRI_PREFIX; use async_trait::async_trait; use blake2::digest::{Update, VariableOutput}; use blake2::VarBlake2b; @@ -48,7 +49,7 @@ impl DiscoveredDevice { let mut id_to_digest = id_to_digest.to_string(); // For local devices, include node hostname in id_to_digest so instances have unique names if !shared { - id_to_digest = format!("{}{}", &id_to_digest, node_name,); + id_to_digest = format!("{}{}", id_to_digest, node_name); } let mut digest = String::new(); let mut hasher = VarBlake2b::new(3).unwrap(); @@ -109,16 +110,23 @@ pub trait DiscoveryHandlerEndpoint: Send + Sync { fn 
is_closed(&self) -> bool; } -/// This trait is here to help with testing for code that interract with the discovery handler registry +/// This trait is here to help with testing for code that interacts with the discovery handler registry. +/// This trait represents a request made to a DH (either locally or through gRPC call). It aggregates the +/// results across the different registered handlers of that type, and generates the Instance objects for discovered +/// devices. #[cfg_attr(test, automock)] pub trait DiscoveryHandlerRequest: Sync + Send { fn get_instances(&self) -> Result, DiscoveryError>; } /// This trait is here to help with testing for code that interract with the discovery handler registry +/// In the context of this trait, a "request" is a DiscoveryHandlerRequest. #[cfg_attr(test, automock)] #[async_trait] pub trait DiscoveryHandlerRegistry: Sync + Send { + /// Create a new request against a specific Discovery Handler type. The DH Registry will ensure it + /// gets sent to all registered handlers with this name, present and future. If no DH with that name + /// is registered, returns an error. 
async fn new_request( &self, key: &str, @@ -189,10 +197,13 @@ impl DHRequestImpl { } fn get_device_cdi_fqdn(&self, dev: &DiscoveredDevice) -> String { - format!("akri.sh/{}={}", self.key, dev.device_hash()) + format!("{}/{}={}", AKRI_PREFIX, self.key, dev.device_hash()) } - async fn watch_devices(&self, mut rec: broadcast::Receiver>) { + async fn watch_devices( + &self, + mut new_dh_receiver: broadcast::Receiver>, + ) { loop { let mut local_endpoints = self.endpoints.write().await.clone(); let futures = local_endpoints.iter_mut().map(|e| e.changed().boxed()); @@ -206,12 +217,12 @@ impl DHRequestImpl { } } }, - Ok(endpoint) = rec.recv() => { - if endpoint.get_name() != self.handler_name { + Ok(new_dh_endpoint) = new_dh_receiver.recv() => { + if new_dh_endpoint.get_name() != self.handler_name { // We woke up for another kind of DH, let's get back to sleep continue } - if let Ok(q) = self.query(endpoint).await { + if let Ok(q) = self.query(new_dh_endpoint).await { self.endpoints.write().await.push(q); } }, @@ -237,14 +248,14 @@ impl DHRequestImpl { async fn query( &self, - endpoint: Arc, + discovery_handler: Arc, ) -> Result>>, DiscoveryError> { let (q_sender, q_receiver) = watch::channel(vec![]); let query_body = DiscoverRequest { discovery_details: self.details.clone(), discovery_properties: self.solve_discovery_properties().await?, }; - endpoint.query(q_sender, query_body).await?; + discovery_handler.query(q_sender, query_body).await?; Ok(q_receiver) } @@ -293,6 +304,64 @@ impl DHRegistryImpl { } } +async fn handle_request( + mut req_notifier: watch::Receiver>>, + key: String, + namespace: &String, + cdi_sender: Arc>>>, + local_config_sender: mpsc::Sender>, + extra_device_properties: HashMap, +) { + let cdi_kind = format!("{}/{}", AKRI_PREFIX, key); + loop { + match req_notifier.changed().await { + Ok(_) => { + cdi_sender.lock().await.send_modify(|kind| { + kind.insert( + cdi_kind.clone(), + crate::device_manager::cdi::Kind { + kind: cdi_kind.clone(), + 
annotations: Default::default(), + devices: req_notifier + .borrow_and_update() + .iter() + .map(|d| d.as_ref().clone().into()) + .collect(), + container_edits: vec![ContainerEdit { + env: extra_device_properties + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect(), + ..Default::default() + }], + }, + ); + }); + trace!("Ask for reconciliation of {}::{}", namespace, key); + let res = local_config_sender + .send(ObjectRef::::new(&key).within(namespace)) + .await; + if res.is_err() { + cdi_sender.lock().await.send_modify(|kind| { + kind.remove(&cdi_kind); + }); + return; + } + } + Err(_) => { + trace!("Ask for reconciliation of {}::{}", namespace, key); + let _ = local_config_sender + .send(ObjectRef::::new(&key).within(namespace)) + .await; + cdi_sender.lock().await.send_modify(|kind| { + kind.remove(&cdi_kind); + }); + return; + } + } + } +} + #[async_trait] impl DiscoveryHandlerRegistry for DHRegistryImpl { async fn new_request( @@ -330,7 +399,7 @@ impl DiscoveryHandlerRegistry for DHRegistryImpl { req_w.insert(key.to_string(), Arc::new(dh_req)); } let dh_req_ref = self.requests.read().await.get(key).unwrap().to_owned(); - let mut local_req_notifier = self + let local_req_notifier = self .requests .read() .await @@ -343,60 +412,15 @@ impl DiscoveryHandlerRegistry for DHRegistryImpl { let local_key = key.to_owned(); let namespace = namespace.to_owned(); tokio::spawn(async move { - let cdi_kind = format!("akri.sh/{}", local_key); - loop { - match local_req_notifier.changed().await { - Ok(_) => { - local_cdi_sender.lock().await.send_modify(|kind| { - kind.insert( - cdi_kind.clone(), - crate::device_manager::cdi::Kind { - kind: cdi_kind.clone(), - annotations: Default::default(), - devices: local_req_notifier - .borrow_and_update() - .iter() - .map(|d| d.as_ref().clone().into()) - .collect(), - container_edits: vec![ContainerEdit { - env: extra_device_properties - .iter() - .map(|(k, v)| format!("{}={}", k, v)) - .collect(), - ..Default::default() - }], - }, 
- ); - }); - trace!("Ask for reconciliation of {}::{}", namespace, local_key); - let res = local_config_sender - .send( - ObjectRef::::new(&local_key) - .within(&namespace), - ) - .await; - if res.is_err() { - local_cdi_sender.lock().await.send_modify(|kind| { - kind.remove(&cdi_kind); - }); - return; - } - } - Err(_) => { - trace!("Ask for reconciliation of {}::{}", namespace, local_key); - let _ = local_config_sender - .send( - ObjectRef::::new(&local_key) - .within(&namespace), - ) - .await; - local_cdi_sender.lock().await.send_modify(|kind| { - kind.remove(&cdi_kind); - }); - return; - } - } - } + handle_request( + local_req_notifier, + local_key, + &namespace, + local_cdi_sender, + local_config_sender, + extra_device_properties, + ) + .await }); let local_key = key.to_owned(); diff --git a/agent/src/plugin_manager/device_plugin_instance_controller.rs b/agent/src/plugin_manager/device_plugin_instance_controller.rs index 6f2e7a46d..45a26b8b2 100644 --- a/agent/src/plugin_manager/device_plugin_instance_controller.rs +++ b/agent/src/plugin_manager/device_plugin_instance_controller.rs @@ -369,10 +369,10 @@ impl InternalDevicePlugin for InstanceDevicePlugin { let receiver_stream = tokio_stream::wrappers::WatchStream::new(receiver); Ok(tonic::Response::new(DeviceUsageStream { - f: instance_device_usage_to_device, - st: self.stopper.make_abortable(receiver_stream), - str_1: device_name, - str_2: node_name, + device_usage_to_device: instance_device_usage_to_device, + input_stream: self.stopper.make_abortable(receiver_stream), + device_name, + node_name, })) } @@ -633,10 +633,10 @@ impl InternalDevicePlugin for ConfigurationDevicePlugin { let receiver_stream = tokio_stream::wrappers::WatchStream::new(receiver); Ok(tonic::Response::new(DeviceUsageStream { - f: config_device_usage_to_device, - st: self.stopper.make_abortable(receiver_stream), - str_1: device_name, - str_2: node_name, + device_usage_to_device: config_device_usage_to_device, + input_stream: 
self.stopper.make_abortable(receiver_stream), + device_name, + node_name, })) } diff --git a/agent/src/plugin_manager/device_plugin_runner.rs b/agent/src/plugin_manager/device_plugin_runner.rs index 6c090a3f3..a6c0da105 100644 --- a/agent/src/plugin_manager/device_plugin_runner.rs +++ b/agent/src/plugin_manager/device_plugin_runner.rs @@ -45,10 +45,10 @@ pub(super) trait InternalDevicePlugin: Sync + Send { } pub(super) struct DeviceUsageStream { - pub f: fn(&str, &str, T) -> Result, - pub st: futures::stream::Abortable>, - pub str_1: String, - pub str_2: String, + pub device_usage_to_device: fn(&str, &str, T) -> Result, + pub input_stream: futures::stream::Abortable>, + pub device_name: String, + pub node_name: String, } impl futures::Stream for DeviceUsageStream { @@ -58,10 +58,13 @@ impl futures::Stream for DeviceUsageStream mut self: std::pin::Pin<&mut Self>, cx: &mut std::task::Context<'_>, ) -> std::task::Poll> { - match self.st.poll_next_unpin(cx) { - std::task::Poll::Ready(Some(i)) => { - std::task::Poll::Ready(Some((self.f)(&self.str_1, &self.str_2, i))) - } + match self.input_stream.poll_next_unpin(cx) { + std::task::Poll::Ready(Some(i)) => std::task::Poll::Ready(Some((self + .device_usage_to_device)( + &self.device_name, + &self.node_name, + i, + ))), std::task::Poll::Ready(None) => { trace!("Stream Stopped"); std::task::Poll::Ready(None) From 488f9bf743bf1ef927348d7dbbfbaae1d910440d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 12 Jul 2024 17:06:38 +0000 Subject: [PATCH 8/8] Update minor version Signed-off-by: github-actions[bot] --- Cargo.lock | 28 +++++++++---------- agent/Cargo.toml | 2 +- controller/Cargo.toml | 2 +- deployment/helm/Chart.yaml | 4 +-- .../debug-echo-discovery-handler/Cargo.toml | 2 +- .../onvif-discovery-handler/Cargo.toml | 2 +- .../opcua-discovery-handler/Cargo.toml | 2 +- .../udev-discovery-handler/Cargo.toml | 2 +- discovery-handlers/debug-echo/Cargo.toml | 2 +- discovery-handlers/onvif/Cargo.toml | 2 +- 
discovery-handlers/opcua/Cargo.toml | 2 +- discovery-handlers/udev/Cargo.toml | 2 +- discovery-utils/Cargo.toml | 2 +- samples/brokers/udev-video-broker/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- version.txt | 2 +- webhooks/validating/configuration/Cargo.toml | 2 +- 17 files changed, 31 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20d85662b..24f61e233 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -222,7 +222,7 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "agent" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-debug-echo", "akri-discovery-utils", @@ -285,7 +285,7 @@ dependencies = [ [[package]] name = "akri-debug-echo" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "akri-shared", @@ -303,7 +303,7 @@ dependencies = [ [[package]] name = "akri-discovery-utils" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-shared", "anyhow", @@ -325,7 +325,7 @@ dependencies = [ [[package]] name = "akri-onvif" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "anyhow", @@ -354,7 +354,7 @@ dependencies = [ [[package]] name = "akri-opcua" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "anyhow", @@ -373,7 +373,7 @@ dependencies = [ [[package]] name = "akri-shared" -version = "0.12.20" +version = "0.13.0" dependencies = [ "anyhow", "async-trait", @@ -398,7 +398,7 @@ dependencies = [ [[package]] name = "akri-udev" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "anyhow", @@ -957,7 +957,7 @@ dependencies = [ [[package]] name = "controller" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-shared", "anyhow", @@ -1131,7 +1131,7 @@ checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5" [[package]] name = "debug-echo-discovery-handler" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-debug-echo", 
"akri-discovery-utils", @@ -2416,7 +2416,7 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "onvif-discovery-handler" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "akri-onvif", @@ -2465,7 +2465,7 @@ dependencies = [ [[package]] name = "opcua-discovery-handler" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "akri-opcua", @@ -3936,7 +3936,7 @@ dependencies = [ [[package]] name = "udev-discovery-handler" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-discovery-utils", "akri-udev", @@ -3947,7 +3947,7 @@ dependencies = [ [[package]] name = "udev-video-broker" -version = "0.12.20" +version = "0.13.0" dependencies = [ "akri-shared", "env_logger", @@ -4210,7 +4210,7 @@ dependencies = [ [[package]] name = "webhook-configuration" -version = "0.12.20" +version = "0.13.0" dependencies = [ "actix-rt", "actix-web", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 16efa261b..805a821ad 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "agent" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring ", ""] edition = "2021" diff --git a/controller/Cargo.toml b/controller/Cargo.toml index ed3201797..eb9a7e763 100644 --- a/controller/Cargo.toml +++ b/controller/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "controller" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["", ""] edition = "2021" diff --git a/deployment/helm/Chart.yaml b/deployment/helm/Chart.yaml index 84fc0569a..1e8d0f7d6 100644 --- a/deployment/helm/Chart.yaml +++ b/deployment/helm/Chart.yaml @@ -16,9 +16,9 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.12.20 +version: 0.13.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. -appVersion: 0.12.20 +appVersion: 0.13.0 diff --git a/discovery-handler-modules/debug-echo-discovery-handler/Cargo.toml b/discovery-handler-modules/debug-echo-discovery-handler/Cargo.toml index 162e43801..07b9c39f5 100644 --- a/discovery-handler-modules/debug-echo-discovery-handler/Cargo.toml +++ b/discovery-handler-modules/debug-echo-discovery-handler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "debug-echo-discovery-handler" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handler-modules/onvif-discovery-handler/Cargo.toml b/discovery-handler-modules/onvif-discovery-handler/Cargo.toml index 58bf17fc4..90845bfd7 100644 --- a/discovery-handler-modules/onvif-discovery-handler/Cargo.toml +++ b/discovery-handler-modules/onvif-discovery-handler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "onvif-discovery-handler" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handler-modules/opcua-discovery-handler/Cargo.toml b/discovery-handler-modules/opcua-discovery-handler/Cargo.toml index 8fd8eb290..6f5dd77fd 100644 --- a/discovery-handler-modules/opcua-discovery-handler/Cargo.toml +++ b/discovery-handler-modules/opcua-discovery-handler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "opcua-discovery-handler" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handler-modules/udev-discovery-handler/Cargo.toml 
b/discovery-handler-modules/udev-discovery-handler/Cargo.toml index 79d645d57..a0773ccb2 100644 --- a/discovery-handler-modules/udev-discovery-handler/Cargo.toml +++ b/discovery-handler-modules/udev-discovery-handler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "udev-discovery-handler" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handlers/debug-echo/Cargo.toml b/discovery-handlers/debug-echo/Cargo.toml index 996188ec7..108ae66f9 100644 --- a/discovery-handlers/debug-echo/Cargo.toml +++ b/discovery-handlers/debug-echo/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akri-debug-echo" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handlers/onvif/Cargo.toml b/discovery-handlers/onvif/Cargo.toml index 8ab6303d4..a2551aee3 100644 --- a/discovery-handlers/onvif/Cargo.toml +++ b/discovery-handlers/onvif/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akri-onvif" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handlers/opcua/Cargo.toml b/discovery-handlers/opcua/Cargo.toml index 56238db87..c0e821519 100644 --- a/discovery-handlers/opcua/Cargo.toml +++ b/discovery-handlers/opcua/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akri-opcua" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-handlers/udev/Cargo.toml b/discovery-handlers/udev/Cargo.toml index 00b8af9cf..537d31ecd 100644 --- a/discovery-handlers/udev/Cargo.toml +++ b/discovery-handlers/udev/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akri-udev" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/discovery-utils/Cargo.toml b/discovery-utils/Cargo.toml index 88fa43250..7ec5b3723 100644 --- 
a/discovery-utils/Cargo.toml +++ b/discovery-utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akri-discovery-utils" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring "] edition = "2021" diff --git a/samples/brokers/udev-video-broker/Cargo.toml b/samples/brokers/udev-video-broker/Cargo.toml index c36abf86b..e208503cf 100644 --- a/samples/brokers/udev-video-broker/Cargo.toml +++ b/samples/brokers/udev-video-broker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "udev-video-broker" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["Kate Goldenring ", ""] edition = "2021" diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 08650dd12..3f68773be 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "akri-shared" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = [""] edition = "2021" diff --git a/version.txt b/version.txt index c018e4506..54d1a4f2a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.12.20 +0.13.0 diff --git a/webhooks/validating/configuration/Cargo.toml b/webhooks/validating/configuration/Cargo.toml index 71c78b233..dd43976ef 100644 --- a/webhooks/validating/configuration/Cargo.toml +++ b/webhooks/validating/configuration/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "webhook-configuration" -version = "0.12.20" +version = "0.13.0" license = "Apache-2.0" authors = ["DazWilkin "] edition = "2021"