From 376bf9d124ce271c28a81bb387b3ac8deafd1487 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Sat, 3 Aug 2024 11:46:45 +1000 Subject: [PATCH 01/13] Add TiKV: Add necessary features/modules --- Cargo.lock | 483 +++++++++++++++++++++---- README.md | 2 +- crates/main/Cargo.toml | 5 +- crates/store/Cargo.toml | 2 + crates/store/src/backend/mod.rs | 2 + crates/store/src/backend/tikv/blob.rs | 27 ++ crates/store/src/backend/tikv/main.rs | 40 ++ crates/store/src/backend/tikv/mod.rs | 84 +++++ crates/store/src/backend/tikv/read.rs | 73 ++++ crates/store/src/backend/tikv/write.rs | 49 +++ crates/store/src/dispatch/blob.rs | 6 + crates/store/src/dispatch/mod.rs | 2 + crates/store/src/dispatch/store.rs | 22 ++ crates/store/src/lib.rs | 14 + crates/trc/src/imple.rs | 2 + crates/trc/src/lib.rs | 1 + 16 files changed, 746 insertions(+), 68 deletions(-) create mode 100644 crates/store/src/backend/tikv/blob.rs create mode 100644 crates/store/src/backend/tikv/main.rs create mode 100644 crates/store/src/backend/tikv/mod.rs create mode 100644 crates/store/src/backend/tikv/read.rs create mode 100644 crates/store/src/backend/tikv/write.rs diff --git a/Cargo.lock b/Cargo.lock index 9502cef9c..aac00d7cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,7 +74,7 @@ version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", ] @@ -86,7 +86,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", - "getrandom", + "getrandom 0.2.15", "once_cell", "serde", "version_check", @@ -317,6 +317,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-recursion" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7d78656ba01f1b93024b7c3a0467f1608e4be67d725749fdcd7d2c7678fd7a2" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "async-recursion" version = "1.1.1" @@ -414,6 +425,34 @@ dependencies = [ "thiserror", ] +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core 0.3.4", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.30", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper 0.1.2", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "axum" version = "0.7.5" @@ -421,7 +460,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" dependencies = [ "async-trait", - "axum-core", + "axum-core 0.4.3", "bytes", "futures-util", "http 1.1.0", @@ -441,6 +480,23 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 0.2.12", + "http-body 0.4.6", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + [[package]] name = "axum-core" version = "0.4.3" @@ -1108,7 +1164,7 
@@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -1255,7 +1311,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ "generic-array 0.14.7", - "rand_core", + "rand_core 0.6.4", "subtle", "zeroize", ] @@ -1267,7 +1323,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array 0.14.7", - "rand_core", + "rand_core 0.6.4", "typenum", ] @@ -1468,7 +1524,7 @@ checksum = "1ab8a4ea925ce79678034870834602a2980f4b88c09e97feb266496dbb4493d2" dependencies = [ "async-trait", "deadpool 0.12.1", - "getrandom", + "getrandom 0.2.15", "tokio", "tokio-postgres", "tracing", @@ -1549,6 +1605,17 @@ dependencies = [ "serde", ] +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "derive_arbitrary" version = "1.3.2" @@ -1812,7 +1879,7 @@ checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" dependencies = [ "curve25519-dalek", "ed25519", - "rand_core", + "rand_core 0.6.4", "serde", "sha2 0.10.8", "subtle", @@ -1860,7 +1927,7 @@ dependencies = [ "hkdf", "pem-rfc7468", "pkcs8", - "rand_core", + "rand_core 0.6.4", "sec1", "subtle", "zeroize", @@ -1939,6 +2006,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "fail" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3c61c59fdc91f5dbc3ea31ee8623122ce80057058be560654c5d410d181a6" +dependencies = [ + "lazy_static", + "log", + "rand 0.7.3", +] + [[package]] name = "fallible-iterator" version = "0.2.0" @@ -1980,13 +2058,19 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + [[package]] name = "ff" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" dependencies = [ - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -2065,14 +2149,14 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "020bf4ae7238dbdb1ff01e9f981db028515cf66883c461e29faedfea130b2728" dependencies = [ - "async-recursion", + "async-recursion 1.1.1", "async-trait", "foundationdb-gen", "foundationdb-macros", "foundationdb-sys", "futures", "memchr", - "rand", + "rand 0.8.5", "serde", "serde_bytes", "serde_json", @@ -2297,6 +2381,17 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -2306,7 
+2401,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2339,7 +2434,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" dependencies = [ "ff", - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -2446,7 +2541,7 @@ dependencies = [ "hickory-proto", "once_cell", "radix_trie", - "rand", + "rand 0.8.5", "rustls 0.21.12", "thiserror", "tokio", @@ -2472,7 +2567,7 @@ dependencies = [ "idna 0.4.0", "ipnet", "once_cell", - "rand", + "rand 0.8.5", "ring 0.16.20", "rustls 0.21.12", "rustls-pemfile 1.0.4", @@ -2497,7 +2592,7 @@ dependencies = [ "lru-cache", "once_cell", "parking_lot", - "rand", + "rand 0.8.5", "resolv-conf", "rustls 0.21.12", "smallvec", @@ -2717,6 +2812,18 @@ dependencies = [ "webpki-roots 0.26.3", ] +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper 0.14.30", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + [[package]] name = "hyper-timeout" version = "0.5.1" @@ -2730,6 +2837,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.4.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.6" @@ -2960,7 +3083,7 @@ dependencies = [ "md5", "nlp", "parking_lot", - "rand", + "rand 0.8.5", "rustls 0.23.12", "rustls-pemfile 2.1.2", "store", @@ -3190,7 +3313,7 @@ dependencies = [ "p256", "pkcs8", "quick-xml 0.35.0", - "rand", + "rand 0.8.5", "rasn", "rasn-cms", "rasn-pkix", @@ -3548,7 +3671,7 @@ dependencies = [ "mail-parser", "parking_lot", "quick-xml 0.32.0", - "rand", + "rand 0.8.5", "ring 0.17.8", "rsa", "rustls-pemfile 2.1.2", @@ -3754,7 +3877,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.48.0", ] @@ -3766,7 +3889,7 @@ checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" dependencies = [ "hermit-abi", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -3809,7 +3932,7 @@ dependencies = [ "pem", "percent-encoding", "pin-project", - "rand", + "rand 0.8.5", "rustls 0.22.4", "rustls-pemfile 2.1.2", "serde", @@ -3848,7 +3971,7 @@ dependencies = [ "mysql-common-derive", "num-bigint", "num-traits", - "rand", + "rand 0.8.5", "regex", "rust_decimal", "saturating", @@ -3864,6 +3987,23 @@ dependencies = [ "zstd", ] +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -3953,7 +4093,7 @@ dependencies = [ "num-integer", "num-iter", "num-traits", - "rand", + "rand 0.8.5", "smallvec", "zeroize", ] @@ -4133,11 +4273,11 @@ dependencies = [ "opentelemetry-http", 
"opentelemetry-proto", "opentelemetry_sdk", - "prost", + "prost 0.13.1", "reqwest 0.12.5", "thiserror", "tokio", - "tonic", + "tonic 0.12.1", ] [[package]] @@ -4148,8 +4288,8 @@ checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost", - "tonic", + "prost 0.13.1", + "tonic 0.12.1", ] [[package]] @@ -4172,7 +4312,7 @@ dependencies = [ "once_cell", "opentelemetry", "percent-encoding", - "rand", + "rand 0.8.5", "serde_json", "thiserror", ] @@ -4221,7 +4361,7 @@ dependencies = [ "ecdsa", "elliptic-curve", "primeorder", - "rand_core", + "rand_core 0.6.4", "sha2 0.10.8", ] @@ -4255,7 +4395,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166" dependencies = [ "base64ct", - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -4339,7 +4479,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ "phf_shared 0.11.2", - "rand", + "rand 0.8.5", ] [[package]] @@ -4481,7 +4621,7 @@ dependencies = [ "hmac 0.12.1", "md-5 0.10.6", "memchr", - "rand", + "rand 0.8.5", "sha2 0.10.8", "stringprep", ] @@ -4603,6 +4743,57 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.6.0", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.6.0", + "hex", +] + +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "libc", + "memchr", + "parking_lot", + "procfs", + "protobuf", + "reqwest 0.12.5", + "thiserror", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive 0.12.6", +] + [[package]] name = "prost" version = "0.13.1" @@ -4610,7 +4801,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e13db3d3fde688c61e2446b4d843bc27a7e8af269a69440c0308021dc92333cc" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.13.1", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools 0.12.1", + "proc-macro2", + "quote", + "syn 2.0.72", ] [[package]] @@ -4626,6 +4830,12 @@ dependencies = [ "syn 2.0.72", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + [[package]] name = "proxy-header" version = "0.1.2" @@ -4666,7 +4876,7 @@ dependencies = [ "byteorder", "hmac 0.10.1", "md-5 0.9.1", - "rand", + "rand 0.8.5", "sha-1", "sha2 0.9.9", ] @@ -4720,7 +4930,7 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" dependencies = [ "bytes", - "rand", + "rand 0.8.5", "ring 0.17.8", "rustc-hash", "rustls 0.23.12", @@ -4778,6 +4988,19 @@ dependencies = [ "nibble_vec", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -4785,8 +5008,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", ] [[package]] @@ -4796,7 +5029,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", ] [[package]] @@ -4805,7 +5047,16 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", ] [[package]] @@ -4925,7 +5176,7 @@ dependencies = [ "log", "percent-encoding", "pin-project-lite", - "rand", + "rand 0.8.5", "rustls 0.22.4", "rustls-native-certs 0.7.1", "rustls-pemfile 2.1.2", @@ -4964,7 +5215,7 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" dependencies = [ - "getrandom", + "getrandom 0.2.15", "libredox", "thiserror", ] @@ -5059,6 +5310,7 @@ checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-channel", "futures-core", "futures-util", @@ -5068,12 +5320,14 @@ dependencies = [ "http-body-util", "hyper 1.4.1", "hyper-rustls 0.27.2", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", "mime_guess", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -5085,7 +5339,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper 1.0.1", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tower-service", "url", @@ -5148,7 +5404,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin 0.9.8", "untrusted 0.9.0", @@ -5237,7 +5493,7 @@ dependencies = [ "num-traits", "pkcs1", "pkcs8", - 
"rand_core", + "rand_core 0.6.4", "signature", "spki", "subtle", @@ -5342,7 +5598,7 @@ dependencies = [ "borsh", "bytes", "num-traits", - "rand", + "rand 0.8.5", "rkyv", "serde", "serde_json", @@ -5699,7 +5955,7 @@ dependencies = [ "ecdsa", "ed25519", "ed25519-dalek", - "getrandom", + "getrandom 0.2.15", "idea", "idna 1.0.2", "lalrpop", @@ -5713,8 +5969,8 @@ dependencies = [ "p256", "p384", "p521", - "rand", - "rand_core", + "rand 0.8.5", + "rand_core 0.6.4", "regex", "regex-syntax", "ripemd", @@ -5930,7 +6186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ "digest 0.10.7", - "rand_core", + "rand_core 0.6.4", ] [[package]] @@ -5997,7 +6253,7 @@ dependencies = [ "nlp", "num_cpus", "parking_lot", - "rand", + "rand 0.8.5", "rayon", "regex", "reqwest 0.12.5", @@ -6106,7 +6362,7 @@ dependencies = [ "num_cpus", "prettytable-rs", "pwhash", - "rand", + "rand 0.8.5", "reqwest 0.12.5", "rpassword", "serde", @@ -6146,7 +6402,7 @@ dependencies = [ "num_cpus", "parking_lot", "r2d2", - "rand", + "rand 0.8.5", "rayon", "redis", "regex", @@ -6159,6 +6415,7 @@ dependencies = [ "rustls-pki-types", "serde", "serde_json", + "tikv-client", "tokio", "tokio-postgres", "tokio-rustls 0.26.0", @@ -6315,6 +6572,19 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tempfile" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fcd239983515c23a32fb82099f97d0b11b8c72f654ed659363a95c3dad7a53" +dependencies = [ + "cfg-if", + "fastrand", + "once_cell", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "term" version = "0.7.0" @@ -6406,6 +6676,33 @@ dependencies = [ "syn 2.0.72", ] +[[package]] +name = "tikv-client" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "048968e4e3d04db472346770cc19914c6b5ae206fa44677f6a0874d54cd05940" +dependencies = [ + "async-recursion 0.3.2", + "async-trait", + "derive-new", + "either", + "fail", + "futures", + "lazy_static", + "log", + "pin-project", + "prometheus", + "prost 0.12.6", + "rand 0.8.5", + "regex", + "semver 1.0.23", + "serde", + "serde_derive", + "thiserror", + "tokio", + "tonic 0.10.2", +] + [[package]] name = "time" version = "0.3.36" @@ -6499,6 +6796,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-macros" version = "2.4.0" @@ -6510,6 +6817,16 @@ dependencies = [ "syn 2.0.72", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-postgres" version = "0.7.11" @@ -6529,7 +6846,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "rand", + "rand 0.8.5", "socket2", "tokio", "tokio-util", @@ -6637,6 +6954,36 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum 0.6.20", + "base64 0.21.7", + "bytes", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.30", + "hyper-timeout 0.4.1", + "percent-encoding", + "pin-project", + "prost 0.12.6", + "rustls 0.21.12", + "rustls-pemfile 1.0.4", + "tokio", + "tokio-rustls 0.24.1", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic" version = "0.12.1" @@ -6645,7 +6992,7 @@ checksum = "38659f4a91aba8598d27821589f5db7dddd94601e7a01b1e485a50e5484c7401" dependencies = [ "async-stream", "async-trait", - "axum", + "axum 0.7.5", "base64 0.22.1", "bytes", "h2 0.4.5", @@ -6653,11 +7000,11 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.4.1", - "hyper-timeout", + "hyper-timeout 0.5.1", "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.1", "socket2", "tokio", "tokio-stream", @@ -6693,7 +7040,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -6793,7 +7140,7 @@ dependencies = [ "http 1.1.0", "httparse", "log", - "rand", + "rand 0.8.5", "rustls 0.22.4", "rustls-pki-types", "sha1", @@ -6814,7 +7161,7 @@ dependencies = [ "http 1.1.0", "httparse", "log", - "rand", + "rand 0.8.5", "sha1", "thiserror", "utf-8", @@ -6836,7 +7183,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ "cfg-if", - "rand", + "rand 0.8.5", "static_assertions", ] @@ -7006,7 +7353,7 @@ dependencies = [ "parking_lot", "pem", "privdrop", - "rand", + "rand 0.8.5", "rcgen 0.13.1", "regex", "reqwest 0.12.5", @@ -7030,7 +7377,7 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -7070,6 +7417,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -7481,7 +7834,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7e468321c81fb07fa7f4c636c3972b9100f0346e5b6a9f2bd0603a52f7ed277" dependencies = [ "curve25519-dalek", - "rand_core", + "rand_core 0.6.4", "zeroize", ] @@ -7688,7 +8041,7 @@ dependencies = [ "lzma-rs", "memchr", "pbkdf2", - "rand", + "rand 0.8.5", "sha1", "thiserror", "time", diff --git a/README.md b/README.md index 56f6c331b..68ec2d142 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Key features: - **Greylisting** to temporarily defer unknown senders. - **Spam traps** to set up decoy email addresses that catch and analyze spam. - **Flexible and scalable**: - - Pluggable storage backends with **RocksDB**, **FoundationDB**, **PostgreSQL**, **mySQL**, **SQLite**, **S3-Compatible**, **Redis** and **ElasticSearch** support. + - Pluggable storage backends with **RocksDB**, **FoundationDB**, **FoundationDB**, **TiKV**, **PostgreSQL**, **mySQL**, **SQLite**, **S3-Compatible**, **Redis** and **ElasticSearch** support. - **Clustering** support with node autodiscovery and partition-tolerant failure detection. 
- Built-in, **LDAP** or **SQL** authentication backend support. - Full-text search available in 17 languages. diff --git a/crates/main/Cargo.toml b/crates/main/Cargo.toml index 2288c837b..a6bddb8b6 100644 --- a/crates/main/Cargo.toml +++ b/crates/main/Cargo.toml @@ -34,13 +34,14 @@ tokio = { version = "1.23", features = ["full"] } jemallocator = "0.5.0" [features] -default = ["sqlite", "postgres", "mysql", "rocks", "elastic", "s3", "redis", "enterprise"] -#default = ["sqlite", "postgres", "mysql", "rocks", "elastic", "s3", "redis", "foundationdb", "enterprise"] +default = ["sqlite", "postgres", "mysql", "rocks", "tikv", "elastic", "s3", "redis", "enterprise"] +#default = ["sqlite", "postgres", "mysql", "rocks", "tikv", "elastic", "s3", "redis", "foundationdb", "enterprise"] sqlite = ["store/sqlite"] foundationdb = ["store/foundation", "common/foundation"] postgres = ["store/postgres"] mysql = ["store/mysql"] rocks = ["store/rocks"] +tikv = ["store/tikv"] elastic = ["store/elastic"] s3 = ["store/s3"] redis = ["store/redis"] diff --git a/crates/store/Cargo.toml b/crates/store/Cargo.toml index e41d6e958..0f615f2b2 100644 --- a/crates/store/Cargo.toml +++ b/crates/store/Cargo.toml @@ -10,6 +10,7 @@ nlp = { path = "../nlp" } trc = { path = "../trc" } rocksdb = { version = "0.22", optional = true, features = ["multi-threaded-cf"] } foundationdb = { version = "0.9.0", features = ["embedded-fdb-include", "fdb-7_1"], optional = true } +tikv-client = { version = "0.3.0", optional = true } rusqlite = { version = "0.31.0", features = ["bundled"], optional = true } rust-s3 = { version = "=0.35.0-alpha.2", default-features = false, features = ["tokio-rustls-tls", "no-verify-ssl"], optional = true } tokio = { version = "1.23", features = ["sync", "fs", "io-util"] } @@ -58,6 +59,7 @@ elastic = ["elasticsearch", "serde_json"] mysql = ["mysql_async", "futures"] s3 = ["rust-s3"] foundation = ["foundationdb", "futures"] +tikv = ["tikv-client"] fdb-chunked-bm = [] redis = ["dep:redis", "deadpool"] diff --git a/crates/store/src/backend/mod.rs b/crates/store/src/backend/mod.rs index 7ba2aaa18..152935175 100644 --- a/crates/store/src/backend/mod.rs +++ b/crates/store/src/backend/mod.rs @@ -22,6 +22,8 @@ pub mod rocksdb; pub mod s3; #[cfg(feature = "sqlite")] pub mod sqlite; +#[cfg(feature = "tikv")] +pub mod tikv; pub const MAX_TOKEN_LENGTH: usize = (u8::MAX >> 1) as usize; pub const MAX_TOKEN_MASK: usize = MAX_TOKEN_LENGTH - 1; diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs new file mode 100644 index 000000000..f1ef32bbb --- /dev/null +++ b/crates/store/src/backend/tikv/blob.rs @@ -0,0 +1,27 @@ +/* + * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * + * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL + */ + +use std::ops::Range; + +use super::TikvStore; + +impl TikvStore { + pub(crate) async fn get_blob( + &self, + key: &[u8], + range: Range<usize>, + ) -> trc::Result<Option<Vec<u8>>> { + todo!() + } + + pub(crate) async fn put_blob(&self, key: &[u8], data: &[u8]) -> trc::Result<()> { + todo!() + } + + pub(crate) async fn delete_blob(&self, key: &[u8]) -> trc::Result<bool> { + todo!() + } +} \ No newline at end of file diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs new file mode 100644 index 000000000..a658fad5f --- /dev/null +++ b/crates/store/src/backend/tikv/main.rs @@ -0,0 +1,40 @@ +/* + * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * + * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL + */ + +use std::net::SocketAddr; +
+use tikv_client::TransactionClient; +use utils::config::{utils::AsKey, Config}; + +use super::TikvStore; + +impl TikvStore { + pub async fn open(config: &mut Config, prefix: impl AsKey) -> Option<Self> { + + let prefix = prefix.as_key(); + + // Parse as SocketAddr but don't use it. TransactionClient takes only a String vector + let pd_endpoints = config.properties::<SocketAddr>((&prefix, "pd-endpoints")) + .into_iter() + .map(|(addr_str, _socket_addr)| addr_str) + .collect(); + + let client = TransactionClient::new(pd_endpoints) + .await + .map_err(|err| { + config.new_build_error( + prefix.as_str(), + format!("Failed to create TiKV database: {err:?}"), + ) + }) + .ok()?; + + Some(Self { + client, + version: Default::default(), + }) + } +} diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs new file mode 100644 index 000000000..23acc9a6f --- /dev/null +++ b/crates/store/src/backend/tikv/mod.rs @@ -0,0 +1,84 @@ +/* + * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * + * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL + */ + +use std::time::{Duration, Instant}; + +//use foundationdb::{api::NetworkAutoStop, Database, FdbError, Transaction}; +use tikv_client::{TransactionClient, Transaction, Error as TikvError}; + +pub mod blob; +pub mod main; +pub mod read; +pub mod write; + +const MAX_VALUE_SIZE: usize = 100000; +const MAX_KEYS: u32 = 100000; +const MAX_KV_PAIRS: u32 = 50000; +pub const TRANSACTION_EXPIRY: Duration = Duration::from_secs(1); +pub const TRANSACTION_TIMEOUT: Duration = Duration::from_secs(4); + +#[allow(dead_code)] +pub struct TikvStore { + client: TransactionClient, + version: parking_lot::Mutex<ReadVersion>, +} + +pub(crate) struct TimedTransaction { + trx: Transaction, + expires: Instant, +} + +pub(crate) struct ReadVersion { + version: i64, + expires: Instant, +} + +impl ReadVersion { + pub fn new(version: i64) -> Self { + Self { + version, + expires: Instant::now() + TRANSACTION_EXPIRY, + } + } + + pub fn is_expired(&self) -> bool { + self.expires < Instant::now() + } +} + +impl Default for ReadVersion { + fn default() -> Self { + Self { + version: 0, + expires: Instant::now(), + } + } +} + +impl AsRef<Transaction> for TimedTransaction { + fn as_ref(&self) -> &Transaction { + &self.trx + } +} + +impl TimedTransaction { + pub fn new(trx: Transaction) -> Self { + Self { + trx, + expires: Instant::now() + TRANSACTION_TIMEOUT, + } + } + + pub fn is_expired(&self) -> bool { + self.expires < Instant::now() + } +} + +#[inline(always)] +fn into_error(error: TikvError) -> trc::Error { + trc::StoreEvent::FoundationdbError + .reason(error.to_string()) +} diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs new file mode 100644 index 000000000..0676c0bf2 --- /dev/null +++ b/crates/store/src/backend/tikv/read.rs @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * + * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL + */ + +use tikv_client::{Transaction, Value}; +use futures::TryStreamExt; +use roaring::RoaringBitmap; +use crate::{ + backend::deserialize_i64_le, + write::{ + key::{DeserializeBigEndian, KeySerializer}, + BitmapClass, ValueClass, + }, + BitmapKey, Deserialize, IterateParams, Key, ValueKey, U32_LEN, WITH_SUBSPACE, +}; + +use super::{into_error, TikvStore}; + +#[allow(dead_code)] +pub(crate) enum ChunkedValue { + Single(Value), + Chunked { n_chunks: u8, bytes: Vec<u8> }, + None, +} + +impl TikvStore { + pub(crate) async fn get_value<U>(&self, key: impl Key) -> trc::Result<Option<U>> + where + U: Deserialize, + {
todo!() + } + + pub(crate) async fn get_bitmap( + &self, + mut key: BitmapKey<BitmapClass<u32>>, + ) -> trc::Result<Option<RoaringBitmap>> { + todo!() + } + + pub(crate) async fn iterate<T: Key>( + &self, + params: IterateParams<T>, + mut cb: impl for<'x> FnMut(&'x [u8], &'x [u8]) -> trc::Result<bool> + Sync + Send, + ) -> trc::Result<()> { + todo!() + } + + pub(crate) async fn get_counter( + &self, + key: impl Into<ValueKey<ValueClass<u32>>> + Sync + Send, + ) -> trc::Result<i64> { + todo!() + } + + pub(crate) async fn read_trx(&self) -> trc::Result<Transaction> { + todo!() + } + + pub(crate) async fn timed_read_trx(&self) -> trc::Result<TimedTransaction> { + todo!() + } +} + +pub(crate) async fn read_chunked_value( + key: &[u8], + trx: &Transaction, + snapshot: bool, +) -> trc::Result<ChunkedValue> { + todo!() +} diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs new file mode 100644 index 000000000..9f065647a --- /dev/null +++ b/crates/store/src/backend/tikv/write.rs @@ -0,0 +1,49 @@ +/* + * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * + * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL + */ + +use std::{ + cmp::Ordering, + time::{Duration, Instant}, +}; + +use tikv_client::{ + Transaction +}; +use rand::Rng; +use roaring::RoaringBitmap; + +use crate::{ + backend::deserialize_i64_le, + write::{ + key::{DeserializeBigEndian, KeySerializer}, + AssignedIds, Batch, BitmapClass, Operation, RandomAvailableId, ValueOp, + MAX_COMMIT_ATTEMPTS, MAX_COMMIT_TIME, + }, + BitmapKey, IndexKey, Key, LogKey, SUBSPACE_COUNTER, SUBSPACE_QUOTA, U32_LEN, WITH_SUBSPACE, +}; + +use super::{ + into_error, + read::{read_chunked_value, ChunkedValue}, + TikvStore, ReadVersion, MAX_VALUE_SIZE, +}; + +impl TikvStore { + pub(crate) async fn write(&self, batch: Batch) -> trc::Result<AssignedIds> { + todo!() + } + + pub(crate) async fn commit(&self, trx: Transaction, will_retry: bool) -> trc::Result<bool> { + todo!() + } + pub(crate) async fn purge_store(&self) -> trc::Result<()> { + todo!() + } + + pub(crate) async fn delete_range(&self, from: impl Key, to: impl Key) -> trc::Result<()> { + todo!() + } +} diff --git a/crates/store/src/dispatch/blob.rs b/crates/store/src/dispatch/blob.rs index cc2d24cd2..7cd425827 100644 --- a/crates/store/src/dispatch/blob.rs +++ b/crates/store/src/dispatch/blob.rs @@ -30,6 +30,8 @@ impl BlobStore { Store::MySQL(store) => store.get_blob(key, read_range).await, #[cfg(feature = "rocks")] Store::RocksDb(store) => store.get_blob(key, read_range).await, + #[cfg(feature = "tikv")] + Store::TiKV(store) => store.get_blob(key, read_range).await, Store::None => Err(trc::StoreEvent::NotConfigured.into()), }, BlobBackend::Fs(store) => store.get_blob(key, read_range).await, @@ -96,6 +98,8 @@ impl BlobStore { Store::MySQL(store) => store.put_blob(key, data.as_ref()).await, #[cfg(feature = "rocks")] Store::RocksDb(store) => store.put_blob(key, data.as_ref()).await, + #[cfg(feature = "tikv")] + Store::TiKV(store) => store.put_blob(key, data.as_ref()).await, Store::None => Err(trc::StoreEvent::NotConfigured.into()), }, BlobBackend::Fs(store) => store.put_blob(key, data.as_ref()).await, @@ -118,6 +122,8 @@ impl BlobStore { Store::MySQL(store) => store.delete_blob(key).await, #[cfg(feature = "rocks")] Store::RocksDb(store) => store.delete_blob(key).await, + #[cfg(feature = "tikv")] + Store::TiKV(store) => store.delete_blob(key).await, Store::None => Err(trc::StoreEvent::NotConfigured.into()), }, BlobBackend::Fs(store) => store.delete_blob(key).await, diff --git a/crates/store/src/dispatch/mod.rs b/crates/store/src/dispatch/mod.rs index efd03b53d..3eca8848c 100644 ---
a/crates/store/src/dispatch/mod.rs +++ b/crates/store/src/dispatch/mod.rs @@ -26,6 +26,8 @@ impl Store { Self::MySQL(_) => "mysql", #[cfg(feature = "rocks")] Self::RocksDb(_) => "rocksdb", + #[cfg(feature = "tikv")] + Self::TiKV(_) => "tikv", Self::None => "none", } } diff --git a/crates/store/src/dispatch/store.rs b/crates/store/src/dispatch/store.rs index 1588a9655..ca3aae3c7 100644 --- a/crates/store/src/dispatch/store.rs +++ b/crates/store/src/dispatch/store.rs @@ -43,6 +43,8 @@ impl Store { Self::MySQL(store) => store.get_value(key).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.get_value(key).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.get_value(key).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -63,6 +65,8 @@ impl Store { Self::MySQL(store) => store.get_bitmap(key).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.get_bitmap(key).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.get_bitmap(key).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -106,6 +110,8 @@ impl Store { Self::MySQL(store) => store.iterate(params, cb).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.iterate(params, cb).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.iterate(params, cb).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -126,6 +132,8 @@ impl Store { Self::MySQL(store) => store.get_counter(key).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.get_counter(key).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.get_counter(key).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -189,6 +197,8 @@ impl Store { Self::MySQL(store) => store.write(batch).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.write(batch).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.write(batch).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!())?; @@ -231,6 +241,8 @@ impl Store { Self::MySQL(store) => store.write(batch).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.write(batch).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.write(batch).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } } @@ -277,6 +289,8 @@ impl Store { Self::MySQL(store) => store.purge_store().await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.purge_store().await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.purge_store().await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -294,6 +308,8 @@ impl Store { Self::MySQL(store) => store.delete_range(from, to).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.delete_range(from, to).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.delete_range(from, to).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -447,6 +463,8 @@ impl Store { Self::MySQL(store) => store.get_blob(key, range).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.get_blob(key, range).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.get_blob(key, range).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -464,6 +482,8 @@ impl Store { Self::MySQL(store) => store.put_blob(key, data).await, #[cfg(feature = 
"rocks")] Self::RocksDb(store) => store.put_blob(key, data).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.put_blob(key, data).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) @@ -481,6 +501,8 @@ impl Store { Self::MySQL(store) => store.delete_blob(key).await, #[cfg(feature = "rocks")] Self::RocksDb(store) => store.delete_blob(key).await, + #[cfg(feature = "tikv")] + Self::TiKV(store) => store.delete_blob(key).await, Self::None => Err(trc::StoreEvent::NotConfigured.into()), } .caused_by(trc::location!()) diff --git a/crates/store/src/lib.rs b/crates/store/src/lib.rs index 89a444f30..2bafcc33c 100644 --- a/crates/store/src/lib.rs +++ b/crates/store/src/lib.rs @@ -46,6 +46,9 @@ use backend::elastic::ElasticSearchStore; #[cfg(feature = "redis")] use backend::redis::RedisStore; +#[cfg(feature = "tikv")] +use backend::tikv::TikvStore; + pub trait Deserialize: Sized + Sync + Send { fn deserialize(bytes: &[u8]) -> trc::Result; } @@ -183,6 +186,8 @@ pub enum Store { MySQL(Arc), #[cfg(feature = "rocks")] RocksDb(Arc), + #[cfg(feature = "tikv")] + TiKV(Arc), #[default] None, } @@ -263,6 +268,13 @@ impl From for Store { } } +#[cfg(feature = "tikv")] +impl From for Store { + fn from(store: TikvStore) -> Self { + Self::TiKV(Arc::new(store)) + } +} + impl From for BlobStore { fn from(store: FsStore) -> Self { BlobStore { @@ -682,6 +694,8 @@ impl std::fmt::Debug for Store { Self::MySQL(_) => f.debug_tuple("MySQL").finish(), #[cfg(feature = "rocks")] Self::RocksDb(_) => f.debug_tuple("RocksDb").finish(), + #[cfg(feature = "tikv")] + Self::TiKV(_) => f.debug_tuple("TiKV").finish(), Self::None => f.debug_tuple("None").finish(), } } diff --git a/crates/trc/src/imple.rs b/crates/trc/src/imple.rs index f8d653078..1b7e8996e 100644 --- a/crates/trc/src/imple.rs +++ b/crates/trc/src/imple.rs @@ -863,6 +863,7 @@ impl EventType { | StoreEvent::PostgresqlError | StoreEvent::RocksdbError | StoreEvent::SqliteError + | StoreEvent::TikvError | StoreEvent::LdapError | StoreEvent::ElasticsearchError | StoreEvent::RedisError @@ -2077,6 +2078,7 @@ impl StoreEvent { StoreEvent::PostgresqlError => "PostgreSQL error", StoreEvent::RocksdbError => "RocksDB error", StoreEvent::SqliteError => "SQLite error", + StoreEvent::TikvError => "TiKV error", StoreEvent::LdapError => "LDAP error", StoreEvent::ElasticsearchError => "ElasticSearch error", StoreEvent::RedisError => "Redis error", diff --git a/crates/trc/src/lib.rs b/crates/trc/src/lib.rs index be3aadaad..ea8e6ec29 100644 --- a/crates/trc/src/lib.rs +++ b/crates/trc/src/lib.rs @@ -822,6 +822,7 @@ pub enum StoreEvent { PostgresqlError, RocksdbError, SqliteError, + TikvError, LdapError, ElasticsearchError, RedisError, From 019b2ba5a22d3082b2cee8f98b8dbe36cbcf1a02 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Sat, 3 Aug 2024 13:53:44 +1000 Subject: [PATCH 02/13] Add TiKV: Work on read logic --- crates/store/src/backend/tikv/blob.rs | 68 ++++++++++++++++- crates/store/src/backend/tikv/main.rs | 5 +- crates/store/src/backend/tikv/mod.rs | 38 ++++++++- crates/store/src/backend/tikv/read.rs | 106 +++++++++++++++++++++++--- crates/store/src/config.rs | 25 ++++++ resources/config/config.toml | 39 +++++++--- 6 files changed, 251 insertions(+), 30 deletions(-) diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index f1ef32bbb..22c14f26d 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -5,8 +5,9 @@ */ use std::ops::Range; - -use 
super::TikvStore; +use crate::SUBSPACE_BLOBS; +use crate::write::key::KeySerializer; +use super::{into_error, MAX_KV_PAIRS, MAX_VALUE_SIZE, TikvStore}; impl TikvStore { pub(crate) async fn get_blob( @@ -14,7 +15,68 @@ key: &[u8], range: Range<usize>, ) -> trc::Result<Option<Vec<u8>>> { - todo!() + let block_start = range.start / MAX_VALUE_SIZE; + let bytes_start = range.start % MAX_VALUE_SIZE; + let block_end = (range.end / MAX_VALUE_SIZE) + 1; + + let begin = KeySerializer::new(key.len() + 3) + .write(SUBSPACE_BLOBS) + .write(key) + .write(block_start as u16) + .finalize(); + let end = KeySerializer::new(key.len() + 3) + .write(SUBSPACE_BLOBS) + .write(key) + .write(block_end as u16) + .finalize(); + let key_len = begin.len(); + let mut trx = self.snapshot_trx().await?; + // TODO: Create repeat logic for over max + let mut values = trx.scan((begin, end), MAX_KV_PAIRS).await.map_err(into_error)?; + let mut blob_data: Option<Vec<u8>> = None; + let blob_range = range.end - range.start; + + 'outer: while let Some(kv_pair) = values.next() { + let key = kv_pair.0; + if key.len() == key_len { + let value = kv_pair.1; + if let Some(blob_data) = &mut blob_data { + blob_data.extend_from_slice( + value + .get( + ..std::cmp::min( + blob_range.saturating_sub(blob_data.len()), + value.len(), + ), + ) + .unwrap_or(&[]), + ); + if blob_data.len() == blob_range { + break 'outer; + } + } else { + let blob_size = if blob_range <= (5 * (1 << 20)) { + blob_range + } else if value.len() == MAX_VALUE_SIZE { + MAX_VALUE_SIZE * 2 + } else { + value.len() + }; + let mut blob_data_ = Vec::with_capacity(blob_size); + blob_data_.extend_from_slice( + value + .get(bytes_start..std::cmp::min(bytes_start + blob_range, value.len())) + .unwrap_or(&[]), + ); + if blob_data_.len() == blob_range { + return Ok(Some(blob_data_)); + } + blob_data = blob_data_.into(); + } + } + } + + Ok(blob_data) } pub(crate) async fn put_blob(&self, key: &[u8], data: &[u8]) -> trc::Result<()> { diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index a658fad5f..88c765b84 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -13,13 +13,12 @@ impl TikvStore { pub async fn open(config: &mut Config, prefix: impl AsKey) -> Option<Self> { - let prefix = prefix.as_key(); // Parse as SocketAddr but don't use it.
TransactionClient takes only a String vector - let pd_endpoints = config.properties::<SocketAddr>((&prefix, "pd-endpoints")) + let pd_endpoints = config.properties::<String>((&prefix, "pd-endpoints")) .into_iter() - .map(|(addr_str, _socket_addr)| addr_str) + .map(|(_key, addr_str)| addr_str) .collect(); let client = TransactionClient::new(pd_endpoints) diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs index 23acc9a6f..24953c1ed 100644 --- a/crates/store/src/backend/tikv/mod.rs +++ b/crates/store/src/backend/tikv/mod.rs @@ -5,15 +5,17 @@ */ use std::time::{Duration, Instant}; - -//use foundationdb::{api::NetworkAutoStop, Database, FdbError, Transaction}; -use tikv_client::{TransactionClient, Transaction, Error as TikvError}; +use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key}; pub mod blob; pub mod main; pub mod read; pub mod write; + +// https://github.com/tikv/tikv/issues/7272#issuecomment-604841372 + +const MAX_KEY_SIZE: usize = 4 * 1024; const MAX_VALUE_SIZE: usize = 100000; const MAX_KEYS: u32 = 100000; const MAX_KV_PAIRS: u32 = 50000; @@ -26,6 +28,36 @@ pub struct TikvStore { version: parking_lot::Mutex<ReadVersion>, } +pub(crate) enum ReadTransaction { + Transaction(Transaction), + Snapshot(Snapshot) +} + +impl ReadTransaction { + pub(crate) async fn get(&mut self, key: impl Into<Key>) -> trc::Result<Option<Value>> { + match self { + ReadTransaction::Transaction(trx) => { + trx.get(key).await.map_err(into_error) + } + ReadTransaction::Snapshot(ss) => { + ss.get(key).await.map_err(into_error) + } + } + } +} + +impl From<Transaction> for ReadTransaction { + fn from(value: Transaction) -> Self { + Self::Transaction(value) + } +} + +impl From<Snapshot> for ReadTransaction { + fn from(value: Snapshot) -> Self { + Self::Snapshot(value) + } +} + pub(crate) struct TimedTransaction { trx: Transaction, expires: Instant, } diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs index 0676c0bf2..bcbc068e7 100644 --- a/crates/store/src/backend/tikv/read.rs +++ b/crates/store/src/backend/tikv/read.rs @@ -3,8 +3,8 @@ * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ - -use tikv_client::{Transaction, Value}; +use bincode::Options; +use tikv_client::{Key as TikvKey, Snapshot, Transaction, TransactionOptions, Value}; use futures::TryStreamExt; use roaring::RoaringBitmap; use crate::{ @@ -16,7 +16,7 @@ BitmapKey, Deserialize, IterateParams, Key, ValueKey, U32_LEN, WITH_SUBSPACE, }; -use super::{into_error, TikvStore}; +use super::{into_error, MAX_KEYS, MAX_KV_PAIRS, MAX_VALUE_SIZE, ReadTransaction, TikvStore}; #[allow(dead_code)] pub(crate) enum ChunkedValue { @@ -30,7 +30,14 @@ where U: Deserialize, { - todo!() + let key = key.serialize(WITH_SUBSPACE); + let read_trx = ReadTransaction::Snapshot(self.snapshot_trx().await?); + + match read_chunked_value(&key, read_trx).await? { + ChunkedValue::Single(bytes) => U::deserialize(&bytes).map(Some), + ChunkedValue::Chunked { bytes, ..
} => U::deserialize(&bytes).map(Some), + ChunkedValue::None => Ok(None), + } } pub(crate) async fn get_bitmap( @@ -45,7 +52,52 @@ params: IterateParams<T>, mut cb: impl for<'x> FnMut(&'x [u8], &'x [u8]) -> trc::Result<bool> + Sync + Send, ) -> trc::Result<()> { - todo!() + let mut begin: TikvKey = params.begin.serialize(WITH_SUBSPACE).into(); + let end: TikvKey = params.end.serialize(WITH_SUBSPACE).into(); + + if !params.first { + let mut trx = self.snapshot_trx().await?; + loop { + let mut values = trx + .scan((begin.clone(), end.clone()), MAX_KV_PAIRS) + .await + .map_err(into_error)?; + + let mut last_key: TikvKey = begin.clone(); + + let mut total_kv_pairs = 0; + + while let Some(kv_pair) = values.next() { + total_kv_pairs += 1; + // Costly + last_key = kv_pair.key().clone(); + let key: &[u8] = kv_pair.key().into(); + let value: &[u8] = kv_pair.value(); + + cb(key.get(1..).unwrap_or_default(), value)?; + } + + if total_kv_pairs != MAX_KV_PAIRS { + break; + } + + // A full page was returned; advance past the last key seen before scanning again + let mut next_begin: Vec<u8> = last_key.into(); + next_begin.push(0); + begin = next_begin.into(); + } + } else { + let mut trx = self.snapshot_trx().await?; + let mut values = trx + .scan((begin, end), 1) + .await + .map_err(into_error)?; + + if let Some(kv_pair) = values.next() { + let key: &[u8] = kv_pair.key().into(); + let value: &[u8] = kv_pair.value(); + + cb(key.get(1..).unwrap_or_default(), value)?; + } + } + + Ok(()) } pub(crate) async fn get_counter( @@ -56,18 +108,50 @@ } pub(crate) async fn read_trx(&self) -> trc::Result<Transaction> { - todo!() + self.client + .begin_optimistic() + .await + .map_err(into_error) } - pub(crate) async fn timed_read_trx(&self) -> trc::Result<TimedTransaction> { - todo!() + pub(crate) async fn snapshot_trx(&self) -> trc::Result<Snapshot> { + let timestamp = self.client + .current_timestamp() + .await + .map_err(into_error)?; + + Ok(self.client.snapshot(timestamp, TransactionOptions::new_optimistic())) } } pub(crate) async fn read_chunked_value( key: &[u8], - trx: &Transaction, - snapshot: bool, + mut read_trx: ReadTransaction ) -> trc::Result<ChunkedValue> { - todo!() + // TODO: Costly, redo + if let Some(bytes) = read_trx.get(key.to_vec()).await? { + if bytes.len() < MAX_VALUE_SIZE { + Ok(ChunkedValue::Single(bytes)) + } else { + let mut value = Vec::with_capacity(bytes.len() * 2); + value.extend_from_slice(&bytes); + let mut key = KeySerializer::new(key.len() + 1) + .write(key) + .write(0u8) + .finalize(); + + // TODO: Costly, redo + while let Some(bytes) = read_trx.get(key.clone()).await?
{ + value.extend_from_slice(&bytes); + *key.last_mut().unwrap() += 1; + } + + Ok(ChunkedValue::Chunked { + bytes: value, + n_chunks: *key.last().unwrap(), + }) + } + } else { + Ok(ChunkedValue::None) + } } diff --git a/crates/store/src/config.rs b/crates/store/src/config.rs index 4e04c32f8..1fe3a7b79 100644 --- a/crates/store/src/config.rs +++ b/crates/store/src/config.rs @@ -32,6 +32,9 @@ use crate::backend::foundationdb::FdbStore; #[cfg(feature = "rocks")] use crate::backend::rocksdb::RocksDbStore; +#[cfg(feature = "tikv")] +use crate::backend::tikv::TikvStore; + #[cfg(feature = "elastic")] use crate::backend::elastic::ElasticSearchStore; @@ -172,6 +175,28 @@ impl Stores { self.lookup_stores.insert(store_id.clone(), db.into()); } } + #[cfg(feature = "tikv")] + "tikv" => { + // Avoid opening the same store twice + if is_reload + && self + .stores + .values() + .any(|store| matches!(store, Store::TiKV(_))) + { + continue; + } + + if let Some(db) = TikvStore::open(config, prefix).await.map(Store::from) { + self.stores.insert(store_id.clone(), db.clone()); + self.fts_stores.insert(store_id.clone(), db.clone().into()); + self.blob_stores.insert( + store_id.clone(), + BlobStore::from(db.clone()).with_compression(compression_algo), + ); + self.lookup_stores.insert(store_id, db.into()); + } + } "fs" => { if let Some(db) = FsStore::open(config, prefix).await.map(BlobStore::from) { self.blob_stores diff --git a/resources/config/config.toml b/resources/config/config.toml index f043265e6..376e54f51 100644 --- a/resources/config/config.toml +++ b/resources/config/config.toml @@ -42,25 +42,44 @@ protocol = "http" bind = ["[::]:443"] tls.implicit = true +# Uncomment after testing +#[storage] +#data = "rocksdb" +#fts = "rocksdb" +#blob = "rocksdb" +#lookup = "rocksdb" +#directory = "internal" +# +#[store."rocksdb"] +#type = "rocksdb" +#path = "%{env:STALWART_PATH}%/data" +#compression = "lz4" +# +#[directory."internal"] +#type = "internal" +#store = "rocksdb" +# Uncomment after testing + +# Delete after testing [storage] -data = "rocksdb" -fts = "rocksdb" -blob = "rocksdb" -lookup = "rocksdb" +data = "tikv" +fts = "tikv" +blob = "tikv" +lookup = "tikv" directory = "internal" -[store."rocksdb"] -type = "rocksdb" -path = "%{env:STALWART_PATH}%/data" -compression = "lz4" +[store."tikv"] +type = "tikv" +pd-endpoints = ["127.0.0.1:2379"] [directory."internal"] type = "internal" -store = "rocksdb" +store = "tikv" +# Delete after testing [tracer."stdout"] type = "stdout" -level = "info" +level = "trace" # switch back to info after debugging ansi = false enable = true From 1d4625d1891c86c70c5798057fc80dbc91e9ab94 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Sun, 4 Aug 2024 10:53:59 +1000 Subject: [PATCH 03/13] Add TiKV: Finish a runnable build --- crates/store/src/backend/tikv/blob.rs | 67 ++++- crates/store/src/backend/tikv/main.rs | 22 +- crates/store/src/backend/tikv/mod.rs | 39 ++- crates/store/src/backend/tikv/read.rs | 52 +++- crates/store/src/backend/tikv/write.rs | 364 ++++++++++++++++++++++++- resources/config/config.toml | 41 +-- 6 files changed, 500 insertions(+), 85 deletions(-) diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 22c14f26d..8c4c8a6b0 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -5,9 +5,10 @@ */ use std::ops::Range; +use utils::BLOB_HASH_LEN; use crate::SUBSPACE_BLOBS; use crate::write::key::KeySerializer; -use super::{into_error, MAX_KV_PAIRS, MAX_VALUE_SIZE, TikvStore}; +use 
super::{into_error, MAX_KEYS, MAX_KV_PAIRS, MAX_VALUE_SIZE, TikvStore}; impl TikvStore { pub(crate) async fn get_blob( @@ -80,10 +81,70 @@ } pub(crate) async fn put_blob(&self, key: &[u8], data: &[u8]) -> trc::Result<()> { - todo!() + const N_CHUNKS: usize = (1 << 5) - 1; + let last_chunk = std::cmp::max( + (data.len() / MAX_VALUE_SIZE) + + if data.len() % MAX_VALUE_SIZE > 0 { + 1 + } else { + 0 + }, + 1, + ) - 1; + let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + + for (chunk_pos, chunk_bytes) in data.chunks(MAX_VALUE_SIZE).enumerate() { + trx.put( + KeySerializer::new(key.len() + 3) + .write(SUBSPACE_BLOBS) + .write(key) + .write(chunk_pos as u16) + .finalize(), + chunk_bytes, + ).await.map_err(into_error)?; + if chunk_pos == last_chunk || (chunk_pos > 0 && chunk_pos % N_CHUNKS == 0) { + self.commit(trx, false).await?; + if chunk_pos < last_chunk { + trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + } else { + break; + } + } + } + + Ok(()) } pub(crate) async fn delete_blob(&self, key: &[u8]) -> trc::Result<bool> { - todo!() + if key.len() < BLOB_HASH_LEN { + return Ok(false); + } + // Shouldn't grab millions of keys anyways but + // TODO: Optimise + let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + + let begin = KeySerializer::new(key.len() + 3) + .write(SUBSPACE_BLOBS) + .write(key) + .write(0u16) + .finalize(); + let end = KeySerializer::new(key.len() + 3) + .write(SUBSPACE_BLOBS) + .write(key) + .write(u16::MAX) + .finalize(); + let keys = trx.scan_keys((begin, end), MAX_KEYS).await.map_err(into_error)?; + + let mut key_count = 0; + for key in keys { + key_count += 1; + trx.delete(key).await.map_err(into_error)?; + } + if key_count == 0 { + trx.rollback().await.map_err(into_error)?; + return Ok(false); + } + + self.commit(trx, false).await } } \ No newline at end of file diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index 88c765b84..7be31132a 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -6,7 +6,7 @@ use std::net::SocketAddr; -use tikv_client::TransactionClient; +use tikv_client::{RawClient, TransactionClient}; use utils::config::{utils::AsKey, Config}; use super::TikvStore; @@ -16,12 +16,12 @@ impl TikvStore { let prefix = prefix.as_key(); // Parse as SocketAddr but don't use it. TransactionClient takes only a String vector - let pd_endpoints = config.properties::<String>((&prefix, "pd-endpoints")) + let pd_endpoints= config.properties::<String>((&prefix, "pd-endpoints")) .into_iter() .map(|(_key, addr_str)| addr_str) - .collect(); + .collect::<Vec<String>>(); - let client = TransactionClient::new(pd_endpoints) + let trx_client = TransactionClient::new(pd_endpoints.clone()) .await .map_err(|err| { config.new_build_error( @@ -31,8 +31,20 @@ }) .ok()?; + let raw_client = RawClient::new(pd_endpoints) + .await + .map_err(|err| { + config.new_build_error( + prefix.as_str(), + format!("Failed to create TiKV database: {err:?}"), + ) + }) + .ok()?
+ .with_atomic_for_cas(); + Some(Self { - client, + trx_client, + raw_client, version: Default::default(), }) } diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs index 24953c1ed..b9ec1652b 100644 --- a/crates/store/src/backend/tikv/mod.rs +++ b/crates/store/src/backend/tikv/mod.rs @@ -5,7 +5,10 @@ */ use std::time::{Duration, Instant}; -use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key}; +use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key, Timestamp, RawClient}; +use tikv_client::proto::kvrpcpb; +use tikv_client::proto::kvrpcpb::Mutation; +use crate::write::{AssignedIds, ValueOp}; pub mod blob; pub mod main; @@ -24,17 +27,19 @@ pub const TRANSACTION_TIMEOUT: Duration = Duration::from_secs(4); #[allow(dead_code)] pub struct TikvStore { - client: TransactionClient, + trx_client: TransactionClient, + raw_client: RawClient, version: parking_lot::Mutex, } -pub(crate) enum ReadTransaction { - Transaction(Transaction), - Snapshot(Snapshot) +// TODO: Remove +pub(crate) enum ReadTransaction<'db> { + Transaction(&'db mut Transaction), + Snapshot(&'db mut Snapshot) } -impl ReadTransaction { - pub(crate) async fn get(&mut self, key: impl Into) -> trc::Result> { +impl<'a> ReadTransaction<'a> { + pub(crate) async fn get(&'a mut self, key: impl Into) -> trc::Result> { match self { ReadTransaction::Transaction(trx) => { trx.get(key).await.map_err(into_error) @@ -46,30 +51,18 @@ impl ReadTransaction { } } -impl From for ReadTransaction { - fn from(value: Transaction) -> Self { - Self::Transaction(value) - } -} - -impl From for ReadTransaction { - fn from(value: Snapshot) -> Self { - Self::Snapshot(value) - } -} - pub(crate) struct TimedTransaction { trx: Transaction, expires: Instant, } pub(crate) struct ReadVersion { - version: i64, + version: Timestamp, expires: Instant, } impl ReadVersion { - pub fn new(version: i64) -> Self { + pub fn new(version: Timestamp) -> Self { Self { version, expires: Instant::now() + TRANSACTION_EXPIRY, @@ -84,7 +77,7 @@ impl ReadVersion { impl Default for ReadVersion { fn default() -> Self { Self { - version: 0, + version: Timestamp::default(), expires: Instant::now(), } } @@ -111,6 +104,6 @@ impl TimedTransaction { #[inline(always)] fn into_error(error: TikvError) -> trc::Error { - trc::StoreEvent::FoundationdbError + trc::StoreEvent::TikvError .reason(error.to_string()) } diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs index bcbc068e7..3594f52c4 100644 --- a/crates/store/src/backend/tikv/read.rs +++ b/crates/store/src/backend/tikv/read.rs @@ -31,9 +31,9 @@ impl TikvStore { U: Deserialize, { let key = key.serialize(WITH_SUBSPACE); - let read_trx = ReadTransaction::Snapshot(self.snapshot_trx().await?); + let mut ss = self.snapshot_trx().await?; - match read_chunked_value(&key, read_trx).await? { + match read_chunked_value_snapshot(&key, &mut ss).await? { ChunkedValue::Single(bytes) => U::deserialize(&bytes).map(Some), ChunkedValue::Chunked { bytes, .. 
} => U::deserialize(&bytes).map(Some), ChunkedValue::None => Ok(None), @@ -108,28 +108,29 @@ impl TikvStore { } pub(crate) async fn read_trx(&self) -> trc::Result { - self.client + self.trx_client .begin_optimistic() .await .map_err(into_error) } pub(crate) async fn snapshot_trx(&self) -> trc::Result { - let timestamp = self.client + let timestamp = self.trx_client .current_timestamp() .await .map_err(into_error)?; - Ok(self.client.snapshot(timestamp, TransactionOptions::new_optimistic())) + Ok(self.trx_client.snapshot(timestamp, TransactionOptions::new_optimistic())) } } -pub(crate) async fn read_chunked_value( +// TODO: Figure out a way to deduplicate the code +pub(crate) async fn read_chunked_value_snapshot( key: &[u8], - mut read_trx: ReadTransaction + ss: &mut Snapshot ) -> trc::Result { // TODO: Costly, redo - if let Some(bytes) = read_trx.get(key.to_vec()).await? { + if let Some(bytes) = ss.get(key.to_vec()).await.map_err(into_error)? { if bytes.len() < MAX_VALUE_SIZE { Ok(ChunkedValue::Single(bytes)) } else { @@ -141,7 +142,7 @@ pub(crate) async fn read_chunked_value( .finalize(); // TODO: Costly, redo - while let Some(bytes) = read_trx.get(key.clone()).await? { + while let Some(bytes) = ss.get(key.to_vec()).await.map_err(into_error)? { value.extend_from_slice(&bytes); *key.last_mut().unwrap() += 1; } @@ -155,3 +156,36 @@ pub(crate) async fn read_chunked_value( Ok(ChunkedValue::None) } } + +// TODO: Figure out a way to deduplicate the code +pub(crate) async fn read_chunked_value_transaction( + key: &[u8], + trx: &mut Transaction +) -> trc::Result { + // TODO: Costly, redo + if let Some(bytes) = trx.get(key.to_vec()).await.map_err(into_error)? { + if bytes.len() < MAX_VALUE_SIZE { + Ok(ChunkedValue::Single(bytes)) + } else { + let mut value = Vec::with_capacity(bytes.len() * 2); + value.extend_from_slice(&bytes); + let mut key = KeySerializer::new(key.len() + 1) + .write(key) + .write(0u8) + .finalize(); + + // TODO: Costly, redo + while let Some(bytes) = trx.get(key.to_vec()).await.map_err(into_error)? 
{ + value.extend_from_slice(&bytes); + *key.last_mut().unwrap() += 1; + } + + Ok(ChunkedValue::Chunked { + bytes: value, + n_chunks: *key.last().unwrap(), + }) + } + } else { + Ok(ChunkedValue::None) + } +} \ No newline at end of file diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index 9f065647a..9569d4066 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -8,10 +8,8 @@ use std::{ cmp::Ordering, time::{Duration, Instant}, }; - -use tikv_client::{ - Transaction -}; +use std::collections::Bound; +use tikv_client::{BoundRange, TimestampExt, Transaction, Value}; use rand::Rng; use roaring::RoaringBitmap; @@ -24,26 +22,362 @@ use crate::{ }, BitmapKey, IndexKey, Key, LogKey, SUBSPACE_COUNTER, SUBSPACE_QUOTA, U32_LEN, WITH_SUBSPACE, }; - -use super::{ - into_error, - read::{read_chunked_value, ChunkedValue}, - TikvStore, ReadVersion, MAX_VALUE_SIZE, -}; +use crate::write::key; +use super::{into_error, read::{read_chunked_value_transaction, ChunkedValue}, TikvStore, ReadVersion, MAX_VALUE_SIZE, MAX_KEYS, ReadTransaction}; impl TikvStore { pub(crate) async fn write(&self, batch: Batch) -> trc::Result { - todo!() + let start = Instant::now(); + let mut retry_count = 0; + + loop { + let mut account_id = u32::MAX; + let mut collection = u8::MAX; + let mut document_id = u32::MAX; + let mut change_id = u64::MAX; + let mut result = AssignedIds::default(); + let mut atomic_adds = vec![]; + let mut trx = self.trx_client.begin_optimistic().await.map_err(into_error)?; + + for op in &batch.ops { + match op { + Operation::AccountId { + account_id: account_id_, + } => { + account_id = *account_id_; + } + Operation::Collection { + collection: collection_, + } => { + collection = *collection_; + } + Operation::DocumentId { + document_id: document_id_, + } => { + document_id = *document_id_; + } + Operation::ChangeId { + change_id: change_id_, + } => { + change_id = *change_id_; + } + Operation::Value { class, op } => { + let mut key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + let do_chunk = !class.is_counter(collection); + + match op { + ValueOp::Set(value) => { + let value = value.resolve(&result)?; + if !value.is_empty() && do_chunk { + for (pos, chunk) in value.chunks(MAX_VALUE_SIZE).enumerate() { + match pos.cmp(&1) { + Ordering::Less => {} + Ordering::Equal => { + key.push(0); + } + Ordering::Greater => { + if pos < u8::MAX as usize { + *key.last_mut().unwrap() += 1; + } else { + //trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::TikvError + .ctx( + trc::Key::Reason, + "Value is too large", + )); + } + } + } + // TODO: Costly clone + trx.put(key.clone(), chunk).await.map_err(into_error)?; + } + } else { + // TODO: Costly clone + trx.put(key.clone(), value.as_ref()).await.map_err(into_error)?; + } + } + ValueOp::AtomicAdd(by) => { + // Duplicating AddAndGet because TiKV has no atomic add + // TODO: Costly clone + let atomic_add_key = key.clone(); + atomic_adds.push(self.atomic_add(atomic_add_key, *by)); + } + ValueOp::AddAndGet(by) => { + // TODO: Costly clone + let num = if let Some(bytes) = + trx.get_for_update(key.clone()).await.map_err(into_error)? + { + deserialize_i64_le(&key, &bytes)? 
+ *by + } else { + *by + }; + // TODO: Costly clone + trx.put(key.clone(), &num.to_le_bytes()[..]).await.map_err(into_error)?; + result.push_counter_id(num); + } + ValueOp::Clear => { + if do_chunk { + let range = BoundRange::new( + // TODO: Costly clone jesus christ + Bound::Included(key.clone().into()), + Bound::Included(KeySerializer::new(key.len() + 1) + .write(key.as_slice()) + .write(u8::MAX) + .finalize().into()), + ); + // TODO: Repeat after reaching max keys + let mut keys = trx.scan_keys(range, MAX_KEYS).await.map_err(into_error)?; + + while let Some(key) = keys.next() { + trx.delete(key).await.map_err(into_error)?; + } + } else { + // TODO: Costly clone + trx.delete(key).await.map_err(into_error)?; + } + } + } + } + Operation::Index { field, key, set } => { + let key = IndexKey { + account_id, + collection, + document_id, + field: *field, + key, + } + .serialize(WITH_SUBSPACE); + + if *set { + trx.put(key, &[]).await.map_err(into_error)?; + } else { + trx.delete(key).await.map_err(into_error)?; + } + } + Operation::Bitmap { class, set } => { + // Find the next available document id + let assign_id = *set + && matches!(class, BitmapClass::DocumentIds) + && document_id == u32::MAX; + if assign_id { + let begin = BitmapKey { + account_id, + collection, + class: BitmapClass::DocumentIds, + document_id: 0, + } + .serialize(WITH_SUBSPACE); + let end = BitmapKey { + account_id, + collection, + class: BitmapClass::DocumentIds, + document_id: u32::MAX, + } + .serialize(WITH_SUBSPACE); + let key_len = begin.len(); + // TODO: Do repeat logic + let mut values = trx.scan_keys((begin, end), MAX_KEYS).await.map_err(into_error)?; + let mut found_ids = RoaringBitmap::new(); + while let Some(key) = values.next() { + if key.len() == key_len { + let key_vec: Vec = key.into(); + found_ids.insert(key_vec.as_slice().deserialize_be_u32(key_len - U32_LEN)?); + } else { + break; + } + } + document_id = found_ids.random_available_id(); + result.push_document_id(document_id); + } + + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + + if *set { + if assign_id { + let keys_iter = trx.scan_keys((key.clone(), class.serialize( + account_id, + collection, + document_id + 1, + WITH_SUBSPACE, + (&result).into(), + )), MAX_KEYS).await.map_err(into_error)?; + trx.lock_keys(keys_iter).await.map_err(into_error)?; + } + + trx.put(key, &[]).await.map_err(into_error)?; + } else { + trx.delete(key).await.map_err(into_error)?; + } + } + Operation::Log { set } => { + let key = LogKey { + account_id, + collection, + change_id, + } + .serialize(WITH_SUBSPACE); + trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; + } + Operation::AssertValue { + class, + assert_value, + } => { + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + + let matches = match read_chunked_value_transaction(&key, &mut trx).await { + Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_ref()), + Ok(ChunkedValue::Chunked { bytes, .. }) => { + assert_value.matches(bytes.as_ref()) + } + Ok(ChunkedValue::None) => assert_value.is_none(), + Err(_) => false, + }; + + if !matches { + trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::AssertValueFailed.into()); + } + } + } + } + + if self + .commit( + trx, + retry_count < MAX_COMMIT_ATTEMPTS && start.elapsed() < MAX_COMMIT_TIME, + ) + .await? 
+ { + for fut in atomic_adds { + fut.await?; + } + return Ok(result); + } else { + let backoff = rand::thread_rng().gen_range(50..=300); + tokio::time::sleep(Duration::from_millis(backoff)).await; + retry_count += 1; + } + } } pub(crate) async fn commit(&self, mut trx: Transaction, will_retry: bool) -> trc::Result<bool> { - todo!() + match trx.commit().await { + Ok(result) => { + let commit_timestamp = result.ok_or_else(|| trc::StoreEvent::FoundationdbError + .reason("couldn't get commit timestamp".to_string()))?; + let mut version = self.version.lock(); + // I hate this + if commit_timestamp.version() > version.version.version() { + *version = ReadVersion::new(commit_timestamp); + } + Ok(true) + } + Err(err) => { + trx.rollback().await.map_err(into_error)?; + if will_retry { + Ok(false) + } else { + Err(into_error(err)) + } + } + } } pub(crate) async fn purge_store(&self) -> trc::Result<()> { - todo!() + // Obtain all zero counters + for subspace in [SUBSPACE_COUNTER, SUBSPACE_QUOTA] { + let from_key = vec![subspace, 0u8]; + let to_key = vec![subspace, u8::MAX, u8::MAX, u8::MAX, u8::MAX, u8::MAX]; + + const CHUNK_LIMIT: u32 = 1024; + + loop { + let mut key_count = 0; + let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + let mut keys = trx.scan_keys((from_key.clone(), to_key.clone()), CHUNK_LIMIT).await.map_err(into_error)?; + for key in keys { + key_count += 1; + trx.delete(key).await.map_err(into_error)?; + } + if key_count == 0 { + trx.rollback().await.map_err(into_error)?; + } + // TODO: Retry on error + self.commit(trx, false).await?; + if key_count != CHUNK_LIMIT { + break; + } + } + } + + Ok(()) } pub(crate) async fn delete_range(&self, from: impl Key, to: impl Key) -> trc::Result<()> { - todo!() + let from = from.serialize(WITH_SUBSPACE); + let to = to.serialize(WITH_SUBSPACE); + + let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + // Have to manually get the range first; + // TODO: Chunked key scans and locks + let mut keys = trx.scan_keys((from, to), MAX_KEYS).await.map_err(into_error)?; + let key_vec: Vec<tikv_client::Key> = keys.collect(); + // TODO: Expensive clone :( + trx.lock_keys(key_vec.clone()).await.map_err(into_error)?; + for key in key_vec { + trx.delete(key).await.map_err(into_error)?; + } + + self.commit(trx, false).await.map(|_| ()) + } + + pub(crate) async fn atomic_compare_and_clear(&self, key: Vec<u8>, by: &[u8]) -> trc::Result<bool> { + let Some(value) = self.raw_client.get(key.clone()).await.map_err(into_error)? else { + // Nothing to compare as there is nothing to clear. + return Ok(false) + }; + + return if by == value.as_slice() { + self.raw_client.delete(key).await.map_err(into_error)?; + Ok(true) + } else { + Ok(false) + } + } + + pub(crate) async fn atomic_add(&self, key: Vec<u8>, by: i64) -> trc::Result<Option<i64>> { + let maybe_set_value = self.raw_client.get(key.to_vec()).await.map_err(into_error)?; + + let sum = match &maybe_set_value { + None => by, + Some(original) => deserialize_i64_le(key.as_slice(), original.as_slice())?
+ by + }; + let (_previous, swapped) = self.raw_client + .compare_and_swap(key.to_vec(), maybe_set_value, sum.to_le_bytes().to_vec()) + .await + .map_err(into_error)?; + + return if swapped { + Ok(Some(sum)) + } else { + Ok(None) + } } } diff --git a/resources/config/config.toml b/resources/config/config.toml index 376e54f51..e68bf764a 100644 --- a/resources/config/config.toml +++ b/resources/config/config.toml @@ -42,44 +42,25 @@ protocol = "http" bind = ["[::]:443"] tls.implicit = true -# Uncomment after testing -#[storage] -#data = "rocksdb" -#fts = "rocksdb" -#blob = "rocksdb" -#lookup = "rocksdb" -#directory = "internal" -# -#[store."rocksdb"] -#type = "rocksdb" -#path = "%{env:STALWART_PATH}%/data" -#compression = "lz4" -# -#[directory."internal"] -#type = "internal" -#store = "rocksdb" -# Uncomment after testing - -# Delete after testing [storage] -data = "tikv" -fts = "tikv" -blob = "tikv" -lookup = "tikv" +data = "rocksdb" +fts = "rocksdb" +blob = "rocksdb" +lookup = "rocksdb" directory = "internal" -[store."tikv"] -type = "tikv" -pd-endpoints = ["127.0.0.1:2379"] +[store."rocksdb"] +type = "rocksdb" +path = "%{env:STALWART_PATH}%/data" +compression = "lz4" [directory."internal"] type = "internal" -store = "tikv" -# Delete after testing +store = "rocksdb" [tracer."stdout"] type = "stdout" -level = "trace" # switch back to info after debugging +level = "info" ansi = false enable = true @@ -89,4 +70,4 @@ enable = true [authentication.fallback-admin] user = "admin" -secret = "%{env:ADMIN_SECRET}%" +secret = "%{env:ADMIN_SECRET}%" \ No newline at end of file From 67985392581eeba83c3f1bb2e8b9cd2b8dedc79a Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Sun, 4 Aug 2024 14:25:46 +1000 Subject: [PATCH 04/13] Add TiKV: Get into a working shape --- crates/store/src/backend/tikv/blob.rs | 2 +- crates/store/src/backend/tikv/main.rs | 10 +-- crates/store/src/backend/tikv/read.rs | 86 +++++++++++++------- crates/store/src/backend/tikv/write.rs | 104 +++++++++++++++++++++---- tests/Cargo.toml | 5 +- tests/src/store/mod.rs | 4 + 6 files changed, 160 insertions(+), 51 deletions(-) diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 8c4c8a6b0..6131ca967 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -33,7 +33,7 @@ impl TikvStore { let key_len = begin.len(); let mut trx = self.snapshot_trx().await?; // TODO: Create repeat logic for over max - let mut values = trx.scan((begin, end), MAX_KV_PAIRS).await.map_err(into_error)?; + let mut values = trx.scan((begin, end), u32::MAX).await.map_err(into_error)?; let mut blob_data: Option> = None; let blob_range = range.end - range.start; diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index 7be31132a..dc863f16c 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -4,11 +4,8 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use std::net::SocketAddr; - use tikv_client::{RawClient, TransactionClient}; use utils::config::{utils::AsKey, Config}; - use super::TikvStore; impl TikvStore { @@ -42,10 +39,13 @@ impl TikvStore { .ok()? 
.with_atomic_for_cas(); - Some(Self { + let store = Self { trx_client, raw_client, version: Default::default(), - }) + }; + + Some(store) } } + diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs index 3594f52c4..10afa39ae 100644 --- a/crates/store/src/backend/tikv/read.rs +++ b/crates/store/src/backend/tikv/read.rs @@ -3,7 +3,7 @@ * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use bincode::Options; + use tikv_client::{Key as TikvKey, Snapshot, Transaction, TransactionOptions, Value}; use futures::TryStreamExt; use roaring::RoaringBitmap; @@ -34,9 +34,14 @@ impl TikvStore { let mut ss = self.snapshot_trx().await?; match read_chunked_value_snapshot(&key, &mut ss).await? { - ChunkedValue::Single(bytes) => U::deserialize(&bytes).map(Some), - ChunkedValue::Chunked { bytes, .. } => U::deserialize(&bytes).map(Some), - ChunkedValue::None => Ok(None), + ChunkedValue::Single(bytes) => { + U::deserialize(&bytes).map(Some) + }, + ChunkedValue::Chunked { bytes, .. } => { + U::deserialize(&bytes).map(Some) }, + ChunkedValue::None => { + Ok(None) + }, } } @@ -44,7 +49,26 @@ impl TikvStore { &self, mut key: BitmapKey<BitmapClass<u32>>, ) -> trc::Result<Option<RoaringBitmap>> { - todo!() + let mut bm = RoaringBitmap::new(); + let begin = key.serialize(WITH_SUBSPACE); + key.document_id = u32::MAX; + let end = key.serialize(WITH_SUBSPACE); + let key_len = begin.len(); + // Maybe use transaction client? + let mut trx = self.snapshot_trx().await?; + let mut keys = trx.scan_keys( + (begin, end), + MAX_KEYS + ).await.map_err(into_error)?; + + for key in keys { + let key: Vec<u8> = key.into(); + if key.len() == key_len { + bm.insert(key.as_slice().deserialize_be_u32(key.len() - U32_LEN)?); + } + } + + Ok(if !bm.is_empty() { Some(bm) } else { None }) } pub(crate) async fn iterate( &self, @@ -55,35 +79,27 @@ impl TikvStore { let mut begin: TikvKey = params.begin.serialize(WITH_SUBSPACE).into(); let end: TikvKey = params.end.serialize(WITH_SUBSPACE).into(); + let mut trx = self.snapshot_trx().await?; if !params.first { - let mut trx = self.snapshot_trx().await?; - loop { - let mut values = trx - .scan((begin.clone(), end.clone()), MAX_KV_PAIRS) - .await - .map_err(into_error)?; - - let mut last_key: TikvKey = begin.clone(); - - let mut total_kv_pairs = 0; - + // TODO: Limit by max_keys + if params.ascending { + let mut values = trx.scan((begin, end), u32::MAX).await.map_err(into_error)?; while let Some(kv_pair) = values.next() { - total_kv_pairs += 1; - // Costly - last_key = kv_pair.key().clone(); let key: &[u8] = kv_pair.key().into(); - let value: &[u8] = kv_pair.key().into(); - + let value: &[u8] = kv_pair.value().as_slice(); cb(key.get(1..).unwrap_or_default(), value)?; } - - if total_kv_pairs != MAX_KV_PAIRS { - begin = last_key; - break; + } else { + let mut values = trx.scan_reverse((begin, end), u32::MAX).await.map_err(into_error)?; + while let Some(kv_pair) = values.next() { + let mut last_key = &[] as &[u8]; + let key: &[u8] = kv_pair.key().into(); + let value: &[u8] = kv_pair.value().as_slice(); + cb(key.get(1..).unwrap_or_default(), value)?; } - } + }; + } else { - let mut trx = self.snapshot_trx().await?; let mut values = trx .scan((begin, end), 1) .await @@ -104,7 +120,18 @@ impl TikvStore { &self, key: impl Into<ValueKey<ValueClass<u32>>> + Sync + Send, ) -> trc::Result<i64> { - todo!() + let key = key.into().serialize(WITH_SUBSPACE); + // TODO: Expensive clone + if let Some(bytes) = self + .raw_client + .get(key.clone()) + .await + .map_err(into_error)?
+ { + deserialize_i64_le(&key, &bytes) + } else { + Ok(0) + } } pub(crate) async fn read_trx(&self) -> trc::Result { @@ -174,8 +201,9 @@ pub(crate) async fn read_chunked_value_transaction( .write(0u8) .finalize(); + // TODO: Costly, redo - while let Some(bytes) = trx.get(key.to_vec()).await.map_err(into_error)? { + while let Some(bytes) = trx.get(key.clone()).await.map_err(into_error)? { value.extend_from_slice(&bytes); *key.last_mut().unwrap() += 1; } diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index 9569d4066..e9e5e8047 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -12,7 +12,7 @@ use std::collections::Bound; use tikv_client::{BoundRange, TimestampExt, Transaction, Value}; use rand::Rng; use roaring::RoaringBitmap; - +use tikv_client::proto::kvrpcpb::{Assertion, Mutation, Op}; use crate::{ backend::deserialize_i64_le, write::{ @@ -36,7 +36,10 @@ impl TikvStore { let mut document_id = u32::MAX; let mut change_id = u64::MAX; let mut result = AssignedIds::default(); + let mut atomic_adds = vec![]; + let mut batch_mutate = vec![]; + let mut trx = self.trx_client.begin_optimistic().await.map_err(into_error)?; for op in &batch.ops { @@ -62,6 +65,7 @@ impl TikvStore { change_id = *change_id_; } Operation::Value { class, op } => { + //println!("{:?}", class); let mut key = class.serialize( account_id, collection, @@ -95,11 +99,24 @@ impl TikvStore { } } // TODO: Costly clone - trx.put(key.clone(), chunk).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key: key.to_vec(), + value: chunk.to_vec(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); } } else { // TODO: Costly clone - trx.put(key.clone(), value.as_ref()).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key: key.to_vec(), + value: value.to_vec(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + //trx.put(key.clone(), value.as_ref()).await.map_err(into_error)?; } } ValueOp::AtomicAdd(by) => { @@ -118,7 +135,14 @@ impl TikvStore { *by }; // TODO: Costly clone - trx.put(key.clone(), &num.to_le_bytes()[..]).await.map_err(into_error)?; + //trx.put(key.clone(), &num.to_le_bytes()[..]).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key: key.to_vec(), + value: num.to_le_bytes().to_vec(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); result.push_counter_id(num); } ValueOp::Clear => { @@ -132,14 +156,29 @@ impl TikvStore { .finalize().into()), ); // TODO: Repeat after reaching max keys - let mut keys = trx.scan_keys(range, MAX_KEYS).await.map_err(into_error)?; + let mut keys = trx.scan_keys(range, u32::MAX).await.map_err(into_error)?; + while let Some(key) = keys.next() { - trx.delete(key).await.map_err(into_error)?; + //trx.delete(key).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Del.into(), + key: key.into(), + value: Default::default(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); } } else { // TODO: Costly clone - trx.delete(key).await.map_err(into_error)?; + //trx.delete(key).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Del.into(), + key: key.into(), + value: Default::default(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); } } } @@ -155,9 +194,23 @@ impl TikvStore { .serialize(WITH_SUBSPACE); if *set { - trx.put(key, &[]).await.map_err(into_error)?; + //trx.put(key, 
&[]).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key, + value: vec![], + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); } else { - trx.delete(key).await.map_err(into_error)?; + //trx.delete(key).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Del.into(), + key, + value: Default::default(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); } } Operation::Bitmap { class, set } => { @@ -212,13 +265,26 @@ impl TikvStore { document_id + 1, WITH_SUBSPACE, (&result).into(), - )), MAX_KEYS).await.map_err(into_error)?; + )), u32::MAX).await.map_err(into_error)?; trx.lock_keys(keys_iter).await.map_err(into_error)?; } - - trx.put(key, &[]).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key, + value: vec![], + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + //trx.put(key, &[]).await.map_err(into_error)?; } else { - trx.delete(key).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Del.into(), + key, + value: Default::default(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + //trx.delete(key).await.map_err(into_error)?; } } Operation::Log { set } => { @@ -228,7 +294,14 @@ impl TikvStore { change_id, } .serialize(WITH_SUBSPACE); - trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key, + value: set.resolve(&result)?.into_owned(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + //trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; } Operation::AssertValue { class, @@ -259,6 +332,9 @@ impl TikvStore { } } + batch_mutate.reverse(); + trx.batch_mutate(batch_mutate).await.map_err(into_error)?; + if self .commit( trx, diff --git a/tests/Cargo.toml b/tests/Cargo.toml index df762f967..c57f5f3eb 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -5,8 +5,8 @@ edition = "2021" resolver = "2" [features] -default = ["sqlite", "postgres", "mysql", "rocks", "elastic", "s3", "redis", "foundationdb"] -#default = ["sqlite", "postgres", "mysql", "rocks", "elastic", "s3", "redis", "foundationdb"] +default = ["sqlite", "postgres", "mysql", "rocks", "elastic", "s3", "redis", "foundationdb", "tikv"] +#default = ["sqlite", "postgres", "mysql", "rocks", "elastic", "s3", "redis", "foundationdb", "tikv"] sqlite = ["store/sqlite"] foundationdb = ["store/foundation", "common/foundation"] postgres = ["store/postgres"] @@ -15,6 +15,7 @@ rocks = ["store/rocks"] elastic = ["store/elastic"] s3 = ["store/s3"] redis = ["store/redis"] +tikv = ["store/tikv"] [dev-dependencies] store = { path = "../crates/store", features = ["test_mode"] } diff --git a/tests/src/store/mod.rs b/tests/src/store/mod.rs index 503eb1a20..b9877ab68 100644 --- a/tests/src/store/mod.rs +++ b/tests/src/store/mod.rs @@ -67,6 +67,10 @@ type = "redis" urls = "redis://127.0.0.1" redis-type = "single" +[store."tikv"] +type = "tikv" +pd-endpoints = ["localhost:2379"] + "#; #[tokio::test(flavor = "multi_thread")] From 3152713284fac20aed2918a10a827037567f131c Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Sun, 4 Aug 2024 22:03:08 +1000 Subject: [PATCH 05/13] Add TiKV: Optimise write --- crates/store/src/backend/tikv/blob.rs | 15 +- crates/store/src/backend/tikv/main.rs | 61 ++- crates/store/src/backend/tikv/mod.rs | 4 +- crates/store/src/backend/tikv/write.rs | 711 ++++++++++++++----------- 4 files changed, 484 insertions(+), 307 deletions(-) diff 
--git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 6131ca967..2cbac8978 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -91,7 +91,10 @@ impl TikvStore { }, 1, ) - 1; - let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + let mut trx = self.trx_client + .begin_with_options(self.write_trx_options.clone()) + .await + .map_err(into_error)?; for (chunk_pos, chunk_bytes) in data.chunks(MAX_VALUE_SIZE).enumerate() { trx.put( @@ -105,7 +108,10 @@ impl TikvStore { if chunk_pos == last_chunk || (chunk_pos > 0 && chunk_pos % N_CHUNKS == 0) { self.commit(trx, false).await?; if chunk_pos < last_chunk { - trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + trx = self.trx_client + .begin_with_options(self.write_trx_options.clone()) + .await + .map_err(into_error)?; } else { break; } @@ -121,7 +127,10 @@ impl TikvStore { } // Shouldn't grab millions of keys anyways but // TODO: Optimise - let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + let mut trx = self.trx_client + .begin_with_options(self.write_trx_options.clone()) + .await + .map_err(into_error)?; let begin = KeySerializer::new(key.len() + 3) .write(SUBSPACE_BLOBS) diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index dc863f16c..b2f97d5b8 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -3,8 +3,8 @@ * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ - -use tikv_client::{RawClient, TransactionClient}; +use std::time::Duration; +use tikv_client::{Backoff, CheckLevel, RawClient, RetryOptions, TransactionClient, TransactionOptions}; use utils::config::{utils::AsKey, Config}; use super::TikvStore; @@ -39,9 +39,66 @@ impl TikvStore { .ok()? 
.with_atomic_for_cas(); + let backoff_min_delay = config + .property::<Duration>((&prefix, "transaction.backoff-min-delay")) + .unwrap_or_else(|| Duration::from_millis(2)); + + let backoff_max_delay = config + .property::<Duration>((&prefix, "transaction.backoff-max-delay")) + .unwrap_or_else(|| Duration::from_millis(500)); + + let max_attempts = config + .property::<u32>((&prefix, "transaction.backoff-retry-limit")) + .unwrap_or_else(|| 10); + + let backoff = if let Some(backoff_type) = config + .property::<String>((&prefix, "transaction.backoff-type")) { + match backoff_type.as_str() { + "expo-jitter" => Backoff::no_jitter_backoff( + backoff_min_delay.as_millis() as u64, + backoff_max_delay.as_millis() as u64, + max_attempts + ), + "equal-jitter" => Backoff::equal_jitter_backoff( + backoff_min_delay.as_millis() as u64, + backoff_max_delay.as_millis() as u64, + max_attempts + ), + "decor-jitter" => Backoff::decorrelated_jitter_backoff( + backoff_min_delay.as_millis() as u64, + backoff_max_delay.as_millis() as u64, + max_attempts + ), + "none" => Backoff::no_backoff(), + // Default + "full-jitter" | &_ => Backoff::full_jitter_backoff( + backoff_min_delay.as_millis() as u64, + backoff_max_delay.as_millis() as u64, + max_attempts + ), + } + } else { + // Default + Backoff::full_jitter_backoff( + backoff_min_delay.as_millis() as u64, + backoff_max_delay.as_millis() as u64, + max_attempts + ) + }; + + let raw_client_retries = backoff.is_none().then(|| 0).unwrap_or_else(|| max_attempts); + + let write_trx_options = TransactionOptions::new_pessimistic() + .drop_check(CheckLevel::Warn) + .retry_options(RetryOptions::new(backoff.clone(), backoff.clone())); + + let raw_backoff = backoff; + let store = Self { trx_client, + write_trx_options, raw_client, + raw_backoff, version: Default::default(), }; diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs index b9ec1652b..c51d04bd6 100644 --- a/crates/store/src/backend/tikv/mod.rs +++ b/crates/store/src/backend/tikv/mod.rs @@ -5,7 +5,7 @@ */ use std::time::{Duration, Instant}; -use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key, Timestamp, RawClient}; +use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key, Timestamp, RawClient, TransactionOptions, Backoff}; use tikv_client::proto::kvrpcpb; use tikv_client::proto::kvrpcpb::Mutation; use crate::write::{AssignedIds, ValueOp}; @@ -28,7 +28,9 @@ pub const TRANSACTION_TIMEOUT: Duration = Duration::from_secs(4); #[allow(dead_code)] pub struct TikvStore { trx_client: TransactionClient, + write_trx_options: TransactionOptions, raw_client: RawClient, + raw_backoff: Backoff, version: parking_lot::Mutex<ReadVersion>, } diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index e9e5e8047..4c1ed4487 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -9,7 +9,8 @@ use std::{ cmp::Ordering, time::{Duration, Instant}, }; use std::collections::Bound; -use tikv_client::{BoundRange, TimestampExt, Transaction, Value}; +use std::num::Wrapping; +use tikv_client::{Backoff, BoundRange, Key as TikvKey, TimestampExt, Transaction, Value}; use rand::Rng; use roaring::RoaringBitmap; use tikv_client::proto::kvrpcpb::{Assertion, Mutation, Op}; use crate::{ backend::deserialize_i64_le, write::{ @@ -27,150 +28,137 @@ impl TikvStore { pub(crate) async fn write(&self, batch: Batch) -> trc::Result<AssignedIds> { - let start = Instant::now(); - let mut retry_count = 0; + let mut 
account_id = u32::MAX; + let mut collection = u8::MAX; + let mut document_id = u32::MAX; + let mut change_id = u64::MAX; + let mut result = AssignedIds::default(); - loop { - let mut account_id = u32::MAX; - let mut collection = u8::MAX; - let mut document_id = u32::MAX; - let mut change_id = u64::MAX; - let mut result = AssignedIds::default(); - - let mut atomic_adds = vec![]; - let mut batch_mutate = vec![]; - - let mut trx = self.trx_client.begin_optimistic().await.map_err(into_error)?; - - for op in &batch.ops { - match op { - Operation::AccountId { - account_id: account_id_, - } => { - account_id = *account_id_; - } - Operation::Collection { - collection: collection_, - } => { - collection = *collection_; - } - Operation::DocumentId { - document_id: document_id_, - } => { - document_id = *document_id_; - } - Operation::ChangeId { - change_id: change_id_, - } => { - change_id = *change_id_; - } - Operation::Value { class, op } => { - //println!("{:?}", class); - let mut key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - let do_chunk = !class.is_counter(collection); - - match op { - ValueOp::Set(value) => { - let value = value.resolve(&result)?; - if !value.is_empty() && do_chunk { - for (pos, chunk) in value.chunks(MAX_VALUE_SIZE).enumerate() { - match pos.cmp(&1) { - Ordering::Less => {} - Ordering::Equal => { - key.push(0); - } - Ordering::Greater => { - if pos < u8::MAX as usize { - *key.last_mut().unwrap() += 1; - } else { - //trx.rollback().await.map_err(into_error)?; - return Err(trc::StoreEvent::TikvError - .ctx( - trc::Key::Reason, - "Value is too large", - )); - } + // TODO: IDFK man this feels wrong. What if an error happened without calling this? + let mut atomic_subtracts_rollback = vec![]; + let mut batch_mutate = vec![]; + + let mut trx = self.trx_client.begin_optimistic().await.map_err(into_error)?; + + for op in &batch.ops { + match op { + Operation::AccountId { + account_id: account_id_, + } => { + account_id = *account_id_; + } + Operation::Collection { + collection: collection_, + } => { + collection = *collection_; + } + Operation::DocumentId { + document_id: document_id_, + } => { + document_id = *document_id_; + } + Operation::ChangeId { + change_id: change_id_, + } => { + change_id = *change_id_; + } + Operation::Value { class, op } => { + //println!("{:?}", class); + let mut key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + let do_chunk = !class.is_counter(collection); + + match op { + ValueOp::Set(value) => { + let value = value.resolve(&result)?; + if !value.is_empty() && do_chunk { + for (pos, chunk) in value.chunks(MAX_VALUE_SIZE).enumerate() { + match pos.cmp(&1) { + Ordering::Less => {} + Ordering::Equal => { + key.push(0); + } + Ordering::Greater => { + if pos < u8::MAX as usize { + *key.last_mut().unwrap() += 1; + } else { + //trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::TikvError + .ctx( + trc::Key::Reason, + "Value is too large", + )); } } - // TODO: Costly clone - let mutation = Mutation { - op: Op::Put.into(), - key: key.to_vec(), - value: chunk.to_vec(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); } - } else { // TODO: Costly clone let mutation = Mutation { op: Op::Put.into(), key: key.to_vec(), - value: value.to_vec(), + value: chunk.to_vec(), assertion: Assertion::None.into(), }; batch_mutate.push(mutation); - //trx.put(key.clone(), 
value.as_ref()).await.map_err(into_error)?; } - } - ValueOp::AtomicAdd(by) => { - // Duplicating AddAndGet because TiKV has no atomic add - // TODO: Costly clone - let atomic_add_key = key.clone(); - atomic_adds.push(self.atomic_add(atomic_add_key, *by)); - } - ValueOp::AddAndGet(by) => { - // TODO: Costly clone - let num = if let Some(bytes) = - trx.get_for_update(key.clone()).await.map_err(into_error)? - { - deserialize_i64_le(&key, &bytes)? + *by - } else { - *by - }; + } else { // TODO: Costly clone - //trx.put(key.clone(), &num.to_le_bytes()[..]).await.map_err(into_error)?; let mutation = Mutation { op: Op::Put.into(), key: key.to_vec(), - value: num.to_le_bytes().to_vec(), + value: value.to_vec(), assertion: Assertion::None.into(), }; batch_mutate.push(mutation); - result.push_counter_id(num); + //trx.put(key.clone(), value.as_ref()).await.map_err(into_error)?; } - ValueOp::Clear => { - if do_chunk { - let range = BoundRange::new( - // TODO: Costly clone jesus christ - Bound::Included(key.clone().into()), - Bound::Included(KeySerializer::new(key.len() + 1) - .write(key.as_slice()) - .write(u8::MAX) - .finalize().into()), - ); - // TODO: Repeat after reaching max keys - let mut keys = trx.scan_keys(range, u32::MAX).await.map_err(into_error)?; - - - while let Some(key) = keys.next() { - //trx.delete(key).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Del.into(), - key: key.into(), - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - } - } else { - // TODO: Costly clone + } + ValueOp::AtomicAdd(by) => { + // Duplicating AddAndGet because TiKV has no atomic add + // TODO: Costly clone + let atomic_add_key = key.clone(); + self.atomic_add(atomic_add_key.clone(), *by).await?; + atomic_subtracts_rollback.push(self.atomic_subtract(atomic_add_key.clone(), *by)); + } + ValueOp::AddAndGet(by) => { + // TODO: Costly clone + let num = if let Some(bytes) = + trx.get_for_update(key.clone()).await.map_err(into_error)? + { + deserialize_i64_le(&key, &bytes)? 
+ *by + } else { + *by + }; + // TODO: Costly clone + //trx.put(key.clone(), &num.to_le_bytes()[..]).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key: key.to_vec(), + value: num.to_le_bytes().to_vec(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + result.push_counter_id(num); + } + ValueOp::Clear => { + if do_chunk { + let range = BoundRange::new( + // TODO: Costly clone jesus christ + Bound::Included(key.clone().into()), + Bound::Included(KeySerializer::new(key.len() + 1) + .write(key.as_slice()) + .write(u8::MAX) + .finalize().into()), + ); + // TODO: Repeat after reaching max keys + let mut keys = trx.scan_keys(range, u32::MAX).await.map_err(into_error)?; + + + while let Some(key) = keys.next() { //trx.delete(key).await.map_err(into_error)?; let mutation = Mutation { op: Op::Del.into(), @@ -180,185 +168,194 @@ impl TikvStore { }; batch_mutate.push(mutation); } + } else { + // TODO: Costly clone + //trx.delete(key).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Del.into(), + key: key.into(), + value: Default::default(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); } } } - Operation::Index { field, key, set } => { - let key = IndexKey { + } + Operation::Index { field, key, set } => { + let key = IndexKey { + account_id, + collection, + document_id, + field: *field, + key, + } + .serialize(WITH_SUBSPACE); + + if *set { + //trx.put(key, &[]).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Put.into(), + key, + value: vec![], + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + } else { + //trx.delete(key).await.map_err(into_error)?; + let mutation = Mutation { + op: Op::Del.into(), + key, + value: Default::default(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + } + } + Operation::Bitmap { class, set } => { + // Find the next available document id + let assign_id = *set + && matches!(class, BitmapClass::DocumentIds) + && document_id == u32::MAX; + if assign_id { + let begin = BitmapKey { account_id, collection, - document_id, - field: *field, - key, + class: BitmapClass::DocumentIds, + document_id: 0, } .serialize(WITH_SUBSPACE); - - if *set { - //trx.put(key, &[]).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Put.into(), - key, - value: vec![], - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - } else { - //trx.delete(key).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Del.into(), - key, - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - } - } - Operation::Bitmap { class, set } => { - // Find the next available document id - let assign_id = *set - && matches!(class, BitmapClass::DocumentIds) - && document_id == u32::MAX; - if assign_id { - let begin = BitmapKey { - account_id, - collection, - class: BitmapClass::DocumentIds, - document_id: 0, - } - .serialize(WITH_SUBSPACE); - let end = BitmapKey { - account_id, - collection, - class: BitmapClass::DocumentIds, - document_id: u32::MAX, - } - .serialize(WITH_SUBSPACE); - let key_len = begin.len(); - // TODO: Do repeat logic - let mut values = trx.scan_keys((begin, end), MAX_KEYS).await.map_err(into_error)?; - let mut found_ids = RoaringBitmap::new(); - while let Some(key) = values.next() { - if key.len() == key_len { - let key_vec: Vec = key.into(); - found_ids.insert(key_vec.as_slice().deserialize_be_u32(key_len - U32_LEN)?); - } else { - break; - } - 
} - document_id = found_ids.random_available_id(); - result.push_document_id(document_id); - } - - let key = class.serialize( + let end = BitmapKey { account_id, collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - - if *set { - if assign_id { - let keys_iter = trx.scan_keys((key.clone(), class.serialize( - account_id, - collection, - document_id + 1, - WITH_SUBSPACE, - (&result).into(), - )), u32::MAX).await.map_err(into_error)?; - trx.lock_keys(keys_iter).await.map_err(into_error)?; + class: BitmapClass::DocumentIds, + document_id: u32::MAX, + } + .serialize(WITH_SUBSPACE); + let key_len = begin.len(); + // TODO: Do repeat logic + let mut values = trx.scan_keys((begin, end), MAX_KEYS).await.map_err(into_error)?; + let mut found_ids = RoaringBitmap::new(); + while let Some(key) = values.next() { + if key.len() == key_len { + let key_vec: Vec = key.into(); + found_ids.insert(key_vec.as_slice().deserialize_be_u32(key_len - U32_LEN)?); + } else { + break; } - let mutation = Mutation { - op: Op::Put.into(), - key, - value: vec![], - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - //trx.put(key, &[]).await.map_err(into_error)?; - } else { - let mutation = Mutation { - op: Op::Del.into(), - key, - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - //trx.delete(key).await.map_err(into_error)?; } + document_id = found_ids.random_available_id(); + result.push_document_id(document_id); } - Operation::Log { set } => { - let key = LogKey { - account_id, - collection, - change_id, + + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + + if *set { + if assign_id { + let keys_iter = trx.scan_keys((key.clone(), class.serialize( + account_id, + collection, + document_id + 1, + WITH_SUBSPACE, + (&result).into(), + )), u32::MAX).await.map_err(into_error)?; + trx.lock_keys(keys_iter).await.map_err(into_error)?; } - .serialize(WITH_SUBSPACE); let mutation = Mutation { op: Op::Put.into(), key, - value: set.resolve(&result)?.into_owned(), + value: vec![], assertion: Assertion::None.into(), }; batch_mutate.push(mutation); - //trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; - } - Operation::AssertValue { - class, - assert_value, - } => { - let key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - - let matches = match read_chunked_value_transaction(&key, &mut trx).await { - Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_ref()), - Ok(ChunkedValue::Chunked { bytes, .. 
}) => { - assert_value.matches(bytes.as_ref()) - } - Ok(ChunkedValue::None) => assert_value.is_none(), - Err(_) => false, + //trx.put(key, &[]).await.map_err(into_error)?; + } else { + let mutation = Mutation { + op: Op::Del.into(), + key, + value: Default::default(), + assertion: Assertion::None.into(), }; + batch_mutate.push(mutation); + //trx.delete(key).await.map_err(into_error)?; + } + } + Operation::Log { set } => { + let key = LogKey { + account_id, + collection, + change_id, + } + .serialize(WITH_SUBSPACE); + let mutation = Mutation { + op: Op::Put.into(), + key, + value: set.resolve(&result)?.into_owned(), + assertion: Assertion::None.into(), + }; + batch_mutate.push(mutation); + //trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; + } + Operation::AssertValue { + class, + assert_value, + } => { + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); - if !matches { - trx.rollback().await.map_err(into_error)?; - return Err(trc::StoreEvent::AssertValueFailed.into()); + let matches = match read_chunked_value_transaction(&key, &mut trx).await { + Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_ref()), + Ok(ChunkedValue::Chunked { bytes, .. }) => { + assert_value.matches(bytes.as_ref()) } + Ok(ChunkedValue::None) => assert_value.is_none(), + Err(_) => false, + }; + + if !matches { + trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::AssertValueFailed.into()); } } } + } - batch_mutate.reverse(); - trx.batch_mutate(batch_mutate).await.map_err(into_error)?; - - if self - .commit( - trx, - retry_count < MAX_COMMIT_ATTEMPTS && start.elapsed() < MAX_COMMIT_TIME, - ) - .await? - { - for fut in atomic_adds { - fut.await?; - } - return Ok(result); - } else { - let backoff = rand::thread_rng().gen_range(50..=300); - tokio::time::sleep(Duration::from_millis(backoff)).await; - retry_count += 1; + batch_mutate.reverse(); + trx.batch_mutate(batch_mutate).await.map_err(into_error)?; + + // Already handles retry logic through retry and backoff + if let Err(e) = trx.commit().await { + // the committer should have done the repeats and still failed + trx.rollback().await.map_err(into_error)?; + for fut in atomic_subtracts_rollback { + fut.await?; } + return Err(into_error(e)); } + // Success, we don't care about the timestamp for now, but it's in to do + return Ok(result); } pub(crate) async fn commit(&self, mut trx: Transaction, will_retry: bool) -> trc::Result { match trx.commit().await { Ok(result) => { - let commit_timestamp = result.ok_or_else(|| trc::StoreEvent::FoundationdbError - .reason("couldn't get commit timestamp".to_string()))?; + let Some(commit_timestamp) = result else { + // There was nothing to commit? 
+ // .ok_or_else(|| trc::StoreEvent::TikvError + // .reason("couldn't get commit timestamp".to_string())) + return Ok(true); + }; let mut version = self.version.lock(); // I hate this if commit_timestamp.version() > version.version.version() { @@ -386,16 +383,18 @@ impl TikvStore { loop { let mut key_count = 0; - let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + + let mut trx = self.trx_client + .begin_with_options(self.write_trx_options.clone()) + .await + .map_err(into_error)?; + let mut keys = trx.scan_keys((from_key.clone(), to_key.clone()), CHUNK_LIMIT).await.map_err(into_error)?; for key in keys { key_count += 1; trx.delete(key).await.map_err(into_error)?; } - if key_count == 0 { - trx.rollback().await.map_err(into_error)?; - } - // TODO: Retry on error + self.commit(trx, false).await?; if key_count != CHUNK_LIMIT { break; @@ -410,7 +409,11 @@ impl TikvStore { let from = from.serialize(WITH_SUBSPACE); let to = to.serialize(WITH_SUBSPACE); - let mut trx = self.trx_client.begin_pessimistic().await.map_err(into_error)?; + let mut trx = self.trx_client + .begin_with_options(self.write_trx_options.clone()) + .await + .map_err(into_error)?; + // Have to manually get the range first; // TODO: Chunked key scans and locks let mut keys = trx.scan_keys((from, to), MAX_KEYS).await.map_err(into_error)?; @@ -425,35 +428,141 @@ impl TikvStore { } pub(crate) async fn atomic_compare_and_clear(&self, key: Vec, by: &[u8]) -> trc::Result { - let Some(value) = self.raw_client.get(key.clone()).await.map_err(into_error)? else { - // Nothing to compare as there is nothing to clear. - return Ok(false) - }; - - return if by == value.as_slice() { - self.raw_client.delete(key).await.map_err(into_error)?; - Ok(true) - } else { - Ok(false) + // Raw clients do not have retry logic + // TODO: Unpyramid of Doom + let mut backoff = self.raw_backoff.clone(); + loop { + let value = match self.raw_client.get(key.clone()).await { + Ok(value_opt) => { + if let Some(value) = value_opt { + value + } else { + // Nothing to compare as there is nothing to clear. + return Ok(false) + } + } + Err(e) => { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(into_error(e)); + } + } + }; + + return if by == value.as_slice() { + match self.raw_client.delete(key.clone()).await { + Ok(_) => Ok(true), + Err(e) => { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(into_error(e)); + } + } + } + } else { + Ok(false) + } } } pub(crate) async fn atomic_add(&self, key: Vec, by: i64) -> trc::Result> { - let maybe_set_value = self.raw_client.get(key.to_vec()).await.map_err(into_error)?; - - let sum = match &maybe_set_value { - None => by, - Some(original) => deserialize_i64_le(key.as_slice(), original.as_slice())? 
+ by - }; - let (_previous, swapped) = self.raw_client - .compare_and_swap(key.to_vec(), maybe_set_value, sum.to_le_bytes().to_vec()) - .await - .map_err(into_error)?; + // Raw clients do not have retry logic + // TODO: Unpyramid of Doom + let mut backoff = self.raw_backoff.clone(); + loop { + let maybe_set_value = match self.raw_client.get(key.clone()).await { + Ok(value_opt) => value_opt, + Err(e) => { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(into_error(e)); + } + } + }; + + let sum = match &maybe_set_value { + None => Wrapping(by), + Some(original) => Wrapping(deserialize_i64_le(key.as_slice(), original.as_slice())?) + Wrapping(by) + }; + let (_previous, swapped) = match self.raw_client + .compare_and_swap(key.to_vec(), maybe_set_value, sum.0.to_le_bytes().to_vec()) + .await { + Ok(result) => result, + Err(e) => { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(into_error(e)); + } + } + }; + + return if swapped { + Ok(Some(sum.0)) + } else { + // TODO: Possible logic error but my eyes hurt already + Ok(None) + } + } + } + + pub(crate) async fn atomic_subtract(&self, key: Vec, minuend: i64) -> trc::Result> { + // Raw clients do not have retry logic + // TODO: Unpyramid of Doom + let mut backoff = self.raw_backoff.clone(); + loop { + let value = match self.raw_client.get(key.clone()).await { + Ok(value_opt) => value_opt.ok_or_else(|| { + trc::StoreEvent::TikvError + .reason("cannot do an atomic subtract on unset key-value") + })?, + Err(e) => { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(into_error(e)); + } + } + }; + + let subtrahend = Wrapping(deserialize_i64_le(key.as_slice(), value.as_slice())?); + + let difference = subtrahend - Wrapping(minuend); + + let (_previous, swapped) = match self.raw_client + .compare_and_swap(key.to_vec(), Some(subtrahend.0.to_le_bytes().to_vec()), difference.0.to_le_bytes().to_vec()) + .await { + Ok(result) => result, + Err(e) => { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(into_error(e)); + } + } + }; - return if swapped { - Ok(Some(sum)) - } else { - Ok(None) + return if swapped { + Ok(Some(difference.0)) + } else { + if let Some(wait) = backoff.next_delay_duration() { + tokio::time::sleep(wait).await; + continue; + } else { + return Err(trc::StoreEvent::TikvError + .reason("failed to subtract")); + //.ctx(key.clone().into(), minuend)); + } + } } } } From 79cc31bfb525b90a64ab9d28491a3133296c656d Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Mon, 12 Aug 2024 08:41:31 +1000 Subject: [PATCH 06/13] Add TiKV: Save work on rewrite --- .../store/src/backend/foundationdb/write.rs | 1 + crates/store/src/backend/tikv/blob.rs | 138 +-- crates/store/src/backend/tikv/main.rs | 4 +- crates/store/src/backend/tikv/mod.rs | 56 +- crates/store/src/backend/tikv/read.rs | 296 +++--- crates/store/src/backend/tikv/write.rs | 925 +++++++++--------- 6 files changed, 651 insertions(+), 769 deletions(-) diff --git a/crates/store/src/backend/foundationdb/write.rs b/crates/store/src/backend/foundationdb/write.rs index 0eb55082c..11a1c04f2 100644 --- a/crates/store/src/backend/foundationdb/write.rs +++ b/crates/store/src/backend/foundationdb/write.rs @@ -109,6 +109,7 @@ impl FdbStore { } } ValueOp::AtomicAdd(by) => { + println!("fdb atomic add key: {:?} val: {:?} 
", key, by.to_le_bytes()); trx.atomic_op(&key, &by.to_le_bytes()[..], MutationType::Add); } ValueOp::AddAndGet(by) => { diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 2cbac8978..875c08815 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -8,7 +8,7 @@ use std::ops::Range; use utils::BLOB_HASH_LEN; use crate::SUBSPACE_BLOBS; use crate::write::key::KeySerializer; -use super::{into_error, MAX_KEYS, MAX_KV_PAIRS, MAX_VALUE_SIZE, TikvStore}; +use super::{into_error, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; impl TikvStore { pub(crate) async fn get_blob( @@ -16,144 +16,14 @@ impl TikvStore { key: &[u8], range: Range, ) -> trc::Result>> { - let block_start = range.start / MAX_VALUE_SIZE; - let bytes_start = range.start % MAX_VALUE_SIZE; - let block_end = (range.end / MAX_VALUE_SIZE) + 1; - - let begin = KeySerializer::new(key.len() + 3) - .write(SUBSPACE_BLOBS) - .write(key) - .write(block_start as u16) - .finalize(); - let end = KeySerializer::new(key.len() + 3) - .write(SUBSPACE_BLOBS) - .write(key) - .write(block_end as u16) - .finalize(); - let key_len = begin.len(); - let mut trx = self.snapshot_trx().await?; - // TODO: Create repeat logic for over max - let mut values = trx.scan((begin, end), u32::MAX).await.map_err(into_error)?; - let mut blob_data: Option> = None; - let blob_range = range.end - range.start; - - 'outer: while let Some(kv_pair) = values.next() { - let key = kv_pair.0; - if key.len() == key_len { - let value = kv_pair.1; - if let Some(blob_data) = &mut blob_data { - blob_data.extend_from_slice( - value - .get( - ..std::cmp::min( - blob_range.saturating_sub(blob_data.len()), - value.len(), - ), - ) - .unwrap_or(&[]), - ); - if blob_data.len() == blob_range { - break 'outer; - } - } else { - let blob_size = if blob_range <= (5 * (1 << 20)) { - blob_range - } else if value.len() == MAX_VALUE_SIZE { - MAX_VALUE_SIZE * 2 - } else { - value.len() - }; - let mut blob_data_ = Vec::with_capacity(blob_size); - blob_data_.extend_from_slice( - value - .get(bytes_start..std::cmp::min(bytes_start + blob_range, value.len())) - .unwrap_or(&[]), - ); - if blob_data_.len() == blob_range { - return Ok(Some(blob_data_)); - } - blob_data = blob_data_.into(); - } - } - } - - Ok(blob_data) + todo!() } pub(crate) async fn put_blob(&self, key: &[u8], data: &[u8]) -> trc::Result<()> { - const N_CHUNKS: usize = (1 << 5) - 1; - let last_chunk = std::cmp::max( - (data.len() / MAX_VALUE_SIZE) - + if data.len() % MAX_VALUE_SIZE > 0 { - 1 - } else { - 0 - }, - 1, - ) - 1; - let mut trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) - .await - .map_err(into_error)?; - - for (chunk_pos, chunk_bytes) in data.chunks(MAX_VALUE_SIZE).enumerate() { - trx.put( - KeySerializer::new(key.len() + 3) - .write(SUBSPACE_BLOBS) - .write(key) - .write(chunk_pos as u16) - .finalize(), - chunk_bytes, - ).await.map_err(into_error)?; - if chunk_pos == last_chunk || (chunk_pos > 0 && chunk_pos % N_CHUNKS == 0) { - self.commit(trx, false).await?; - if chunk_pos < last_chunk { - trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) - .await - .map_err(into_error)?; - } else { - break; - } - } - } - - Ok(()) + todo!() } pub(crate) async fn delete_blob(&self, key: &[u8]) -> trc::Result { - if key.len() < BLOB_HASH_LEN { - return Ok(false); - } - // Shouldn't grab millions of keys anyways but - // TODO: Optimise - let mut trx = self.trx_client - 
.begin_with_options(self.write_trx_options.clone())
-            .await
-            .map_err(into_error)?;
-
-        let begin = KeySerializer::new(key.len() + 3)
-            .write(SUBSPACE_BLOBS)
-            .write(key)
-            .write(0u16)
-            .finalize();
-        let end = KeySerializer::new(key.len() + 3)
-            .write(SUBSPACE_BLOBS)
-            .write(key)
-            .write(u16::MAX)
-            .finalize();
-        let keys = trx.scan_keys((begin, end), MAX_KEYS).await.map_err(into_error)?;
-
-        let mut key_count = 0;
-        for key in keys {
-            key_count += 1;
-            trx.delete(key).await.map_err(into_error)?;
-        }
-        if key_count == 0 {
-            trx.rollback().await.map_err(into_error)?;
-            return Ok(false);
-        }
-
-        self.commit(trx, false).await
+        todo!()
     }
 }
\ No newline at end of file
diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs
index b2f97d5b8..13ea97ae6 100644
--- a/crates/store/src/backend/tikv/main.rs
+++ b/crates/store/src/backend/tikv/main.rs
@@ -86,8 +86,6 @@ impl TikvStore {
             )
         };
 
-        let raw_client_retries = backoff.is_none().then(|| 0).unwrap_or_else(|| max_attempts);
-
         let write_trx_options = TransactionOptions::new_pessimistic()
             .drop_check(CheckLevel::Warn)
             .retry_options(RetryOptions::new(backoff.clone(), backoff.clone()));
@@ -99,6 +97,8 @@ impl TikvStore {
             write_trx_options,
             raw_client,
             raw_backoff,
+            api_v2: false,
+            keyspace: [0, 0, b's'], // Temporary
             version: Default::default(),
         };
 
diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs
index c51d04bd6..bfa115a03 100644
--- a/crates/store/src/backend/tikv/mod.rs
+++ b/crates/store/src/backend/tikv/mod.rs
@@ -5,10 +5,11 @@
  */
 
 use std::time::{Duration, Instant};
-use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key, Timestamp, RawClient, TransactionOptions, Backoff};
+use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key, Timestamp, RawClient, TransactionOptions, Backoff, KvPair, BoundRange};
 use tikv_client::proto::kvrpcpb;
 use tikv_client::proto::kvrpcpb::Mutation;
 use crate::write::{AssignedIds, ValueOp};
+use crate::write::key::KeySerializer;
 
 pub mod blob;
 pub mod main;
@@ -18,10 +19,21 @@ pub mod write;
 
 // https://github.com/tikv/tikv/issues/7272#issuecomment-604841372
-const MAX_KEY_SIZE: usize = 4 * 1024;
-const MAX_VALUE_SIZE: usize = 100000;
-const MAX_KEYS: u32 = 100000;
-const MAX_KV_PAIRS: u32 = 50000;
+// Default limit is 4194304 bytes
+const MAX_KEY_SIZE: u32 = 4 * 1024;
+// The default gRPC message limit is 4194304 bytes; use half of that
+// (2097152 bytes) as a safe base for the scan-size calculations below.
+const MAX_GRPC_MESSAGE_SIZE: u32 = 2097152;
+const MAX_ASSUMED_KEY_SIZE: u32 = 256;
+const MAX_VALUE_SIZE: u32 = 131072;
+const MAX_SCAN_KEYS_SIZE: u32 = MAX_GRPC_MESSAGE_SIZE / MAX_ASSUMED_KEY_SIZE; // 8192
+const MAX_SCAN_VALUES_SIZE: u32 = MAX_GRPC_MESSAGE_SIZE / MAX_VALUE_SIZE; // 16
+
+// Preparation for API v2
+// RFC: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md
+const MODE_PREFIX_TXN_KV: u8 = b'x';
+const MODE_PREFIX_RAW_KV: u8 = b'r'; // Raw-KV keys use 'r'; 'x' is the txn-KV prefix.
+
 pub const TRANSACTION_EXPIRY: Duration = Duration::from_secs(1);
 pub const TRANSACTION_TIMEOUT: Duration = Duration::from_secs(4);
 
@@ -31,24 +43,30 @@ pub struct TikvStore {
     write_trx_options: TransactionOptions,
     raw_client: RawClient,
     raw_backoff: Backoff,
+    api_v2: bool,
+    keyspace: [u8; 3], // The keyspace is a fixed-length, 3-byte identifier in network byte order.
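+    // Key layout sketch when `api_v2` is false (an assumption based on the
+    // RFC linked above, not a guarantee of the wire format):
+    //   [mode prefix (b'x' txn / b'r' raw), keyspace[0..3], user key...]
+    // `new_key_serializer` prepends this prefix and `remove_prefix` strips the
+    // leading four bytes again, e.g. (hypothetical round trip):
+    //   let key = store.new_key_serializer(3, false).write(&b"abc"[..]).finalize();
+    //   // key == [b'x', 0, 0, b's', b'a', b'b', b'c'] with the temporary keyspace
+    //   assert_eq!(store.remove_prefix(&key), b"abc");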
version: parking_lot::Mutex, } -// TODO: Remove -pub(crate) enum ReadTransaction<'db> { - Transaction(&'db mut Transaction), - Snapshot(&'db mut Snapshot) -} +impl TikvStore { + fn new_key_serializer(&self, capacity: usize, raw: bool) -> KeySerializer { + if self.api_v2 { + // We don't care about compatibility anymore + KeySerializer::new(capacity) + } else { + let mode_prefix = raw.then(|| MODE_PREFIX_RAW_KV).unwrap_or_else(|| MODE_PREFIX_TXN_KV); + // Capacity = mode_prefix length + keyspace length + capacity + KeySerializer::new(1 + 3 + capacity) + .write(mode_prefix) + .write(self.keyspace.as_slice()) + } + } -impl<'a> ReadTransaction<'a> { - pub(crate) async fn get(&'a mut self, key: impl Into) -> trc::Result> { - match self { - ReadTransaction::Transaction(trx) => { - trx.get(key).await.map_err(into_error) - } - ReadTransaction::Snapshot(ss) => { - ss.get(key).await.map_err(into_error) - } + fn remove_prefix<'a>(&self, key: &'a [u8]) -> &'a [u8] { + if self.api_v2 { + key + } else { + &key[4..] } } } diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs index 10afa39ae..b6bdb242a 100644 --- a/crates/store/src/backend/tikv/read.rs +++ b/crates/store/src/backend/tikv/read.rs @@ -3,8 +3,8 @@ * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ - -use tikv_client::{Key as TikvKey, Snapshot, Transaction, TransactionOptions, Value}; +use std::ops::Bound; +use tikv_client::{BoundRange, Key as TikvKey, KvPair, Snapshot, Transaction, TransactionOptions, Value}; use futures::TryStreamExt; use roaring::RoaringBitmap; use crate::{ @@ -16,7 +16,7 @@ use crate::{ BitmapKey, Deserialize, IterateParams, Key, ValueKey, U32_LEN, WITH_SUBSPACE, }; -use super::{into_error, MAX_KEYS, MAX_KV_PAIRS, MAX_VALUE_SIZE, ReadTransaction, TikvStore}; +use super::{into_error, MAX_KEY_SIZE, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; #[allow(dead_code)] pub(crate) enum ChunkedValue { @@ -30,45 +30,14 @@ impl TikvStore { where U: Deserialize, { - let key = key.serialize(WITH_SUBSPACE); - let mut ss = self.snapshot_trx().await?; - - match read_chunked_value_snapshot(&key, &mut ss).await? { - ChunkedValue::Single(bytes) => { - U::deserialize(&bytes).map(Some) - }, - ChunkedValue::Chunked { bytes, .. } => { - U::deserialize(&bytes).map(Some) }, - ChunkedValue::None => { - Ok(None) - }, - } + todo!() } pub(crate) async fn get_bitmap( &self, mut key: BitmapKey>, ) -> trc::Result> { - let mut bm = RoaringBitmap::new(); - let begin = key.serialize(WITH_SUBSPACE); - key.document_id = u32::MAX; - let end = key.serialize(WITH_SUBSPACE); - let key_len = begin.len(); - // Maybe use transaction client? 
- let mut trx = self.snapshot_trx().await?; - let mut keys = trx.scan_keys( - (begin, end), - MAX_KEYS - ).await.map_err(into_error)?; - - for key in keys { - let key: Vec = key.into(); - if key.len() == key_len { - bm.insert(key.as_slice().deserialize_be_u32(key.len() - U32_LEN)?); - } - } - - Ok(if !bm.is_empty() { Some(bm) } else { None }) + todo!() } pub(crate) async fn iterate( @@ -76,44 +45,7 @@ impl TikvStore { params: IterateParams, mut cb: impl for<'x> FnMut(&'x [u8], &'x [u8]) -> trc::Result + Sync + Send, ) -> trc::Result<()> { - let mut begin: TikvKey = params.begin.serialize(WITH_SUBSPACE).into(); - let end: TikvKey = params.end.serialize(WITH_SUBSPACE).into(); - - let mut trx = self.snapshot_trx().await?; - if !params.first { - // TODO: Limit by max_keys - if params.ascending { - let mut values = trx.scan((begin, end), u32::MAX).await.map_err(into_error)?; - while let Some(kv_pair) = values.next() { - let key: &[u8] = kv_pair.key().into(); - let value: &[u8] = kv_pair.value().as_slice(); - cb(key.get(1..).unwrap_or_default(), value)?; - } - } else { - let mut values = trx.scan_reverse((begin, end), u32::MAX).await.map_err(into_error)?; - while let Some(kv_pair) = values.next() { - let mut last_key = &[] as &[u8]; - let key: &[u8] = kv_pair.key().into(); - let value: &[u8] = kv_pair.value().as_slice(); - cb(key.get(1..).unwrap_or_default(), value)?; - } - }; - - } else { - let mut values = trx - .scan((begin, end), 1) - .await - .map_err(into_error)?; - - if let Some(kv_pair) = values.next() { - let key: &[u8] = kv_pair.key().into(); - let value: &[u8] = kv_pair.key().into(); - - cb(key.get(1..).unwrap_or_default(), value)?; - } - } - - Ok(()) + todo!() } pub(crate) async fn get_counter( @@ -121,9 +53,10 @@ impl TikvStore { key: impl Into>> + Sync + Send, ) -> trc::Result { let key = key.into().serialize(WITH_SUBSPACE); - // TODO: Expensive clone + if let Some(bytes) = self - .raw_client + .snapshot_trx() + .await? .get(key.clone()) .await .map_err(into_error)? @@ -136,84 +69,165 @@ impl TikvStore { pub(crate) async fn read_trx(&self) -> trc::Result { self.trx_client - .begin_optimistic() + .begin_pessimistic() .await .map_err(into_error) } pub(crate) async fn snapshot_trx(&self) -> trc::Result { - let timestamp = self.trx_client - .current_timestamp() - .await - .map_err(into_error)?; + let read_trx = self.read_trx().await?; - Ok(self.trx_client.snapshot(timestamp, TransactionOptions::new_optimistic())) + Ok(Snapshot::new(read_trx)) } -} -// TODO: Figure out a way to deduplicate the code -pub(crate) async fn read_chunked_value_snapshot( - key: &[u8], - ss: &mut Snapshot -) -> trc::Result { - // TODO: Costly, redo - if let Some(bytes) = ss.get(key.to_vec()).await.map_err(into_error)? { - if bytes.len() < MAX_VALUE_SIZE { - Ok(ChunkedValue::Single(bytes)) - } else { - let mut value = Vec::with_capacity(bytes.len() * 2); - value.extend_from_slice(&bytes); - let mut key = KeySerializer::new(key.len() + 1) - .write(key) - .write(0u8) - .finalize(); - - // TODO: Costly, redo - while let Some(bytes) = ss.get(key.to_vec()).await.map_err(into_error)? { - value.extend_from_slice(&bytes); - *key.last_mut().unwrap() += 1; - } + pub(super) async fn read_chunked_value( + &self, + key: &[u8], + trx: &mut ReadTrx + ) -> trc::Result { + if let Some(mut bytes) = trx.get(key.to_vec()).await? 
{ + if bytes.len() < MAX_VALUE_SIZE as usize { + Ok(ChunkedValue::Single(bytes)) + } else { + let mut value = Vec::with_capacity(bytes.len() * 2); + value.append(&mut bytes); + let mut n_chunks = 1; + + let mut first = Bound::Included(TikvKey::from(self.new_key_serializer(key.len() + 1, false) + .write(key) + .write(0u8) + .finalize())); + + 'outer: loop { + // Maybe use the last byte of the last key? + let mut count = 0; + + let last = Bound::Included(TikvKey::from(self.new_key_serializer(key.len() + 1, false) + .write(key) + .write(u8::MAX) + .finalize())); + + let bound_range = BoundRange::new(first, last); + + let mut kv_pair_iter = trx.scan(bound_range, MAX_SCAN_VALUES_SIZE) + .await? + .peekable(); + + while let Some(kv_pair) = kv_pair_iter.next() { + let (key, mut kv_value) = kv_pair.into(); + value.append(&mut kv_value); + count += 1; + if kv_pair_iter.peek().is_none() { + n_chunks += count; + if count < MAX_KEY_SIZE { + break 'outer; + } + first = Bound::Excluded(key); + continue 'outer; + } + } + + // Empty + break; + } - Ok(ChunkedValue::Chunked { - bytes: value, - n_chunks: *key.last().unwrap(), - }) + Ok(ChunkedValue::Chunked { + bytes: value, + n_chunks: *key.last().unwrap(), + }) + } + } else { + Ok(ChunkedValue::None) } - } else { - Ok(ChunkedValue::None) } + } -// TODO: Figure out a way to deduplicate the code -pub(crate) async fn read_chunked_value_transaction( - key: &[u8], - trx: &mut Transaction -) -> trc::Result { - // TODO: Costly, redo - if let Some(bytes) = trx.get(key.to_vec()).await.map_err(into_error)? { - if bytes.len() < MAX_VALUE_SIZE { - Ok(ChunkedValue::Single(bytes)) - } else { - let mut value = Vec::with_capacity(bytes.len() * 2); - value.extend_from_slice(&bytes); - let mut key = KeySerializer::new(key.len() + 1) - .write(key) - .write(0u8) - .finalize(); - - - // TODO: Costly, redo - while let Some(bytes) = trx.get(key.clone()).await.map_err(into_error)? 
{ - value.extend_from_slice(&bytes); - *key.last_mut().unwrap() += 1; - } +pub(crate) trait ReadTransaction { + async fn get(&mut self, key: impl Into) -> trc::Result>; + async fn key_exists(&mut self, key: impl Into) -> trc::Result; + async fn batch_get( + &mut self, + keys: impl IntoIterator> + ) -> trc::Result>; + async fn scan( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + async fn scan_keys( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + async fn scan_reverse( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + async fn scan_keys_reverse( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; +} - Ok(ChunkedValue::Chunked { - bytes: value, - n_chunks: *key.last().unwrap(), - }) - } - } else { - Ok(ChunkedValue::None) +impl ReadTransaction for Transaction { + async fn get(&mut self, key: impl Into) -> trc::Result> { + self.get(key).await.map_err(into_error) + } + + async fn key_exists(&mut self, key: impl Into) -> trc::Result { + self.key_exists(key).await.map_err(into_error) + } + + async fn batch_get(&mut self, keys: impl IntoIterator>) -> trc::Result> { + self.batch_get(keys).await.map_err(into_error) + } + + async fn scan(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan(range, limit).await.map_err(into_error) + } + + async fn scan_keys(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys(range, limit).await.map_err(into_error) + } + + async fn scan_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_reverse(range, limit).await.map_err(into_error) + } + + async fn scan_keys_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys_reverse(range, limit).await.map_err(into_error) + } +} + +impl ReadTransaction for Snapshot { + async fn get(&mut self, key: impl Into) -> trc::Result> { + self.get(key).await.map_err(into_error) + } + + async fn key_exists(&mut self, key: impl Into) -> trc::Result { + self.key_exists(key).await.map_err(into_error) + } + + async fn batch_get(&mut self, keys: impl IntoIterator>) -> trc::Result> { + self.batch_get(keys).await.map_err(into_error) + } + + async fn scan(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan(range, limit).await.map_err(into_error) + } + + async fn scan_keys(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys(range, limit).await.map_err(into_error) + } + + async fn scan_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_reverse(range, limit).await.map_err(into_error) + } + + async fn scan_keys_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys_reverse(range, limit).await.map_err(into_error) } } \ No newline at end of file diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index 4c1ed4487..35d71fa87 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -4,15 +4,12 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use std::{ - cmp::Ordering, - time::{Duration, Instant}, -}; +use std::{cmp::Ordering, iter, time::{Duration, Instant}}; use std::collections::Bound; -use std::num::Wrapping; -use tikv_client::{Backoff, BoundRange, Key as TikvKey, TimestampExt, Transaction, Value}; +use tikv_client::{Backoff, BoundRange, CheckLevel, Key as TikvKey, RetryOptions, TimestampExt, Transaction, Value}; use rand::Rng; use roaring::RoaringBitmap; +use 
tikv_client::TransactionOptions; use tikv_client::proto::kvrpcpb::{Assertion, Mutation, Op}; use crate::{ backend::deserialize_i64_le, @@ -24,7 +21,7 @@ use crate::{ BitmapKey, IndexKey, Key, LogKey, SUBSPACE_COUNTER, SUBSPACE_QUOTA, U32_LEN, WITH_SUBSPACE, }; use crate::write::key; -use super::{into_error, read::{read_chunked_value_transaction, ChunkedValue}, TikvStore, ReadVersion, MAX_VALUE_SIZE, MAX_KEYS, ReadTransaction}; +use super::{into_error, read::{ChunkedValue}, TikvStore, ReadVersion, MAX_VALUE_SIZE, MAX_SCAN_KEYS_SIZE}; impl TikvStore { pub(crate) async fn write(&self, batch: Batch) -> trc::Result { @@ -32,373 +29,373 @@ impl TikvStore { let mut collection = u8::MAX; let mut document_id = u32::MAX; let mut change_id = u64::MAX; - let mut result = AssignedIds::default(); - // TODO: IDFK man this feels wrong. What if an error happened without calling this? - let mut atomic_subtracts_rollback = vec![]; - let mut batch_mutate = vec![]; + let mut backoff = self.raw_backoff.clone(); - let mut trx = self.trx_client.begin_optimistic().await.map_err(into_error)?; + loop { + let mut result = AssignedIds::default(); - for op in &batch.ops { - match op { - Operation::AccountId { - account_id: account_id_, - } => { - account_id = *account_id_; - } - Operation::Collection { - collection: collection_, - } => { - collection = *collection_; - } - Operation::DocumentId { - document_id: document_id_, - } => { - document_id = *document_id_; - } - Operation::ChangeId { - change_id: change_id_, - } => { - change_id = *change_id_; - } - Operation::Value { class, op } => { - //println!("{:?}", class); - let mut key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - let do_chunk = !class.is_counter(collection); - - match op { - ValueOp::Set(value) => { - let value = value.resolve(&result)?; - if !value.is_empty() && do_chunk { - for (pos, chunk) in value.chunks(MAX_VALUE_SIZE).enumerate() { - match pos.cmp(&1) { - Ordering::Less => {} - Ordering::Equal => { - key.push(0); - } - Ordering::Greater => { - if pos < u8::MAX as usize { - *key.last_mut().unwrap() += 1; - } else { - //trx.rollback().await.map_err(into_error)?; - return Err(trc::StoreEvent::TikvError - .ctx( - trc::Key::Reason, - "Value is too large", - )); + let mut trx = self.write_trx_no_backoff().await?; + + for op in &batch.ops { + match op { + Operation::AccountId { + account_id: account_id_, + } => { + account_id = *account_id_; + } + Operation::Collection { + collection: collection_, + } => { + collection = *collection_; + } + Operation::DocumentId { + document_id: document_id_, + } => { + document_id = *document_id_; + } + Operation::ChangeId { + change_id: change_id_, + } => { + change_id = *change_id_; + } + Operation::Value { class, op } => { + let mut key_vec = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + let mut key = self.new_key_serializer(key_vec.len(), false) + .write(key_vec.as_slice()) + .finalize(); + let do_chunk = !class.is_counter(collection); + + match op { + ValueOp::Set(value) => { + let value = value.resolve(&result)?; + + if !value.is_empty() && do_chunk { + for (pos, chunk) in value.chunks(MAX_VALUE_SIZE as usize).enumerate() { + match pos.cmp(&1) { + Ordering::Less => {} + Ordering::Equal => { + key.push(0); + } + Ordering::Greater => { + if pos < u8::MAX as usize { + *key.last_mut().unwrap() += 1; + } else { + trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::TikvError + .ctx( 
+ trc::Key::Reason, + "Value is too large", + )); + } } } + trx.put(key.clone(), chunk).await.map_err(into_error)?; } - // TODO: Costly clone - let mutation = Mutation { - op: Op::Put.into(), - key: key.to_vec(), - value: chunk.to_vec(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); + } else { + trx.put(key, value.into_owned()).await.map_err(into_error)?; } - } else { - // TODO: Costly clone - let mutation = Mutation { - op: Op::Put.into(), - key: key.to_vec(), - value: value.to_vec(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - //trx.put(key.clone(), value.as_ref()).await.map_err(into_error)?; } - } - ValueOp::AtomicAdd(by) => { - // Duplicating AddAndGet because TiKV has no atomic add - // TODO: Costly clone - let atomic_add_key = key.clone(); - self.atomic_add(atomic_add_key.clone(), *by).await?; - atomic_subtracts_rollback.push(self.atomic_subtract(atomic_add_key.clone(), *by)); - } - ValueOp::AddAndGet(by) => { - // TODO: Costly clone - let num = if let Some(bytes) = - trx.get_for_update(key.clone()).await.map_err(into_error)? - { - deserialize_i64_le(&key, &bytes)? + *by - } else { - *by - }; - // TODO: Costly clone - //trx.put(key.clone(), &num.to_le_bytes()[..]).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Put.into(), - key: key.to_vec(), - value: num.to_le_bytes().to_vec(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - result.push_counter_id(num); - } - ValueOp::Clear => { - if do_chunk { - let range = BoundRange::new( - // TODO: Costly clone jesus christ - Bound::Included(key.clone().into()), - Bound::Included(KeySerializer::new(key.len() + 1) + ValueOp::AtomicAdd(by) => { + get_and_add(&mut trx, key, *by).await?; + } + ValueOp::AddAndGet(by) => { + let num = get_and_add(&mut trx, key, *by).await?; + result.push_counter_id(num); + } + ValueOp::Clear => { + if do_chunk { + let end_vec = self.new_key_serializer(key.len() + 1, false) .write(key.as_slice()) .write(u8::MAX) - .finalize().into()), - ); - // TODO: Repeat after reaching max keys - let mut keys = trx.scan_keys(range, u32::MAX).await.map_err(into_error)?; - - - while let Some(key) = keys.next() { - //trx.delete(key).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Del.into(), - key: key.into(), - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); + .finalize(); + let mut begin = Bound::Included(TikvKey::from(key)); + let end = Bound::Included(TikvKey::from(end_vec)); + + 'outer: loop { + let range = BoundRange::new(begin, end.clone()); + let mut keys_iter = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)? 
+ .peekable(); + + let mut count = 0; + while let Some(key) = keys_iter.next() { + count += 1; + if keys_iter.peek().is_none() { + if count < MAX_SCAN_KEYS_SIZE { + trx.delete(key).await.map_err(into_error)?; + break 'outer; + } else { + begin = Bound::Excluded(key.clone()); + trx.delete(key).await.map_err(into_error)?; + continue 'outer; + } + } else { + trx.delete(key).await.map_err(into_error)?; + } + } + + // Empty + break; + } + + } else { + trx.delete(key).await.map_err(into_error)?; } - } else { - // TODO: Costly clone - //trx.delete(key).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Del.into(), - key: key.into(), - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); } } } - } - Operation::Index { field, key, set } => { - let key = IndexKey { - account_id, - collection, - document_id, - field: *field, - key, - } - .serialize(WITH_SUBSPACE); - - if *set { - //trx.put(key, &[]).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Put.into(), - key, - value: vec![], - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - } else { - //trx.delete(key).await.map_err(into_error)?; - let mutation = Mutation { - op: Op::Del.into(), - key, - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - } - } - Operation::Bitmap { class, set } => { - // Find the next available document id - let assign_id = *set - && matches!(class, BitmapClass::DocumentIds) - && document_id == u32::MAX; - if assign_id { - let begin = BitmapKey { - account_id, - collection, - class: BitmapClass::DocumentIds, - document_id: 0, - } - .serialize(WITH_SUBSPACE); - let end = BitmapKey { + Operation::Index { field, key, set } => { + let key_vec = IndexKey { account_id, collection, - class: BitmapClass::DocumentIds, - document_id: u32::MAX, + document_id, + field: *field, + key, + }.serialize(0); + let key = self.new_key_serializer(key_vec.len(), false) + .write(key_vec.as_slice()) + .finalize(); + + if *set { + trx.put(key, &[]).await.map_err(into_error)?; + } else { + trx.delete(key).await.map_err(into_error)?; } - .serialize(WITH_SUBSPACE); - let key_len = begin.len(); - // TODO: Do repeat logic - let mut values = trx.scan_keys((begin, end), MAX_KEYS).await.map_err(into_error)?; - let mut found_ids = RoaringBitmap::new(); - while let Some(key) = values.next() { - if key.len() == key_len { - let key_vec: Vec = key.into(); - found_ids.insert(key_vec.as_slice().deserialize_be_u32(key_len - U32_LEN)?); - } else { + } + Operation::Bitmap { class, set } => { + let assign_id = *set + && matches!(class, BitmapClass::DocumentIds) + && document_id == u32::MAX; + + if assign_id { + let begin_vec = BitmapKey { + account_id, + collection, + class: BitmapClass::DocumentIds, + document_id: 0, + }.serialize(WITH_SUBSPACE); + let begin = self.new_key_serializer(begin_vec.len(), false) + .write(begin_vec.as_slice()) + .finalize(); + let end_vec = BitmapKey { + account_id, + collection, + class: BitmapClass::DocumentIds, + document_id: u32::MAX, + }.serialize(WITH_SUBSPACE); + let end = self.new_key_serializer(end_vec.len(), false) + .write(end_vec.as_slice()) + .finalize(); + + let key_len = begin.len(); + let mut begin_bound = Bound::Included(TikvKey::from(begin)); + let end_bound = Bound::Included(TikvKey::from(end)); + + let mut found_ids = RoaringBitmap::new(); + 'outer: loop { + let range = BoundRange::new(begin_bound, end_bound.clone()); + let mut keys_iter = trx.scan_keys(range, 
MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)? + .peekable(); + let mut count = 0; + + while let Some(key) = keys_iter.next() { + count += 1; + if key.len() == key_len { + let found_id = self.remove_prefix((&key).into()) + .deserialize_be_u32(key_len - U32_LEN)?; + found_ids.insert(found_id); + } else { + if count < MAX_SCAN_KEYS_SIZE { + + break 'outer; + } else { + begin_bound = Bound::Excluded(key); + continue 'outer; + } + } + let key_slice = self.remove_prefix((&key).into()); + found_ids.insert(key_slice.deserialize_be_u32(key_len - U32_LEN)?); + if keys_iter.peek().is_none() { + if count < MAX_SCAN_KEYS_SIZE { + + break 'outer; + } else { + begin_bound = Bound::Excluded(key); + continue 'outer; + } + } + } + + // Empty break; } - } - document_id = found_ids.random_available_id(); - result.push_document_id(document_id); - } - let key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); + document_id = found_ids.random_available_id(); + result.push_document_id(document_id); + } - if *set { - if assign_id { - let keys_iter = trx.scan_keys((key.clone(), class.serialize( + let key_vec = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + let key = self.new_key_serializer(key_vec.len(), false) + .write(key_vec.as_slice()) + .finalize(); + + if *set { + let mut begin = Bound::Included(TikvKey::from(key)); + let end_vec = class.serialize( account_id, collection, document_id + 1, WITH_SUBSPACE, (&result).into(), - )), u32::MAX).await.map_err(into_error)?; - trx.lock_keys(keys_iter).await.map_err(into_error)?; + ); + + loop { + let end_key = TikvKey::from(self.new_key_serializer(end_vec.len(), false) + .write(end_vec.as_slice()) + .finalize()); + let end = Bound::Included(end_key); + + let range = BoundRange::new(begin, end); + let keys: Vec = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)? 
+ .collect(); + + if keys.len() < MAX_SCAN_KEYS_SIZE as usize { + trx.lock_keys(keys).await.map_err(into_error)?; + break; + } else { + // Guaranteed to have the last value + begin = Bound::Excluded(keys.last().unwrap().clone()); + trx.lock_keys(keys).await.map_err(into_error)?; + continue; + } + } + } else { + trx.delete(key).await.map_err(into_error)?; } - let mutation = Mutation { - op: Op::Put.into(), - key, - value: vec![], - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - //trx.put(key, &[]).await.map_err(into_error)?; - } else { - let mutation = Mutation { - op: Op::Del.into(), - key, - value: Default::default(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - //trx.delete(key).await.map_err(into_error)?; } - } - Operation::Log { set } => { - let key = LogKey { - account_id, - collection, - change_id, + Operation::Log { set } => { + let key = LogKey { + account_id, + collection, + change_id, + }.serialize(WITH_SUBSPACE); + let key_vec = self.new_key_serializer(key.len(), false) + .write(key.as_slice()) + .finalize(); + + trx.put(key_vec, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; } - .serialize(WITH_SUBSPACE); - let mutation = Mutation { - op: Op::Put.into(), - key, - value: set.resolve(&result)?.into_owned(), - assertion: Assertion::None.into(), - }; - batch_mutate.push(mutation); - //trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; - } - Operation::AssertValue { - class, - assert_value, - } => { - let key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - - let matches = match read_chunked_value_transaction(&key, &mut trx).await { - Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_ref()), - Ok(ChunkedValue::Chunked { bytes, .. }) => { - assert_value.matches(bytes.as_ref()) - } - Ok(ChunkedValue::None) => assert_value.is_none(), - Err(_) => false, - }; + Operation::AssertValue { + class, + assert_value, + } => { + let key_vec = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + let key = self.new_key_serializer(key_vec.len(), false) + .write(key_vec.as_slice()) + .finalize(); + + let matches = match self.read_chunked_value(&key, &mut trx).await { + Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_slice()), + Ok(ChunkedValue::Chunked { bytes, .. }) => { + assert_value.matches(bytes.as_ref()) + } + Ok(ChunkedValue::None) => { + assert_value.is_none() + } + Err(_) => false, + }; - if !matches { - trx.rollback().await.map_err(into_error)?; - return Err(trc::StoreEvent::AssertValueFailed.into()); + if !matches { + trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::AssertValueFailed.into()); + } } } } - } - - batch_mutate.reverse(); - trx.batch_mutate(batch_mutate).await.map_err(into_error)?; - // Already handles retry logic through retry and backoff - if let Err(e) = trx.commit().await { - // the committer should have done the repeats and still failed - trx.rollback().await.map_err(into_error)?; - for fut in atomic_subtracts_rollback { - fut.await?; + if self.commit(trx, Some(&mut backoff)).await? 
{ + return Ok(result) + } else { + continue; } - return Err(into_error(e)); } - // Success, we don't care about the timestamp for now, but it's in to do - return Ok(result); } - pub(crate) async fn commit(&self, mut trx: Transaction, will_retry: bool) -> trc::Result { - match trx.commit().await { - Ok(result) => { - let Some(commit_timestamp) = result else { - // There was nothing to commit? - // .ok_or_else(|| trc::StoreEvent::TikvError - // .reason("couldn't get commit timestamp".to_string())) - return Ok(true); - }; - let mut version = self.version.lock(); - // I hate this - if commit_timestamp.version() > version.version.version() { - *version = ReadVersion::new(commit_timestamp); - } - Ok(true) - } - Err(err) => { - trx.rollback().await.map_err(into_error)?; - if will_retry { - Ok(false) - } else { - Err(into_error(err)) - } - } - } - } pub(crate) async fn purge_store(&self) -> trc::Result<()> { - // Obtain all zero counters - for subspace in [SUBSPACE_COUNTER, SUBSPACE_QUOTA] { - let from_key = vec![subspace, 0u8]; - let to_key = vec![subspace, u8::MAX, u8::MAX, u8::MAX, u8::MAX, u8::MAX]; - - const CHUNK_LIMIT: u32 = 1024; - - loop { - let mut key_count = 0; + //let mut delete_keys = Vec::new(); - let mut trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) + for subspace in [SUBSPACE_COUNTER, SUBSPACE_QUOTA] { + let from_key = [subspace, 0u8]; + let to_key = [subspace, u8::MAX, u8::MAX, u8::MAX, u8::MAX, u8::MAX]; + + // Since we are deleting all of them anyways. No point moving the start bound + let begin = Bound::Included(TikvKey::from(self.new_key_serializer(from_key.len(), false) + .write(from_key.as_slice()) + .finalize())); + let end = Bound::Included(TikvKey::from(self.new_key_serializer(to_key.len(), false) + .write(to_key.as_slice()) + .finalize())); + let range = BoundRange::new(begin, end); + + let mut backoff = self.raw_backoff.clone(); + + // Might possibly cause infinite loop + // TODO: Check + 'outer: loop { + let mut trx = self.write_trx_no_backoff().await?; + let mut keys_iter = trx.scan_keys(range.clone(), MAX_SCAN_KEYS_SIZE) .await - .map_err(into_error)?; + .map_err(into_error)? + .peekable(); + + let mut count = 0; + let mut last_key = TikvKey::default(); + while let Some(key) = keys_iter.next() { + count += 1; + if let Some(value) = trx.get_for_update(key.clone()).await.map_err(into_error)? { + if deserialize_i64_le((&key).into(), value.as_slice())? == 0 { + trx.delete(key.clone()).await.map_err(into_error)?; + } + } + if keys_iter.peek().is_none() { + last_key = key; + } + } - let mut keys = trx.scan_keys((from_key.clone(), to_key.clone()), CHUNK_LIMIT).await.map_err(into_error)?; - for key in keys { - key_count += 1; - trx.delete(key).await.map_err(into_error)?; + if self.commit(trx, Some(&mut backoff)).await? 
{ + } else { + continue; } - self.commit(trx, false).await?; - if key_count != CHUNK_LIMIT { - break; + if count < MAX_SCAN_KEYS_SIZE { + break 'outer; } + + break; } } @@ -406,163 +403,145 @@ impl TikvStore { } pub(crate) async fn delete_range(&self, from: impl Key, to: impl Key) -> trc::Result<()> { - let from = from.serialize(WITH_SUBSPACE); - let to = to.serialize(WITH_SUBSPACE); - - let mut trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) - .await - .map_err(into_error)?; - - // Have to manually get the range first; - // TODO: Chunked key scans and locks - let mut keys = trx.scan_keys((from, to), MAX_KEYS).await.map_err(into_error)?; - let key_vec: Vec = keys.collect(); - // TODO: Expensive clone :( - trx.lock_keys(key_vec.clone()).await.map_err(into_error)?; - for key in key_vec { - trx.delete(key).await.map_err(into_error)?; - } - - self.commit(trx, false).await.map(|_| ()) - } - - pub(crate) async fn atomic_compare_and_clear(&self, key: Vec, by: &[u8]) -> trc::Result { - // Raw clients do not have retry logic - // TODO: Unpyramid of Doom - let mut backoff = self.raw_backoff.clone(); - loop { - let value = match self.raw_client.get(key.clone()).await { - Ok(value_opt) => { - if let Some(value) = value_opt { - value - } else { - // Nothing to compare as there is nothing to clear. - return Ok(false) - } - } - Err(e) => { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; + let from_vec = from.serialize(WITH_SUBSPACE); + let to_vec = to.serialize(WITH_SUBSPACE); + let mut trx = self.write_trx_with_backoff().await?; + + let mut begin = Bound::Included(TikvKey::from(self.new_key_serializer(from_vec.len(), false) + .write(from_vec.as_slice()) + .finalize())); + + 'outer: loop { + let end = Bound::Included(TikvKey::from(self.new_key_serializer(to_vec.len(), false) + .write(from_vec.as_slice()) + .finalize())); + + let range = BoundRange::new(begin, end); + let mut keys_iter = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)? + .peekable(); + + let mut count = 0; + while let Some(key) = keys_iter.next() { + count += 1; + if keys_iter.peek().is_none() { + if count < MAX_SCAN_KEYS_SIZE { + trx.delete(key).await.map_err(into_error)?; + break 'outer; } else { - return Err(into_error(e)); - } - } - }; - - return if by == value.as_slice() { - match self.raw_client.delete(key.clone()).await { - Ok(_) => Ok(true), - Err(e) => { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; - } else { - return Err(into_error(e)); - } + begin = Bound::Excluded(key.clone()); + trx.delete(key).await.map_err(into_error)?; + continue 'outer; } + } else { + trx.delete(key).await.map_err(into_error)?; } - } else { - Ok(false) } + + break; } - } - pub(crate) async fn atomic_add(&self, key: Vec, by: i64) -> trc::Result> { - // Raw clients do not have retry logic - // TODO: Unpyramid of Doom - let mut backoff = self.raw_backoff.clone(); - loop { - let maybe_set_value = match self.raw_client.get(key.clone()).await { - Ok(value_opt) => value_opt, - Err(e) => { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; - } else { - return Err(into_error(e)); - } - } - }; - - let sum = match &maybe_set_value { - None => Wrapping(by), - Some(original) => Wrapping(deserialize_i64_le(key.as_slice(), original.as_slice())?) 
+ Wrapping(by) - }; - let (_previous, swapped) = match self.raw_client - .compare_and_swap(key.to_vec(), maybe_set_value, sum.0.to_le_bytes().to_vec()) - .await { - Ok(result) => result, - Err(e) => { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; - } else { - return Err(into_error(e)); - } - } - }; + trx.commit().await.map_err(into_error)?; + Ok(()) + } - return if swapped { - Ok(Some(sum.0)) + // async fn atomic_subtract(&self, key: impl Into + Clone, by: i64) -> trc::Result<()> { + // let mut backoff = self.raw_backoff.clone(); + // + // loop { + // let key = key.clone().into(); + // let mut trx = self.write_trx_no_backoff().await?; + // if let Some(previous) = trx.get_for_update(key.clone()).await.map_err(into_error)? { + // let subtrahend = deserialize_i64_le((&key).into(), &previous)?; + // let difference = subtrahend - by; + // + // if difference == 0 { + // trx.delete(key).await.map_err(into_error)?; + // } else { + // trx.put(key, difference.to_le_bytes().as_slice()).await.map_err(into_error)?; + // } + // } else { + // trx.put(key, by.to_le_bytes().as_slice()).await.map_err(into_error)?; + // } + // + // if self.commit(trx, Some(&mut backoff)).await? { + // return Ok(()); + // } else { + // continue; + // } + // } + // } + // + // async fn atomic_add(&self, key: impl Into + Clone, by: i64) -> trc::Result<()> { + // let mut backoff = self.raw_backoff.clone(); + // + // loop { + // let key = key.clone().into(); + // let mut trx = self.write_trx_no_backoff().await?; + // if let Some(previous) = trx.get_for_update(key.clone()).await.map_err(into_error)? { + // let addend = deserialize_i64_le((&key).into(), &previous)?; + // let sum = addend + by; + // + // trx.put(key, sum.to_le_bytes().as_slice()).await.map_err(into_error)?; + // } else { + // trx.put(key, by.to_le_bytes().as_slice()).await.map_err(into_error)?; + // } + // + // if self.commit(trx, Some(&mut backoff)).await? 
{ + // return Ok(()); + // } else { + // continue; + // } + // } + // } + + async fn commit(&self, mut trx: Transaction, ext_backoff: Option<&mut Backoff>) -> trc::Result { + if let Err(e) = trx.commit().await { + if let Some(backoff) = ext_backoff { + let Some(backoff_duration) = backoff.next_delay_duration() else { + return Err(into_error(e)); + }; + tokio::time::sleep(backoff_duration).await; + Ok(false) } else { - // TODO: Possible logic error but my eyes hurt already - Ok(None) + Err(into_error(e)) } + } else { + Ok(true) } } - pub(crate) async fn atomic_subtract(&self, key: Vec, minuend: i64) -> trc::Result> { - // Raw clients do not have retry logic - // TODO: Unpyramid of Doom - let mut backoff = self.raw_backoff.clone(); - loop { - let value = match self.raw_client.get(key.clone()).await { - Ok(value_opt) => value_opt.ok_or_else(|| { - trc::StoreEvent::TikvError - .reason("cannot do an atomic subtract on unset key-value") - })?, - Err(e) => { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; - } else { - return Err(into_error(e)); - } - } - }; - - let subtrahend = Wrapping(deserialize_i64_le(key.as_slice(), value.as_slice())?); - - let difference = subtrahend - Wrapping(minuend); + async fn write_trx_no_backoff(&self) -> trc::Result { + // TODO: Put inside struct + let write_trx_options = TransactionOptions::new_pessimistic() + .drop_check(CheckLevel::Warn) + .use_async_commit() + .retry_options(RetryOptions::none()); - let (_previous, swapped) = match self.raw_client - .compare_and_swap(key.to_vec(), Some(subtrahend.0.to_le_bytes().to_vec()), difference.0.to_le_bytes().to_vec()) - .await { - Ok(result) => result, - Err(e) => { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; - } else { - return Err(into_error(e)); - } - } - }; + self.trx_client + .begin_with_options(write_trx_options) + .await + .map_err(into_error) + } - return if swapped { - Ok(Some(difference.0)) - } else { - if let Some(wait) = backoff.next_delay_duration() { - tokio::time::sleep(wait).await; - continue; - } else { - return Err(trc::StoreEvent::TikvError - .reason("failed to subtract")); - //.ctx(key.clone().into(), minuend)); - } - } - } + async fn write_trx_with_backoff(&self) -> trc::Result { + self.trx_client + .begin_with_options(self.write_trx_options.clone()) + .await + .map_err(into_error) } } + +async fn get_and_add(trx: &mut Transaction, key: impl Into, by: i64) -> trc::Result { + let key = key.into(); + if let Some(previous) = trx.get_for_update(key.clone()).await.map_err(into_error)? 
{ + let addend = deserialize_i64_le((&key).into(), &previous)?; + let sum = addend + by; + trx.put(key, sum.to_le_bytes().as_slice()).await.map_err(into_error)?; + Ok(sum) + } else { + trx.put(key, by.to_le_bytes().as_slice()).await.map_err(into_error)?; + Ok(by) + } +} \ No newline at end of file From cfa376a201a5dcddd7eca66a11334cef7039e509 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Mon, 12 Aug 2024 12:06:50 +1000 Subject: [PATCH 07/13] Add TiKV: Finish rewrite --- crates/store/src/backend/tikv/blob.rs | 165 +++++++++++++- crates/store/src/backend/tikv/main.rs | 3 +- crates/store/src/backend/tikv/mod.rs | 3 - crates/store/src/backend/tikv/read.rs | 304 ++++++++++++++++++------- crates/store/src/backend/tikv/write.rs | 31 ++- 5 files changed, 399 insertions(+), 107 deletions(-) diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 875c08815..8f8f58406 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -4,11 +4,16 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ -use std::ops::Range; +use std::ops::{Bound, Range}; +use tikv_client::{BoundRange, Key as TikvKey}; +use trc::EventType::Store; +use trc::StoreEvent; use utils::BLOB_HASH_LEN; use crate::SUBSPACE_BLOBS; use crate::write::key::KeySerializer; -use super::{into_error, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; +use super::{into_error, MAX_KEY_SIZE, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; + +// TODO: Allow handling of more than MAX_SCAN_KEYS_SIZE impl TikvStore { pub(crate) async fn get_blob( @@ -16,14 +21,164 @@ impl TikvStore { key: &[u8], range: Range, ) -> trc::Result>> { - todo!() + let block_start = range.start / MAX_VALUE_SIZE as usize; + let bytes_start = range.start % MAX_VALUE_SIZE as usize; + let block_end = (range.end / MAX_VALUE_SIZE as usize) + 1; + + // Check if within keys limit + if (block_end - block_start) > MAX_SCAN_KEYS_SIZE as usize { + return Err(trc::Error::new(Store(StoreEvent::BlobRead)) + .reason(format!("blob is larger than maximum amount of keys possible for TiKV scan: {}", MAX_SCAN_KEYS_SIZE))) + } + + + let begin = self + .new_key_serializer(key.len() + 3, false) + .write(SUBSPACE_BLOBS) + .write(key) + .write(block_start as u16) + .finalize(); + let key_len = begin.len(); + let begin_range = Bound::Included(TikvKey::from(begin)); + let end = self + .new_key_serializer(key.len() + 3, false) + .write(SUBSPACE_BLOBS) + .write(key) + .write(block_end as u16) + .finalize(); + let end_range = Bound::Included(TikvKey::from(end)); + let bound_range = BoundRange::new(begin_range, end_range); + + let mut trx = self.snapshot_trx().await?; + + let mut keys = trx + .scan_keys(bound_range, MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)?; + + let mut blob_data: Option> = None; + let blob_range = range.end - range.start; + + 'outer: while let Some(key) = keys.next() { + if key.len() == key_len { + let value = trx.get(key).await.map_err(into_error)?.unwrap(); + if let Some(blob_data) = &mut blob_data { + blob_data.extend_from_slice( + value + .get( + ..std::cmp::min( + blob_range.saturating_sub(blob_data.len()), + value.len(), + ), + ) + .unwrap_or(&[]), + ); + if blob_data.len() == blob_range { + break 'outer; + } + } else { + let blob_size = if blob_range <= (5 * (1 << 20)) { + blob_range + } else if value.len() == MAX_VALUE_SIZE as usize { + MAX_VALUE_SIZE as usize * 2 + } else { + value.len() + }; + let mut blob_data_ = 
Vec::with_capacity(blob_size);
+                    blob_data_.extend_from_slice(
+                        value
+                            .get(bytes_start..std::cmp::min(bytes_start + blob_range, value.len()))
+                            .unwrap_or(&[]),
+                    );
+                    if blob_data_.len() == blob_range {
+                        return Ok(Some(blob_data_));
+                    }
+                    blob_data = blob_data_.into();
+                }
+            }
+        }
+
+        Ok(blob_data)
     }
 
     pub(crate) async fn put_blob(&self, key: &[u8], data: &[u8]) -> trc::Result<()> {
-        todo!()
+        const N_CHUNKS: usize = (1 << 5) - 1;
+        let last_chunk = std::cmp::max(
+            (data.len() / MAX_VALUE_SIZE as usize)
+                + if data.len() % MAX_VALUE_SIZE as usize > 0 {
+                    1
+                } else {
+                    0
+                },
+            1,
+        ) - 1;
+
+        // Check if within keys limit
+        if last_chunk > MAX_SCAN_KEYS_SIZE as usize {
+            return Err(trc::Error::new(Store(StoreEvent::BlobWrite))
+                .reason(format!("blob is larger than maximum amount of keys possible for TiKV scan: {}", MAX_SCAN_KEYS_SIZE)))
+        }
+
+        let mut trx = self.trx_client
+            .begin_with_options(self.write_trx_options.clone())
+            .await
+            .map_err(into_error)?;
+
+        for (chunk_pos, chunk_bytes) in data.chunks(MAX_VALUE_SIZE as usize).enumerate() {
+            trx.put(
+                // Use the prefixed serializer so reads via get_blob/delete_blob see the same keys
+                self.new_key_serializer(key.len() + 3, false)
+                    .write(SUBSPACE_BLOBS)
+                    .write(key)
+                    .write(chunk_pos as u16)
+                    .finalize(),
+                chunk_bytes
+            ).await.map_err(into_error)?;
+            if chunk_pos == last_chunk || (chunk_pos > 0 && chunk_pos % N_CHUNKS == 0) {
+                self.commit(trx, None).await?;
+                if chunk_pos < last_chunk {
+                    trx = self.trx_client
+                        .begin_with_options(self.write_trx_options.clone())
+                        .await
+                        .map_err(into_error)?;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        Ok(())
     }
 
     pub(crate) async fn delete_blob(&self, key: &[u8]) -> trc::Result {
-        todo!()
+        if key.len() < BLOB_HASH_LEN {
+            return Ok(false);
+        }
+
+        let mut trx = self.trx_client
+            .begin_with_options(self.write_trx_options.clone())
+            .await
+            .map_err(into_error)?;
+
+        let mut keys = trx.scan_keys(
+            (
+                self.new_key_serializer(key.len() + 3, false)
+                    .write(SUBSPACE_BLOBS)
+                    .write(key)
+                    .write(0u16)
+                    .finalize(),
+                self.new_key_serializer(key.len() + 3, false)
+                    .write(SUBSPACE_BLOBS)
+                    .write(key)
+                    .write(u16::MAX)
+                    .finalize()
+            ),
+            MAX_SCAN_KEYS_SIZE
+        ).await.map_err(into_error)?;
+
+        while let Some(key) = keys.next() {
+            trx.delete(key).await.map_err(into_error)?;
+        }
+
+        self.commit(trx, None).await
     }
 }
\ No newline at end of file
diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs
index 13ea97ae6..9ab878834 100644
--- a/crates/store/src/backend/tikv/main.rs
+++ b/crates/store/src/backend/tikv/main.rs
@@ -104,5 +104,4 @@ impl TikvStore {
 
         Some(store)
     }
-}
-
+}
\ No newline at end of file
diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs
index bfa115a03..94c93b1c1 100644
--- a/crates/store/src/backend/tikv/mod.rs
+++ b/crates/store/src/backend/tikv/mod.rs
@@ -6,9 +6,6 @@
 
 use std::time::{Duration, Instant};
 use tikv_client::{TransactionClient, Transaction, Error as TikvError, Snapshot, Value, Key, Timestamp, RawClient, TransactionOptions, Backoff, KvPair, BoundRange};
-use tikv_client::proto::kvrpcpb;
-use tikv_client::proto::kvrpcpb::Mutation;
-use crate::write::{AssignedIds, ValueOp};
 use crate::write::key::KeySerializer;
 
 pub mod blob;
diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs
index b6bdb242a..46e390775 100644
--- a/crates/store/src/backend/tikv/read.rs
+++ b/crates/store/src/backend/tikv/read.rs
@@ -15,7 +15,7 @@ use crate::{
     },
     BitmapKey, Deserialize, IterateParams, Key, ValueKey, U32_LEN, WITH_SUBSPACE,
 };
-
+use
crate::backend::tikv::read::read_helpers::read_chunked_value; use super::{into_error, MAX_KEY_SIZE, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; #[allow(dead_code)] @@ -30,14 +30,68 @@ impl TikvStore { where U: Deserialize, { - todo!() + let key_base = key.serialize(WITH_SUBSPACE); + let mut trx = self.read_trx().await?; + + match read_chunked_value(&self, &key_base, &mut trx).await? { + ChunkedValue::Single(bytes) => U::deserialize(&bytes).map(Some), + ChunkedValue::Chunked { bytes, .. } => U::deserialize(&bytes).map(Some), + ChunkedValue::None => Ok(None), + } } pub(crate) async fn get_bitmap( &self, mut key: BitmapKey>, ) -> trc::Result> { - todo!() + let mut bm = RoaringBitmap::new(); + let begin_base = key.serialize(WITH_SUBSPACE); + key.document_id = u32::MAX; + let end_base = key.serialize(WITH_SUBSPACE); + let key_len = begin_base.len(); + + let begin = self + .new_key_serializer(begin_base.len(), false) + .write(begin_base.as_slice()) + .finalize(); + + let mut trx = self.snapshot_trx().await?; + + let mut begin_range = Bound::Included(TikvKey::from(begin)); + loop { + let end = self + .new_key_serializer(end_base.len(), false) + .write(end_base.as_slice()) + .finalize(); + let end_range = Bound::Included(TikvKey::from(end)); + let range = BoundRange::new(begin_range, end_range); + + let keys = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)?; + + let mut count = 0; + + let mut last_key = TikvKey::default(); + for key in keys { + count += 1; + let key_slice = key.as_ref().into(); + let key_base = self.remove_prefix(key_slice); + if key_base.len() == key_len { + bm.insert(key_base.deserialize_be_u32(key_base.len() - U32_LEN)?); + } + last_key = key; + } + + if count < MAX_SCAN_KEYS_SIZE { + break; + } else { + begin_range = Bound::Excluded(TikvKey::from(last_key)); + continue; + } + } + + Ok(if !bm.is_empty() { Some(bm) } else { None }) } pub(crate) async fn iterate( @@ -45,7 +99,88 @@ impl TikvStore { params: IterateParams, mut cb: impl for<'x> FnMut(&'x [u8], &'x [u8]) -> trc::Result + Sync + Send, ) -> trc::Result<()> { - todo!() + let begin_base = params.begin.serialize(WITH_SUBSPACE); + let begin = self.new_key_serializer(begin_base.len(), false) + .write(begin_base.as_slice()) + .finalize(); + let end_base = params.end.serialize(WITH_SUBSPACE); + let end = self.new_key_serializer(end_base.len(), false) + .write(end_base.as_slice()) + .finalize(); + + let mut trx = self.snapshot_trx().await?; + + if !params.first { + // TODO: Get rid of repeating code + if params.ascending { + let mut begin_range = Bound::Included(TikvKey::from(begin)); + loop { + let end_range = Bound::Included(TikvKey::from(end.clone())); + let range = BoundRange::new(begin_range, end_range); + let kv_pairs = trx.scan(range, MAX_SCAN_VALUES_SIZE) + .await + .map_err(into_error)?; + + let mut count = 0; + let mut last_key = TikvKey::default(); + for kv_pair in kv_pairs { + count += 1; + let (key, value) = kv_pair.into(); + let key_base = self.remove_prefix(key.as_ref().into()); + if !cb(key_base.get(1..).unwrap_or_default(), &value)? 
{ + return Ok(()); + } + last_key = key; + } + if count < MAX_SCAN_VALUES_SIZE { + break; + } else { + begin_range = Bound::Excluded(TikvKey::from(last_key)); + continue; + } + } + } else { + let mut end_range = Bound::Included(TikvKey::from(end)); + loop { + let begin_range = Bound::Included(TikvKey::from(begin.clone())); + let range = BoundRange::new(begin_range, end_range); + let kv_pairs = trx.scan(range, MAX_SCAN_VALUES_SIZE) + .await + .map_err(into_error)?; + + let mut count = 0; + let mut last_key = TikvKey::default(); + for kv_pair in kv_pairs { + count += 1; + let (key, value) = kv_pair.into(); + let key_base = self.remove_prefix(key.as_ref().into()); + if !cb(key_base.get(1..).unwrap_or_default(), &value)? { + return Ok(()); + } + last_key = key; + } + if count < MAX_SCAN_VALUES_SIZE { + break; + } else { + end_range = Bound::Excluded(TikvKey::from(last_key)); + continue; + } + } + } + } else { + let mut possible_kv_pair = trx + .scan((begin, end), 1) + .await + .map_err(into_error)?; + + if let Some(kv_pair) = possible_kv_pair.next() { + let (key, value) = kv_pair.into(); + let key_base = self.remove_prefix(key.as_ref().into()); + cb(key_base.get(1..).unwrap_or_default(), &value)?; + } + } + + Ok(()) } pub(crate) async fn get_counter( @@ -80,8 +215,15 @@ impl TikvStore { Ok(Snapshot::new(read_trx)) } - pub(super) async fn read_chunked_value( - &self, +} + +pub(crate) mod read_helpers { + use tikv_client::{BoundRange, KvPair, Snapshot, Transaction, Value}; + use super::*; + + + pub(crate) async fn read_chunked_value( + store: &TikvStore, key: &[u8], trx: &mut ReadTrx ) -> trc::Result { @@ -93,7 +235,7 @@ impl TikvStore { value.append(&mut bytes); let mut n_chunks = 1; - let mut first = Bound::Included(TikvKey::from(self.new_key_serializer(key.len() + 1, false) + let mut first = Bound::Included(TikvKey::from(store.new_key_serializer(key.len() + 1, false) .write(key) .write(0u8) .finalize())); @@ -102,7 +244,7 @@ impl TikvStore { // Maybe use the last byte of the last key? 
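                // Pagination sketch of this loop: each pass scans at most
                // MAX_SCAN_VALUES_SIZE chunk pairs; a short page signals that
                // the final chunk was read, otherwise the lower bound is
                // advanced past the last key seen and the scan resumes.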
let mut count = 0; - let last = Bound::Included(TikvKey::from(self.new_key_serializer(key.len() + 1, false) + let last = Bound::Included(TikvKey::from(store.new_key_serializer(key.len() + 1, false) .write(key) .write(u8::MAX) .finalize())); @@ -141,93 +283,93 @@ impl TikvStore { } } -} + trait ReadTransaction { + async fn get(&mut self, key: impl Into) -> trc::Result>; + async fn key_exists(&mut self, key: impl Into) -> trc::Result; + async fn batch_get( + &mut self, + keys: impl IntoIterator> + ) -> trc::Result>; + async fn scan( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + async fn scan_keys( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + async fn scan_reverse( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + async fn scan_keys_reverse( + &mut self, + range: impl Into, + limit: u32 + ) -> trc::Result>; + } -pub(crate) trait ReadTransaction { - async fn get(&mut self, key: impl Into) -> trc::Result>; - async fn key_exists(&mut self, key: impl Into) -> trc::Result; - async fn batch_get( - &mut self, - keys: impl IntoIterator> - ) -> trc::Result>; - async fn scan( - &mut self, - range: impl Into, - limit: u32 - ) -> trc::Result>; - async fn scan_keys( - &mut self, - range: impl Into, - limit: u32 - ) -> trc::Result>; - async fn scan_reverse( - &mut self, - range: impl Into, - limit: u32 - ) -> trc::Result>; - async fn scan_keys_reverse( - &mut self, - range: impl Into, - limit: u32 - ) -> trc::Result>; -} + impl ReadTransaction for Transaction { + async fn get(&mut self, key: impl Into) -> trc::Result> { + self.get(key).await.map_err(into_error) + } -impl ReadTransaction for Transaction { - async fn get(&mut self, key: impl Into) -> trc::Result> { - self.get(key).await.map_err(into_error) - } + async fn key_exists(&mut self, key: impl Into) -> trc::Result { + self.key_exists(key).await.map_err(into_error) + } - async fn key_exists(&mut self, key: impl Into) -> trc::Result { - self.key_exists(key).await.map_err(into_error) - } + async fn batch_get(&mut self, keys: impl IntoIterator>) -> trc::Result> { + self.batch_get(keys).await.map_err(into_error) + } - async fn batch_get(&mut self, keys: impl IntoIterator>) -> trc::Result> { - self.batch_get(keys).await.map_err(into_error) - } + async fn scan(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan(range, limit).await.map_err(into_error) + } - async fn scan(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan(range, limit).await.map_err(into_error) - } + async fn scan_keys(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys(range, limit).await.map_err(into_error) + } - async fn scan_keys(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan_keys(range, limit).await.map_err(into_error) - } + async fn scan_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_reverse(range, limit).await.map_err(into_error) + } - async fn scan_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan_reverse(range, limit).await.map_err(into_error) + async fn scan_keys_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys_reverse(range, limit).await.map_err(into_error) + } } - async fn scan_keys_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan_keys_reverse(range, limit).await.map_err(into_error) - } -} + impl ReadTransaction for Snapshot { + async fn get(&mut self, key: impl Into) -> trc::Result> { + 
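+        // Delegation sketch: this impl (like the Transaction one above) only
+        // forwards to the inherent tikv_client call and maps its error into
+        // trc::Error via into_error, so the chunked-value helpers can accept
+        // either a live Transaction or a read-only Snapshot behind one trait.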
self.get(key).await.map_err(into_error) + } -impl ReadTransaction for Snapshot { - async fn get(&mut self, key: impl Into) -> trc::Result> { - self.get(key).await.map_err(into_error) - } + async fn key_exists(&mut self, key: impl Into) -> trc::Result { + self.key_exists(key).await.map_err(into_error) + } - async fn key_exists(&mut self, key: impl Into) -> trc::Result { - self.key_exists(key).await.map_err(into_error) - } + async fn batch_get(&mut self, keys: impl IntoIterator>) -> trc::Result> { + self.batch_get(keys).await.map_err(into_error) + } - async fn batch_get(&mut self, keys: impl IntoIterator>) -> trc::Result> { - self.batch_get(keys).await.map_err(into_error) - } + async fn scan(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan(range, limit).await.map_err(into_error) + } - async fn scan(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan(range, limit).await.map_err(into_error) - } + async fn scan_keys(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys(range, limit).await.map_err(into_error) + } - async fn scan_keys(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan_keys(range, limit).await.map_err(into_error) - } + async fn scan_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_reverse(range, limit).await.map_err(into_error) + } - async fn scan_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan_reverse(range, limit).await.map_err(into_error) + async fn scan_keys_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { + self.scan_keys_reverse(range, limit).await.map_err(into_error) + } } +} - async fn scan_keys_reverse(&mut self, range: impl Into, limit: u32) -> trc::Result> { - self.scan_keys_reverse(range, limit).await.map_err(into_error) - } -} \ No newline at end of file diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index 35d71fa87..f5c72acdd 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -20,6 +20,7 @@ use crate::{ }, BitmapKey, IndexKey, Key, LogKey, SUBSPACE_COUNTER, SUBSPACE_QUOTA, U32_LEN, WITH_SUBSPACE, }; +use crate::backend::tikv::read::read_helpers::read_chunked_value; use crate::write::key; use super::{into_error, read::{ChunkedValue}, TikvStore, ReadVersion, MAX_VALUE_SIZE, MAX_SCAN_KEYS_SIZE}; @@ -244,20 +245,20 @@ impl TikvStore { result.push_document_id(document_id); } - let key_vec = class.serialize( + let key_base = class.serialize( account_id, collection, document_id, WITH_SUBSPACE, (&result).into(), ); - let key = self.new_key_serializer(key_vec.len(), false) - .write(key_vec.as_slice()) + let key = self.new_key_serializer(key_base.len(), false) + .write(key_base.as_slice()) .finalize(); if *set { let mut begin = Bound::Included(TikvKey::from(key)); - let end_vec = class.serialize( + let end_base = class.serialize( account_id, collection, document_id + 1, @@ -266,8 +267,8 @@ impl TikvStore { ); loop { - let end_key = TikvKey::from(self.new_key_serializer(end_vec.len(), false) - .write(end_vec.as_slice()) + let end_key = TikvKey::from(self.new_key_serializer(end_base.len(), false) + .write(end_base.as_slice()) .finalize()); let end = Bound::Included(end_key); @@ -292,33 +293,31 @@ impl TikvStore { } } Operation::Log { set } => { - let key = LogKey { + let key_base = LogKey { account_id, collection, change_id, }.serialize(WITH_SUBSPACE); - let key_vec = self.new_key_serializer(key.len(), false) - 
.write(key.as_slice()) + let key = self.new_key_serializer(key_base.len(), false) + .write(key_base.as_slice()) .finalize(); - trx.put(key_vec, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; + trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; } Operation::AssertValue { class, assert_value, } => { - let key_vec = class.serialize( + // Don't prepend with API v2 compatibility prefix + let key_base = class.serialize( account_id, collection, document_id, WITH_SUBSPACE, (&result).into(), ); - let key = self.new_key_serializer(key_vec.len(), false) - .write(key_vec.as_slice()) - .finalize(); - let matches = match self.read_chunked_value(&key, &mut trx).await { + let matches = match read_chunked_value(&self, &key_base, &mut trx).await { Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_slice()), Ok(ChunkedValue::Chunked { bytes, .. }) => { assert_value.matches(bytes.as_ref()) @@ -496,7 +495,7 @@ impl TikvStore { // } // } - async fn commit(&self, mut trx: Transaction, ext_backoff: Option<&mut Backoff>) -> trc::Result { + pub(crate) async fn commit(&self, mut trx: Transaction, ext_backoff: Option<&mut Backoff>) -> trc::Result { if let Err(e) = trx.commit().await { if let Some(backoff) = ext_backoff { let Some(backoff_duration) = backoff.next_delay_duration() else { From 49b4b77400738eeea168389dc740a0bfcd0d52ed Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Mon, 26 Aug 2024 12:40:02 +1000 Subject: [PATCH 08/13] Add TiKV: Rewrite --- crates/store/src/backend/tikv/blob.rs | 224 ++++++++---------- crates/store/src/backend/tikv/main.rs | 28 +-- crates/store/src/backend/tikv/mod.rs | 59 +---- crates/store/src/backend/tikv/read.rs | 258 ++++++++++----------- crates/store/src/backend/tikv/write.rs | 305 +++++++++---------------- crates/trc/src/event/description.rs | 2 + crates/trc/src/event/level.rs | 1 + crates/trc/src/serializers/binary.rs | 2 + 8 files changed, 349 insertions(+), 530 deletions(-) diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 81a5bb074..4d0ca44bc 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -5,13 +5,15 @@ */ use std::ops::{Bound, Range}; +use roaring::RoaringBitmap; use tikv_client::{BoundRange, Key as TikvKey}; use trc::EventType::Store; use trc::StoreEvent; use utils::BLOB_HASH_LEN; -use crate::SUBSPACE_BLOBS; -use crate::write::key::KeySerializer; -use super::{into_error, MAX_KEY_SIZE, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; +use crate::{write::key::KeySerializer, SUBSPACE_BLOBS}; +use super::write::chunking::{delete_chunked_value, put_chunked_value}; +use super::read::chunking::get_chunked_value; +use super::{into_error, MAX_KEY_SIZE, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore, MAX_CHUNKED_SIZED}; // TODO: Allow handling of more than MAX_SCAN_KEYS_SIZE @@ -21,127 +23,102 @@ impl TikvStore { key: &[u8], range: Range, ) -> trc::Result>> { - let block_start = range.start / MAX_VALUE_SIZE as usize; - let bytes_start = range.start % MAX_VALUE_SIZE as usize; - let block_end = (range.end / MAX_VALUE_SIZE as usize) + 1; + let mut trx = self.snapshot_read().await?; - let begin = KeySerializer::new(key.len() + 3) + let block_start = range.start / MAX_VALUE_SIZE; + let bytes_start = range.start % MAX_VALUE_SIZE; + let block_end = (range.end / MAX_VALUE_SIZE) + 1; + + let mut begin = KeySerializer::new(1 + key.len() + 2) .write(SUBSPACE_BLOBS) .write(key) .write(block_start as u16) 
.finalize(); - let key_len = begin.len(); - let mut begin_range = Bound::Included(TikvKey::from(begin)); - let end = KeySerializer::new(key.len() + 3) + let end = KeySerializer::new(1 + key.len() + 2) .write(SUBSPACE_BLOBS) .write(key) .write(block_end as u16) + .write(u8::MIN) // Null byte to make the end inclusive .finalize(); - let end_range = Bound::Included(TikvKey::from(end)); - - let mut trx = self.snapshot_trx().await?; - let mut blob_data: Option> = None; - let blob_range = range.end - range.start; + let mut blob_data_opt: Option> = None; + let mut blob_range = range.end - range.start; 'outer: loop { - let bound_range = BoundRange::new(begin_range, end_range.clone()); - let mut keys = trx - .scan_keys(bound_range, MAX_SCAN_KEYS_SIZE) + let mut keys = trx.scan((begin, end.clone()), MAX_SCAN_VALUES_SIZE) .await .map_err(into_error)?; - let mut last_key = TikvKey::default(); - let mut count = 0; - 'inner: while let Some(key) = keys.next() { - count += 1; - if key.len() == key_len { - let value = trx.get(key.clone()).await.map_err(into_error)?.unwrap(); - if let Some(blob_data) = &mut blob_data { - blob_data.extend_from_slice( - value - .get( - ..std::cmp::min( - blob_range.saturating_sub(blob_data.len()), - value.len(), - ), - ) - .unwrap_or(&[]), - ); - if blob_data.len() == blob_range { - break 'outer; - } + + let mut counter = 0; + let mut last_key = None; + while let Some(kv_pair) = keys.next() { + let key: Vec = kv_pair.0.into(); + let mut value: Vec = kv_pair.1.into(); + + if let Some(blob_data) = &mut blob_data_opt { + blob_data.extend_from_slice( + value + .get( + ..std::cmp::min( + blob_range.saturating_sub(blob_data.len()), + value.len(), + ), + ) + .unwrap_or(&[]), + ); + if blob_data.len() == blob_range { + break 'outer; + } + } else { + let blob_size = if blob_range <= (5 * (1 << 20)) { + blob_range + } else if value.len() == MAX_VALUE_SIZE { + MAX_VALUE_SIZE * 2 } else { - let blob_size = if blob_range <= (5 * (1 << 20)) { - blob_range - } else if value.len() == MAX_VALUE_SIZE as usize { - MAX_VALUE_SIZE as usize * 2 - } else { - value.len() - }; - let mut blob_data_ = Vec::with_capacity(blob_size); - blob_data_.extend_from_slice( - value - .get(bytes_start..std::cmp::min(bytes_start + blob_range, value.len())) - .unwrap_or(&[]), - ); - if blob_data_.len() == blob_range { - return Ok(Some(blob_data_)); - } - blob_data = blob_data_.into(); + value.len() + }; + let mut blob_data = Vec::with_capacity(blob_size); + blob_data.extend_from_slice( + value + .get(bytes_start..std::cmp::min(bytes_start + blob_range, value.len())) + .unwrap_or(&[]), + ); + if blob_data.len() == blob_range { + return Ok(Some(blob_data)); } + blob_data_opt = Some(blob_data) } - last_key = key; + + last_key = Some(key); } - if count < MAX_SCAN_KEYS_SIZE { - break; - } else { - begin_range = Bound::Excluded(last_key); + + if counter == MAX_SCAN_VALUES_SIZE { + // Guaranteed to have the last key + begin = last_key.unwrap(); continue; + } else { + break; } + } - Ok(blob_data) + Ok(blob_data_opt) } pub(crate) async fn put_blob(&self, key: &[u8], data: &[u8]) -> trc::Result<()> { - const N_CHUNKS: usize = (1 << 5) - 1; - let last_chunk = std::cmp::max( - (data.len() / MAX_VALUE_SIZE as usize) - + if data.len() % MAX_VALUE_SIZE as usize > 0 { - 1 - } else { - 0 - }, - 1, - ) - 1; - - let mut trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) - .await - .map_err(into_error)?; - - for (chunk_pos, chunk_bytes) in data.chunks(MAX_VALUE_SIZE as usize).enumerate() { - trx.put( - 
KeySerializer::new(key.len() + 3) - .write(SUBSPACE_BLOBS) - .write(key) - .write(chunk_pos as u16) - .finalize(), - chunk_bytes - ).await.map_err(into_error)?; - if chunk_pos == last_chunk || (chunk_pos > 0 && chunk_pos % N_CHUNKS == 0) { - self.commit(trx, None).await?; - if ! chunk_pos < last_chunk { - trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) - .await - .map_err(into_error)?; - } else { - break; - } - } + let mut trx = self.write_trx_with_backoff().await?; + + for (chunk_pos, chunk_value) in data.chunks(MAX_VALUE_SIZE).enumerate() { + let chunk_key = KeySerializer::new(1 + key.len() + 2) + .write(SUBSPACE_BLOBS) + .write(key) + .write(chunk_pos as u16) + .finalize(); + + trx.put(chunk_key, chunk_value).await.map_err(into_error)?; } + trx.commit().await.map_err(into_error)?; Ok(()) } @@ -150,42 +127,41 @@ impl TikvStore { return Ok(false); } - let mut trx = self.trx_client - .begin_with_options(self.write_trx_options.clone()) - .await - .map_err(into_error)?; + let begin = KeySerializer::new(1 + key.len() + 1) + .write(SUBSPACE_BLOBS) + .write(key) + .write(u16::MIN) + .finalize(); + let end = KeySerializer::new(1 + key.len() + 3) + .write(SUBSPACE_BLOBS) + .write(key) + .write(u16::MAX) + .write(u8::MIN) // Null byte to make the end inclusive + .finalize(); + + let range = BoundRange::from((begin, end)); + + let mut trx = self.write_trx_with_backoff().await?; - // Since we are deleting the entire range anyway, - // there is absolutely no point on moving the range bounds. loop { - let mut keys = trx.scan_keys( - ( - KeySerializer::new(key.len() + 3) - .write(SUBSPACE_BLOBS) - .write(key) - .write(0u16) - .finalize(), - KeySerializer::new(key.len() + 3) - .write(SUBSPACE_BLOBS) - .write(key) - .write(u16::MAX) - .finalize() - ), - MAX_SCAN_KEYS_SIZE - ).await.map_err(into_error)?; - - let mut count = 1; - while let Some(key) = keys.next() { + let keys = trx + .scan_keys(range.clone(), MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)?; + + let mut count = 0; + for key in keys { count += 1; trx.delete(key).await.map_err(into_error)?; } - // TODO: Replace with MAX_SCAN_KEYS_SIZE - if count == 0 { + if count < MAX_SCAN_KEYS_SIZE { break; } } - self.commit(trx, None).await + trx.commit().await.map_err(into_error)?; + + Ok(true) } } \ No newline at end of file diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index 9ab878834..47d3b1760 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -4,7 +4,7 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ use std::time::Duration; -use tikv_client::{Backoff, CheckLevel, RawClient, RetryOptions, TransactionClient, TransactionOptions}; +use tikv_client::{Backoff, CheckLevel, RetryOptions, TransactionClient, TransactionOptions}; use utils::config::{utils::AsKey, Config}; use super::TikvStore; @@ -28,17 +28,6 @@ impl TikvStore { }) .ok()?; - let raw_client = RawClient::new(pd_endpoints) - .await - .map_err(|err| { - config.new_build_error( - prefix.as_str(), - format!("Failed to create TiKV database: {err:?}"), - ) - }) - .ok()? 
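// NOTE: The raw client removed in this hunk existed only to provide CAS-based
// atomic counters (`with_atomic_for_cas`). From this rewrite onwards, counter
// updates go through an ordinary pessimistic transaction via `get_and_add`,
// which serializes concurrent adders with a `get_for_update` lock. A minimal
// usage sketch, assuming the `TikvStore` API from this patch (`store`, `key`
// and `by` are placeholders):
//
//     let mut trx = store.write_trx_with_backoff().await?;
//     // Locks `key`, then writes back `previous + by` as a little-endian i64.
//     let new_value = get_and_add(&mut trx, key, by).await?;
//     store.commit(trx, None).await?;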
-        .with_atomic_for_cas();
-
         let backoff_min_delay = config
             .property::<Duration>((&prefix, "transaction.backoff-min-delay"))
             .unwrap_or_else(|| Duration::from_millis(2));
@@ -90,16 +79,19 @@ impl TikvStore {
             .drop_check(CheckLevel::Warn)
             .retry_options(RetryOptions::new(backoff.clone(), backoff.clone()));
 
-        let raw_backoff = backoff;
+        let read_trx_options = TransactionOptions::new_optimistic()
+            .drop_check(CheckLevel::None)
+            .retry_options(RetryOptions::none())
+            .read_only();
+
+        // Used for write transactions
+        let backoff = backoff;
 
         let store = Self {
             trx_client,
             write_trx_options,
-            raw_client,
-            raw_backoff,
-            api_v2: false,
-            keyspace: [0, 0, b's'], // Temporary
-            version: Default::default(),
+            read_trx_options,
+            backoff,
         };
 
         Some(store)
diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs
index 64a4e450d..f1ae84b4c 100644
--- a/crates/store/src/backend/tikv/mod.rs
+++ b/crates/store/src/backend/tikv/mod.rs
@@ -13,7 +13,6 @@ pub mod main;
 pub mod read;
 pub mod write;
 
-
 // https://github.com/tikv/tikv/issues/7272#issuecomment-604841372
 // Default limit is 4194304 bytes
 
@@ -22,9 +21,10 @@ const MAX_KEY_SIZE: u32 = 4 * 1024;
 // Then, 2097152
 const MAX_GRPC_MESSAGE_SIZE: u32 = 2097152;
 const MAX_ASSUMED_KEY_SIZE: u32 = 256;
-const MAX_VALUE_SIZE: u32 = 131072;
+const MAX_VALUE_SIZE: usize = 131072;
+const MAX_CHUNKED_SIZED: usize = MAX_VALUE_SIZE * (1 + 256);
 const MAX_SCAN_KEYS_SIZE: u32 = MAX_GRPC_MESSAGE_SIZE / MAX_ASSUMED_KEY_SIZE; // 8192
-const MAX_SCAN_VALUES_SIZE: u32 = MAX_GRPC_MESSAGE_SIZE / MAX_VALUE_SIZE; // 16
+const MAX_SCAN_VALUES_SIZE: u32 = MAX_GRPC_MESSAGE_SIZE / MAX_VALUE_SIZE as u32; // 16
 
 // Preparation for API v2
 // RFC: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md
@@ -38,11 +38,8 @@ pub const TRANSACTION_TIMEOUT: Duration = Duration::from_secs(4);
 pub struct TikvStore {
     trx_client: TransactionClient,
     write_trx_options: TransactionOptions,
-    raw_client: RawClient,
-    raw_backoff: Backoff,
-    api_v2: bool,
-    keyspace: [u8; 3], // Keyspace is fixed-length of 3 bytes in network byte order.
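// NOTE: Alongside the raw client, this drops the unfinished API v2 groundwork:
// the `api_v2` flag and the fixed 3-byte keyspace ID that TiKV RFC 0069
// prepends to every key. For reference, the prefixing would have looked
// roughly like the sketch below (illustrative only: `prefix_key` and
// `MODE_TXN` are hypothetical names, and the exact mode byte should be
// verified against the RFC before any reintroduction):
//
//     fn prefix_key(keyspace: [u8; 3], key: &[u8]) -> Vec<u8> {
//         const MODE_TXN: u8 = b'x'; // assumed marker for transactional keys
//         let mut out = Vec::with_capacity(1 + keyspace.len() + key.len());
//         out.push(MODE_TXN);
//         out.extend_from_slice(&keyspace); // fixed length, network byte order
//         out.extend_from_slice(key);
//         out
//     }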
- version: parking_lot::Mutex, + read_trx_options: TransactionOptions, + backoff: Backoff, } pub(crate) struct TimedTransaction { @@ -50,52 +47,6 @@ pub(crate) struct TimedTransaction { expires: Instant, } -pub(crate) struct ReadVersion { - version: Timestamp, - expires: Instant, -} - -impl ReadVersion { - pub fn new(version: Timestamp) -> Self { - Self { - version, - expires: Instant::now() + TRANSACTION_EXPIRY, - } - } - - pub fn is_expired(&self) -> bool { - self.expires < Instant::now() - } -} - -impl Default for ReadVersion { - fn default() -> Self { - Self { - version: Timestamp::default(), - expires: Instant::now(), - } - } -} - -impl AsRef for TimedTransaction { - fn as_ref(&self) -> &Transaction { - &self.trx - } -} - -impl TimedTransaction { - pub fn new(trx: Transaction) -> Self { - Self { - trx, - expires: Instant::now() + TRANSACTION_TIMEOUT, - } - } - - pub fn is_expired(&self) -> bool { - self.expires < Instant::now() - } -} - #[inline(always)] fn into_error(error: TikvError) -> trc::Error { trc::StoreEvent::TikvError diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs index d62002fcb..24e5a59cc 100644 --- a/crates/store/src/backend/tikv/read.rs +++ b/crates/store/src/backend/tikv/read.rs @@ -4,8 +4,7 @@ * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ use std::ops::Bound; -use tikv_client::{BoundRange, CheckLevel, Key as TikvKey, KvPair, Snapshot, Transaction, TransactionOptions, Value}; -use futures::TryStreamExt; +use tikv_client::{BoundRange, CheckLevel, Key as TikvKey, KvPair, Snapshot, Transaction, Value}; use roaring::RoaringBitmap; use crate::{ backend::deserialize_i64_le, @@ -15,28 +14,20 @@ use crate::{ }, BitmapKey, Deserialize, IterateParams, Key, ValueKey, U32_LEN, WITH_SUBSPACE, }; -use crate::backend::tikv::read::read_helpers::read_chunked_value; +use crate::backend::tikv::read::chunking::get_chunked_value; use super::{into_error, MAX_KEY_SIZE, MAX_SCAN_KEYS_SIZE, MAX_SCAN_VALUES_SIZE, MAX_VALUE_SIZE, TikvStore}; -#[allow(dead_code)] -pub(crate) enum ChunkedValue { - Single(Value), - Chunked { n_chunks: u8, bytes: Vec }, - None, -} - impl TikvStore { pub(crate) async fn get_value(&self, key: impl Key) -> trc::Result> where U: Deserialize, { - let key_base = key.serialize(WITH_SUBSPACE); - let mut trx = self.snapshot_trx().await?; + let key = key.serialize(WITH_SUBSPACE); + let mut snapshot = self.snapshot_read().await?; - match read_chunked_value(&self, &key_base, &mut trx).await? { - ChunkedValue::Single(bytes) => U::deserialize(&bytes).map(Some), - ChunkedValue::Chunked { bytes, .. } => U::deserialize(&bytes).map(Some), - ChunkedValue::None => Ok(None), + match get_chunked_value(&key, &mut snapshot).await? 
{ + Some(bytes) => U::deserialize(&bytes).map(Some), + None => Ok(None) } } @@ -44,40 +35,40 @@ impl TikvStore { &self, mut key: BitmapKey>, ) -> trc::Result> { + let mut trx = self.snapshot_read().await?; let mut bm = RoaringBitmap::new(); - let begin = key.serialize(WITH_SUBSPACE); + + let mut begin = key.serialize(WITH_SUBSPACE); key.document_id = u32::MAX; + let mut end = key.serialize(WITH_SUBSPACE); + end.push(u8::MIN); // Inclusive let key_len = begin.len(); - let mut trx = self.snapshot_trx().await?; - - let mut begin_range = Bound::Included(TikvKey::from(begin)); - loop { - let end = key.serialize(WITH_SUBSPACE); - let end_range = Bound::Included(TikvKey::from(end)); - let range = BoundRange::new(begin_range, end_range); - - let keys = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + 'outer: loop { + let keys = trx + .scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE) .await .map_err(into_error)?; let mut count = 0; + let mut last_key = None; - let mut last_key = TikvKey::default(); for key in keys { count += 1; let key_slice: &[u8] = key.as_ref().into(); - if key_slice.len() == key_len { - bm.insert(key_slice.deserialize_be_u32(key_slice.len() - U32_LEN)?); + if key.len() == key_len { + bm.insert(key_slice.deserialize_be_u32(key.len() - U32_LEN)?); } - last_key = key; + last_key = Some(key) } - if count < MAX_SCAN_KEYS_SIZE { - break; - } else { - begin_range = Bound::Excluded(TikvKey::from(last_key)); + if count == MAX_SCAN_KEYS_SIZE { + // Guaranteed to have a key unless MAX_SCAN_KEYS_SIZE is 0 + begin = last_key.unwrap().into(); + begin.push(u8::MIN); // To make the start range exclusive continue; + } else { + break; } } @@ -89,79 +80,88 @@ impl TikvStore { params: IterateParams, mut cb: impl for<'x> FnMut(&'x [u8], &'x [u8]) -> trc::Result + Sync + Send, ) -> trc::Result<()> { - let begin = params.begin.serialize(WITH_SUBSPACE); - let end = params.end.serialize(WITH_SUBSPACE); + let mut begin = params.begin.serialize(WITH_SUBSPACE); + let mut end = params.end.serialize(WITH_SUBSPACE); + end.push(u8::MIN); // Inclusive - let mut trx = self.snapshot_trx().await?; + let mut trx = self.snapshot_read().await?; if !params.first { - // TODO: Get rid of repeating code if params.ascending { - let mut begin_range = Bound::Included(TikvKey::from(begin)); loop { - let end_range = Bound::Included(TikvKey::from(end.clone())); - let range = BoundRange::new(begin_range, end_range); - let kv_pairs = trx.scan(range, MAX_SCAN_VALUES_SIZE) + let keys = trx + .scan((begin, end.clone()), MAX_SCAN_VALUES_SIZE) .await .map_err(into_error)?; let mut count = 0; - let mut last_key = TikvKey::default(); - for kv_pair in kv_pairs { + let mut last_key = None; + for kv_pair in keys { count += 1; - let (key, value) = kv_pair.into(); - let key_slice: &[u8] = key.as_ref().into(); - println!("tf {:?}", &value); - if !cb(key_slice.get(1..).unwrap_or_default(), &value)? { + let key_slice: &[u8] = kv_pair.key().into(); + let value = kv_pair.value().as_slice(); + + if !cb(key_slice.get(1..).unwrap_or_default(), value)? 
{ return Ok(()); } - last_key = key; + + last_key = Some(kv_pair.into_key()); } - if count < MAX_SCAN_VALUES_SIZE { - break; - } else { - begin_range = Bound::Excluded(TikvKey::from(last_key)); + + if count == MAX_SCAN_VALUES_SIZE { + begin = last_key.unwrap().into(); + begin.push(u8::MIN); continue; + } else { + break; } } } else { - let mut end_range = Bound::Included(TikvKey::from(end)); loop { - let begin_range = Bound::Included(TikvKey::from(begin.clone())); - let range = BoundRange::new(begin_range, end_range); - let kv_pairs = trx.scan_reverse(range, MAX_SCAN_VALUES_SIZE) + let keys = trx + .scan_reverse((begin.clone(), end), MAX_SCAN_VALUES_SIZE) .await .map_err(into_error)?; let mut count = 0; - let mut last_key = TikvKey::default(); - for kv_pair in kv_pairs { + let mut last_key = None; + for kv_pair in keys { count += 1; - let (key, value) = kv_pair.into(); - let key_slice: &[u8] = key.as_ref().into(); - if !cb(key_slice.get(1..).unwrap_or_default(), &value)? { + let key_slice: &[u8] = kv_pair.key().into(); + let value = kv_pair.value().as_slice(); + + if !cb(key_slice.get(1..).unwrap_or_default(), value)? { return Ok(()); } - last_key = key; + + last_key = Some(kv_pair.into_key()); } - if count < MAX_SCAN_VALUES_SIZE { - break; - } else { - end_range = Bound::Excluded(TikvKey::from(last_key)); + + if count == MAX_SCAN_VALUES_SIZE { + end = last_key.unwrap().into(); continue; + } else { + break; } } } } else { - let mut possible_kv_pair = trx - .scan((begin, end), 1) - .await - .map_err(into_error)?; - - if let Some(kv_pair) = possible_kv_pair.next() { - let (key, value) = kv_pair.into(); - let key_slice: &[u8] = key.as_ref().into(); - cb(key_slice.get(1..).unwrap_or_default(), &value)?; + let result = if params.ascending { + trx.scan((begin, end), 1) + .await + .map_err(into_error)? + .next() + } else { + trx.scan_reverse((begin, end), 1) + .await + .map_err(into_error)? + .next() + }; + + if let Some(kv_pair) = result { + let key: &[u8] = kv_pair.key().into(); + let value = kv_pair.value().as_slice(); + cb(key.get(1..).unwrap_or_default(), value)?; } } @@ -175,7 +175,7 @@ impl TikvStore { let key = key.into().serialize(WITH_SUBSPACE); if let Some(bytes) = self - .snapshot_trx() + .snapshot_read() .await? .get(key.clone()) .await @@ -189,87 +189,60 @@ impl TikvStore { pub(crate) async fn read_trx(&self) -> trc::Result { self.trx_client - .begin_with_options( - TransactionOptions::new_optimistic() - .read_only() - .drop_check(CheckLevel::None) - ) + .begin_with_options(self.read_trx_options.clone()) .await .map_err(into_error) } - pub(crate) async fn snapshot_trx(&self) -> trc::Result { - let read_trx = self.read_trx().await?; + pub(crate) async fn snapshot_read(&self) -> trc::Result { + let current_timestamp = self + .trx_client + .current_timestamp() + .await + .map_err(into_error)?; - Ok(Snapshot::new(read_trx)) + Ok(self.trx_client.snapshot(current_timestamp, self.read_trx_options.clone())) } } -pub(crate) mod read_helpers { - use tikv_client::{BoundRange, KvPair, Snapshot, Transaction, Value}; +pub(super) mod chunking { use super::*; - - pub(crate) async fn read_chunked_value( - store: &TikvStore, + pub(in super::super) async fn get_chunked_value( key: &[u8], trx: &mut ReadTrx - ) -> trc::Result { - if let Some(mut bytes) = trx.get(key.to_vec()).await? 
{ - if bytes.len() < MAX_VALUE_SIZE as usize { - Ok(ChunkedValue::Single(bytes)) - } else { - let mut value = Vec::with_capacity(bytes.len() * 2); - value.append(&mut bytes); - let mut n_chunks = 1; - - let mut first = Bound::Included(TikvKey::from(KeySerializer::new(key.len() + 1) - .write(key) - .write(0u8) - .finalize())); - - 'outer: loop { - // Maybe use the last byte of the last key? - let mut count = 0; - - let last = Bound::Included(TikvKey::from(KeySerializer::new(key.len() + 1) - .write(key) - .write(u8::MAX) - .finalize())); - - let bound_range = BoundRange::new(first, last); + ) -> trc::Result>> { + let Some(mut bytes) = trx.get(key.to_vec()).await? else { + return Ok(None); + }; - let mut kv_pair_iter = trx.scan(bound_range, MAX_SCAN_VALUES_SIZE) - .await? - .peekable(); - - while let Some(kv_pair) = kv_pair_iter.next() { - let (key, mut kv_value) = kv_pair.into(); - value.append(&mut kv_value); - count += 1; - if kv_pair_iter.peek().is_none() { - n_chunks += count; - if count < MAX_KEY_SIZE { - break 'outer; - } - first = Bound::Excluded(key); - continue 'outer; - } - } + if bytes.len() != MAX_VALUE_SIZE { + return Ok(Some(bytes)) + } - // Empty - break; - } + let start_key = KeySerializer::new(key.len() + 1) + .write(key) + .write(u8::MIN) + .finalize(); + let end_key = KeySerializer::new(key.len() + 2) + .write(key) + .write(u8::MAX) + .write(u8::MIN) // Null byte to make the end inclusive + .finalize(); + + let mut keys: Vec = trx + .scan_keys((start_key, end_key), 256 + 1) + .await? + .collect(); - Ok(ChunkedValue::Chunked { - bytes: value, - n_chunks: *key.last().unwrap(), - }) - } - } else { - Ok(ChunkedValue::None) + for chunk_key in keys { + // Any scanned keys are guaranteed to have a value + let mut value = trx.get(chunk_key).await?.unwrap(); + bytes.append(&mut value); } + + Ok(Some(bytes)) } trait ReadTransaction { @@ -360,5 +333,4 @@ pub(crate) mod read_helpers { self.scan_keys_reverse(range, limit).await.map_err(into_error) } } -} - +} \ No newline at end of file diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index f9a7c8eda..cf26f16bd 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -20,19 +20,18 @@ use crate::{ }, BitmapKey, IndexKey, Key, LogKey, SUBSPACE_COUNTER, SUBSPACE_QUOTA, U32_LEN, WITH_SUBSPACE, }; -use crate::backend::tikv::read::read_helpers::read_chunked_value; -use crate::write::key; -use super::{into_error, read::{ChunkedValue}, TikvStore, ReadVersion, MAX_VALUE_SIZE, MAX_SCAN_KEYS_SIZE}; +use super::write::chunking::{put_chunked_value, delete_chunked_value}; +use super::read::chunking::get_chunked_value; +use super::{into_error, TikvStore, MAX_VALUE_SIZE, MAX_SCAN_KEYS_SIZE}; impl TikvStore { pub(crate) async fn write(&self, batch: Batch) -> trc::Result { - println!("write"); let mut account_id = u32::MAX; let mut collection = u8::MAX; let mut document_id = u32::MAX; let mut change_id = u64::MAX; - let mut backoff = self.raw_backoff.clone(); + let mut backoff = self.backoff.clone(); loop { let mut result = AssignedIds::default(); @@ -62,44 +61,19 @@ impl TikvStore { change_id = *change_id_; } Operation::Value { class, op } => { - let mut key = class.serialize( + let key = class.serialize( account_id, collection, document_id, WITH_SUBSPACE, (&result).into(), ); - let do_chunk = !class.is_counter(collection); match op { ValueOp::Set(value) => { let value = value.resolve(&result)?; - if !value.is_empty() && do_chunk { - for (pos, chunk) in 
value.chunks(MAX_VALUE_SIZE as usize).enumerate() { - match pos.cmp(&1) { - Ordering::Less => {} - Ordering::Equal => { - key.push(0); - } - Ordering::Greater => { - if pos < u8::MAX as usize { - *key.last_mut().unwrap() += 1; - } else { - trx.rollback().await.map_err(into_error)?; - return Err(trc::StoreEvent::TikvError - .ctx( - trc::Key::Reason, - "Value is too large", - )); - } - } - } - trx.put(key.clone(), chunk).await.map_err(into_error)?; - } - } else { - trx.put(key, value.into_owned()).await.map_err(into_error)?; - } + put_chunked_value(&key, &value, &mut trx, false).await?; } ValueOp::AtomicAdd(by) => { get_and_add(&mut trx, key, *by).await?; @@ -109,45 +83,7 @@ impl TikvStore { result.push_counter_id(num); } ValueOp::Clear => { - if do_chunk { - let end_key = KeySerializer::new(key.len() + 1) - .write(key.as_slice()) - .write(u8::MAX) - .finalize(); - let mut begin = Bound::Included(TikvKey::from(key)); - let end = Bound::Included(TikvKey::from(end_key)); - - 'outer: loop { - let range = BoundRange::new(begin, end.clone()); - let mut keys_iter = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) - .await - .map_err(into_error)? - .peekable(); - - let mut count = 0; - while let Some(key) = keys_iter.next() { - count += 1; - if keys_iter.peek().is_none() { - if count < MAX_SCAN_KEYS_SIZE { - trx.delete(key).await.map_err(into_error)?; - break 'outer; - } else { - begin = Bound::Excluded(key.clone()); - trx.delete(key).await.map_err(into_error)?; - continue 'outer; - } - } else { - trx.delete(key).await.map_err(into_error)?; - } - } - - // Empty - break; - } - - } else { - trx.delete(key).await.map_err(into_error)?; - } + delete_chunked_value(&key, &mut trx, false).await?; } } } @@ -172,61 +108,51 @@ impl TikvStore { && document_id == u32::MAX; if assign_id { - let begin = BitmapKey { + let mut begin = BitmapKey { account_id, collection, class: BitmapClass::DocumentIds, document_id: 0, }.serialize(WITH_SUBSPACE); - let end = BitmapKey { + let mut end = BitmapKey { account_id, collection, class: BitmapClass::DocumentIds, document_id: u32::MAX, }.serialize(WITH_SUBSPACE); + end.push(u8::MIN); // Null byte to make the end inclusive + let key_len = begin.len(); - let mut begin_bound = Bound::Included(TikvKey::from(begin)); - let end_bound = Bound::Included(TikvKey::from(end)); let mut found_ids = RoaringBitmap::new(); + 'outer: loop { - let range = BoundRange::new(begin_bound, end_bound.clone()); - let mut keys_iter = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + let mut keys = trx.scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE) .await .map_err(into_error)? 
.peekable(); - let mut count = 0; - while let Some(key) = keys_iter.next() { + let mut count = 0; + while let Some(key) = keys.next() { count += 1; if key.len() == key_len { let key_slice: &[u8] = key.as_ref().into(); - let found_id = key_slice - .deserialize_be_u32(key_len - U32_LEN)?; - found_ids.insert(found_id); + found_ids.insert(key_slice.deserialize_be_u32(key_len - U32_LEN)?); } else { - if count < MAX_SCAN_KEYS_SIZE { - - break 'outer; - } else { - begin_bound = Bound::Excluded(key); - continue 'outer; - } + break 'outer; } - let key_slice: &[u8] = key.as_ref().into(); - found_ids.insert(key_slice.deserialize_be_u32(key_len - U32_LEN)?); - if keys_iter.peek().is_none() { - if count < MAX_SCAN_KEYS_SIZE { + if keys.peek().is_none() { + if count < MAX_SCAN_KEYS_SIZE { break 'outer; } else { - begin_bound = Bound::Excluded(key); + begin = key.into(); + begin.push(u8::MIN); // Null byte to make the beginning exclusive continue 'outer; } } } - // Empty break; } @@ -244,18 +170,18 @@ impl TikvStore { ); if *set { - let mut begin = Bound::Included(TikvKey::from(key)); - let end = Bound::Included(TikvKey::from(class.serialize( + let mut begin = key; + let mut end = class.serialize( account_id, collection, document_id + 1, WITH_SUBSPACE, (&result).into(), - ))); + ); + end.push(u8::MIN); loop { - let range = BoundRange::new(begin, end.clone()); - let keys: Vec = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + let keys: Vec = trx.scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE) .await .map_err(into_error)? .collect(); @@ -265,7 +191,7 @@ impl TikvStore { break; } else { // Guaranteed to have the last value - begin = Bound::Excluded(keys.last().unwrap().clone()); + begin = keys.last().unwrap().clone().into(); trx.lock_keys(keys).await.map_err(into_error)?; continue; } @@ -287,7 +213,7 @@ impl TikvStore { class, assert_value, } => { - let key_base = class.serialize( + let key = class.serialize( account_id, collection, document_id, @@ -295,12 +221,9 @@ impl TikvStore { (&result).into(), ); - let matches = match read_chunked_value(&self, &key_base, &mut trx).await { - Ok(ChunkedValue::Single(bytes)) => assert_value.matches(bytes.as_slice()), - Ok(ChunkedValue::Chunked { bytes, .. }) => { - assert_value.matches(bytes.as_ref()) - } - Ok(ChunkedValue::None) => { + let matches = match get_chunked_value(&key, &mut trx).await { + Ok(Some(bytes)) => assert_value.matches(bytes.as_slice()), + Ok(None) => { assert_value.is_none() } Err(_) => false, @@ -332,7 +255,7 @@ impl TikvStore { // Since we are deleting all of them anyways. 
No point moving the start bound let mut begin = Bound::Included(TikvKey::from(from_key.to_vec())); - let mut backoff = self.raw_backoff.clone(); + let mut backoff = self.backoff.clone(); 'outer: loop { let end = Bound::Included(TikvKey::from(to_key.to_vec())); @@ -373,99 +296,33 @@ impl TikvStore { } pub(crate) async fn delete_range(&self, from: impl Key, to: impl Key) -> trc::Result<()> { - let from_vec = from.serialize(WITH_SUBSPACE); - let to_vec = to.serialize(WITH_SUBSPACE); - let mut trx = self.write_trx_with_backoff().await?; - - let mut begin = Bound::Included(TikvKey::from(KeySerializer::new(from_vec.len()) - .write(from_vec.as_slice()) - .finalize())); + let begin_range = Bound::Included(TikvKey::from(from.serialize(WITH_SUBSPACE))); + let end_range = Bound::Included(TikvKey::from(to.serialize(WITH_SUBSPACE))); + let range = BoundRange::new(begin_range, end_range); - 'outer: loop { - let end = Bound::Included(TikvKey::from(KeySerializer::new(to_vec.len()) - .write(from_vec.as_slice()) - .finalize())); + let mut trx = self.write_trx_with_backoff().await?; - let range = BoundRange::new(begin, end); - let mut keys_iter = trx.scan_keys(range, MAX_SCAN_KEYS_SIZE) + loop { + let keys = trx + .scan_keys(range.clone(), MAX_SCAN_KEYS_SIZE) .await - .map_err(into_error)? - .peekable(); + .map_err(into_error)?; let mut count = 0; - while let Some(key) = keys_iter.next() { + for key in keys { count += 1; - if keys_iter.peek().is_none() { - if count < MAX_SCAN_KEYS_SIZE { - trx.delete(key).await.map_err(into_error)?; - break 'outer; - } else { - begin = Bound::Excluded(key.clone()); - trx.delete(key).await.map_err(into_error)?; - continue 'outer; - } - } else { - trx.delete(key).await.map_err(into_error)?; - } + trx.delete(key).await.map_err(into_error)?; } - break; + if count != MAX_SCAN_KEYS_SIZE { + break; + } } trx.commit().await.map_err(into_error)?; Ok(()) } - // async fn atomic_subtract(&self, key: impl Into + Clone, by: i64) -> trc::Result<()> { - // let mut backoff = self.raw_backoff.clone(); - // - // loop { - // let key = key.clone().into(); - // let mut trx = self.write_trx_no_backoff().await?; - // if let Some(previous) = trx.get_for_update(key.clone()).await.map_err(into_error)? { - // let subtrahend = deserialize_i64_le((&key).into(), &previous)?; - // let difference = subtrahend - by; - // - // if difference == 0 { - // trx.delete(key).await.map_err(into_error)?; - // } else { - // trx.put(key, difference.to_le_bytes().as_slice()).await.map_err(into_error)?; - // } - // } else { - // trx.put(key, by.to_le_bytes().as_slice()).await.map_err(into_error)?; - // } - // - // if self.commit(trx, Some(&mut backoff)).await? { - // return Ok(()); - // } else { - // continue; - // } - // } - // } - // - // async fn atomic_add(&self, key: impl Into + Clone, by: i64) -> trc::Result<()> { - // let mut backoff = self.raw_backoff.clone(); - // - // loop { - // let key = key.clone().into(); - // let mut trx = self.write_trx_no_backoff().await?; - // if let Some(previous) = trx.get_for_update(key.clone()).await.map_err(into_error)? { - // let addend = deserialize_i64_le((&key).into(), &previous)?; - // let sum = addend + by; - // - // trx.put(key, sum.to_le_bytes().as_slice()).await.map_err(into_error)?; - // } else { - // trx.put(key, by.to_le_bytes().as_slice()).await.map_err(into_error)?; - // } - // - // if self.commit(trx, Some(&mut backoff)).await? 
{ - // return Ok(()); - // } else { - // continue; - // } - // } - // } - pub(crate) async fn commit(&self, mut trx: Transaction, ext_backoff: Option<&mut Backoff>) -> trc::Result { if let Err(e) = trx.commit().await { if let Some(backoff) = ext_backoff { @@ -482,8 +339,7 @@ impl TikvStore { } } - async fn write_trx_no_backoff(&self) -> trc::Result { - // TODO: Put inside struct + pub(super) async fn write_trx_no_backoff(&self) -> trc::Result { let write_trx_options = TransactionOptions::new_pessimistic() .drop_check(CheckLevel::Warn) .use_async_commit() @@ -495,7 +351,7 @@ impl TikvStore { .map_err(into_error) } - async fn write_trx_with_backoff(&self) -> trc::Result { + pub(super) async fn write_trx_with_backoff(&self) -> trc::Result { self.trx_client .begin_with_options(self.write_trx_options.clone()) .await @@ -514,4 +370,71 @@ async fn get_and_add(trx: &mut Transaction, key: impl Into, by: i64) -> trx.put(key, by.to_le_bytes().as_slice()).await.map_err(into_error)?; Ok(by) } +} + +pub(super) mod chunking { + use super::*; + + pub(in super::super) async fn delete_chunked_value( + key: &[u8], + trx: &mut Transaction, + commit: bool, + ) -> trc::Result<()> { + let begin_key = key.to_vec(); + + let end_key = KeySerializer::new(key.len() + 1) + .write(key) + .write(u8::MAX) + .finalize(); + + let keys = trx.scan_keys((begin_key, end_key), 256) + .await + .map_err(into_error)?; + + for chunk_key in keys { + trx.delete(chunk_key).await.map_err(into_error)?; + } + + if commit { + trx.commit().await.map_err(into_error)?; + } + + Ok(()) + } + + pub(in super::super) async fn put_chunked_value( + key: &[u8], + value: &[u8], + trx: &mut Transaction, + commit: bool + ) -> trc::Result<()> { + let mut chunk_iter = value.chunks(MAX_VALUE_SIZE); + + if chunk_iter.len() > 1 + 256 { + // Expected to be thrown back so might as well roll it back. 
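// NOTE: The `1 + 256` bound above mirrors the on-disk layout used by this
// chunking module: chunk 0 lives under the base key itself, and every
// following chunk lives under the base key plus a single position byte, so a
// value can span at most 257 chunks (hence MAX_CHUNKED_SIZED =
// MAX_VALUE_SIZE * (1 + 256)). A sketch of the layout, using this module's
// constants (`chunk_keys` is a hypothetical helper shown for illustration):
//
//     // key           -> chunk 0 (always written, may be short or empty)
//     // key ++ [0x00] -> chunk 1
//     // key ++ [0x01] -> chunk 2
//     // ...
//     // key ++ [0xFF] -> chunk 256
//     fn chunk_keys(key: &[u8], value_len: usize) -> Vec<Vec<u8>> {
//         let n_chunks = ((value_len + MAX_VALUE_SIZE - 1) / MAX_VALUE_SIZE).max(1);
//         let mut keys = vec![key.to_vec()];
//         for pos in 0..n_chunks - 1 {
//             let mut chunk_key = key.to_vec();
//             chunk_key.push(pos as u8); // position byte of the continuation chunk
//             keys.push(chunk_key);
//         }
//         keys
//     }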
+ trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::TikvError + .ctx( + trc::Key::Reason, + "Value is too large", + )); + } + + let first_chunk = chunk_iter.next().unwrap_or_else(|| &[]); + trx.put(key.to_vec(), first_chunk).await.map_err(into_error)?; + + for (chunk_pos, value_chunk) in chunk_iter.enumerate() { + let chunk_key = KeySerializer::new(key.len() + 1) + .write(key) + .write(chunk_pos as u8) + .finalize(); + trx.put(chunk_key, value_chunk).await.map_err(into_error)?; + } + + if commit { + trx.commit().await.map_err(into_error)?; + } + + Ok(()) + } } \ No newline at end of file diff --git a/crates/trc/src/event/description.rs b/crates/trc/src/event/description.rs index a11c43b44..0237fe363 100644 --- a/crates/trc/src/event/description.rs +++ b/crates/trc/src/event/description.rs @@ -1530,6 +1530,7 @@ impl StoreEvent { StoreEvent::PostgresqlError => "PostgreSQL error", StoreEvent::RocksdbError => "RocksDB error", StoreEvent::SqliteError => "SQLite error", + StoreEvent::TikvError => "TiKV error", StoreEvent::LdapError => "LDAP error", StoreEvent::ElasticsearchError => "ElasticSearch error", StoreEvent::RedisError => "Redis error", @@ -1564,6 +1565,7 @@ impl StoreEvent { StoreEvent::PostgresqlError => "A PostgreSQL error occurred", StoreEvent::RocksdbError => "A RocksDB error occurred", StoreEvent::SqliteError => "An SQLite error occurred", + StoreEvent::TikvError => "A TiKV error occured", StoreEvent::LdapError => "An LDAP error occurred", StoreEvent::ElasticsearchError => "An ElasticSearch error occurred", StoreEvent::RedisError => "A Redis error occurred", diff --git a/crates/trc/src/event/level.rs b/crates/trc/src/event/level.rs index 4142f0457..b9a16a6d8 100644 --- a/crates/trc/src/event/level.rs +++ b/crates/trc/src/event/level.rs @@ -27,6 +27,7 @@ impl EventType { | StoreEvent::PostgresqlError | StoreEvent::RocksdbError | StoreEvent::SqliteError + | StoreEvent::TikvError | StoreEvent::LdapError | StoreEvent::ElasticsearchError | StoreEvent::RedisError diff --git a/crates/trc/src/serializers/binary.rs b/crates/trc/src/serializers/binary.rs index 0ba4f681c..fd682c053 100644 --- a/crates/trc/src/serializers/binary.rs +++ b/crates/trc/src/serializers/binary.rs @@ -854,6 +854,7 @@ impl EventType { EventType::Tls(TlsEvent::MultipleCertificatesAvailable) => 545, EventType::Tls(TlsEvent::NoCertificatesAvailable) => 546, EventType::Tls(TlsEvent::NotConfigured) => 547, + EventType::Store(StoreEvent::TikvError) => 548, } } @@ -1447,6 +1448,7 @@ impl EventType { 545 => Some(EventType::Tls(TlsEvent::MultipleCertificatesAvailable)), 546 => Some(EventType::Tls(TlsEvent::NoCertificatesAvailable)), 547 => Some(EventType::Tls(TlsEvent::NotConfigured)), + 548 => Some(EventType::Store(StoreEvent::TikvError)), _ => None, } } From 0f497b106352d7bf356c5a959f8aa0bef81d70f7 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Mon, 26 Aug 2024 18:57:17 +1000 Subject: [PATCH 09/13] Add TiKV: Add .into() to allow compilation --- crates/common/src/telemetry/metrics/prometheus.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/common/src/telemetry/metrics/prometheus.rs b/crates/common/src/telemetry/metrics/prometheus.rs index 35ff3c016..836a56420 100644 --- a/crates/common/src/telemetry/metrics/prometheus.rs +++ b/crates/common/src/telemetry/metrics/prometheus.rs @@ -28,7 +28,7 @@ impl Core { metric.set_name(metric_name(counter.id().name())); metric.set_help(counter.id().description().into()); metric.set_field_type(MetricType::COUNTER); - 
metric.set_metric(vec![new_counter(counter.value())]); + metric.set_metric(vec![new_counter(counter.value())].into()); metrics.push(metric); } @@ -38,7 +38,7 @@ impl Core { metric.set_name(metric_name(gauge.id().name())); metric.set_help(gauge.id().description().into()); metric.set_field_type(MetricType::GAUGE); - metric.set_metric(vec![new_gauge(gauge.get())]); + metric.set_metric(vec![new_gauge(gauge.get())].into()); metrics.push(metric); } @@ -48,7 +48,7 @@ impl Core { metric.set_name(metric_name(histogram.id().name())); metric.set_help(histogram.id().description().into()); metric.set_field_type(MetricType::HISTOGRAM); - metric.set_metric(vec![new_histogram(histogram)]); + metric.set_metric(vec![new_histogram(histogram)].into()); metrics.push(metric); } From 0f14a5d3fd17c93462df9c64e443b2380a505713 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Wed, 28 Aug 2024 09:23:37 +1000 Subject: [PATCH 10/13] Add TiKV: Remove debug println from FoundationDB --- crates/store/src/backend/foundationdb/write.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/store/src/backend/foundationdb/write.rs b/crates/store/src/backend/foundationdb/write.rs index 11a1c04f2..0eb55082c 100644 --- a/crates/store/src/backend/foundationdb/write.rs +++ b/crates/store/src/backend/foundationdb/write.rs @@ -109,7 +109,6 @@ impl FdbStore { } } ValueOp::AtomicAdd(by) => { - println!("fdb atomic add key: {:?} val: {:?} ", key, by.to_le_bytes()); trx.atomic_op(&key, &by.to_le_bytes()[..], MutationType::Add); } ValueOp::AddAndGet(by) => { From 25af5717352f6c26bf4cc06ac3cbe90440acd47c Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Wed, 28 Aug 2024 09:39:06 +1000 Subject: [PATCH 11/13] Add TiKV: Clean up and save --- crates/store/src/backend/tikv/main.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index 47d3b1760..6f27031f9 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -84,9 +84,6 @@ impl TikvStore { .retry_options(RetryOptions::none()) .read_only(); - // Used for write transactions - let backoff = backoff; - let store = Self { trx_client, write_trx_options, From 13712464657a0fc17d68150508cb8eff5fb76676 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Thu, 29 Aug 2024 15:04:54 +1000 Subject: [PATCH 12/13] Add TiKV: Add timestamp lock --- crates/store/src/backend/tikv/main.rs | 23 +- crates/store/src/backend/tikv/mod.rs | 1 + crates/store/src/backend/tikv/write.rs | 392 +++++++++++++------------ 3 files changed, 222 insertions(+), 194 deletions(-) diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index 6f27031f9..fbed11e67 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -6,7 +6,7 @@ use std::time::Duration; use tikv_client::{Backoff, CheckLevel, RetryOptions, TransactionClient, TransactionOptions}; use utils::config::{utils::AsKey, Config}; -use super::TikvStore; +use super::{into_error, TikvStore}; impl TikvStore { pub async fn open(config: &mut Config, prefix: impl AsKey) -> Option { @@ -30,15 +30,15 @@ impl TikvStore { let backoff_min_delay = config .property::((&prefix, "transaction.backoff-min-delay")) - .unwrap_or_else(|| Duration::from_millis(2)); + .unwrap_or_else(|| Duration::from_millis(500)); let backoff_max_delay = config .property::((&prefix, "transaction.backoff-max-delay")) - .unwrap_or_else(|| Duration::from_millis(500)); + .unwrap_or_else(|| Duration::from_millis(30000)); 
let max_attempts = config .property::((&prefix, "transaction.backoff-retry-limit")) - .unwrap_or_else(|| 10); + .unwrap_or_else(|| 30); let backoff = if let Some(backoff_type) = config .property::((&prefix, "transaction.backoff-type")) { @@ -68,12 +68,18 @@ impl TikvStore { } } else { // Default - Backoff::full_jitter_backoff( + Backoff::decorrelated_jitter_backoff( backoff_min_delay.as_millis() as u64, backoff_max_delay.as_millis() as u64, max_attempts ) + // Backoff::full_jitter_backoff( + // backoff_min_delay.as_millis() as u64, + // backoff_max_delay.as_millis() as u64, + // max_attempts + // ) }; + println!("using backoff {:?}", backoff); let write_trx_options = TransactionOptions::new_pessimistic() .drop_check(CheckLevel::Warn) @@ -84,10 +90,17 @@ impl TikvStore { .retry_options(RetryOptions::none()) .read_only(); + let current_timestamp = trx_client.current_timestamp().await.map_err(|err| { + config.new_build_error( + prefix.as_str(), + format!("Failed to create TiKV database: {err:?}"), + )}).ok()?; + let store = Self { trx_client, write_trx_options, read_trx_options, + version: parking_lot::Mutex::new(current_timestamp), backoff, }; diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs index f1ae84b4c..82a4bb62b 100644 --- a/crates/store/src/backend/tikv/mod.rs +++ b/crates/store/src/backend/tikv/mod.rs @@ -39,6 +39,7 @@ pub struct TikvStore { trx_client: TransactionClient, write_trx_options: TransactionOptions, read_trx_options: TransactionOptions, + version: parking_lot::Mutex, backoff: Backoff, } diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index cf26f16bd..350700553 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -6,11 +6,13 @@ use std::{cmp::Ordering, iter, time::{Duration, Instant}}; use std::collections::Bound; +use std::ops::DerefMut; use tikv_client::{Backoff, BoundRange, CheckLevel, Key as TikvKey, RetryOptions, TimestampExt, Transaction, Value}; use rand::Rng; use roaring::RoaringBitmap; use tikv_client::TransactionOptions; use tikv_client::proto::kvrpcpb::{Assertion, Mutation, Op}; +use tikv_client::transaction::ResolveLocksOptions; use crate::{ backend::deserialize_i64_le, write::{ @@ -20,229 +22,241 @@ use crate::{ }, BitmapKey, IndexKey, Key, LogKey, SUBSPACE_COUNTER, SUBSPACE_QUOTA, U32_LEN, WITH_SUBSPACE, }; +use crate::write::key; use super::write::chunking::{put_chunked_value, delete_chunked_value}; use super::read::chunking::get_chunked_value; use super::{into_error, TikvStore, MAX_VALUE_SIZE, MAX_SCAN_KEYS_SIZE}; impl TikvStore { pub(crate) async fn write(&self, batch: Batch) -> trc::Result { - let mut account_id = u32::MAX; - let mut collection = u8::MAX; - let mut document_id = u32::MAX; - let mut change_id = u64::MAX; - let mut backoff = self.backoff.clone(); loop { - let mut result = AssignedIds::default(); - let mut trx = self.write_trx_no_backoff().await?; - for op in &batch.ops { - match op { - Operation::AccountId { - account_id: account_id_, - } => { - account_id = *account_id_; - } - Operation::Collection { - collection: collection_, - } => { - collection = *collection_; - } - Operation::DocumentId { - document_id: document_id_, - } => { - document_id = *document_id_; - } - Operation::ChangeId { - change_id: change_id_, - } => { - change_id = *change_id_; - } - Operation::Value { class, op } => { - let key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); + match 
self.write_trx(&mut trx, &batch).await { + Ok(result) => return Ok(result), + Err(err) => { + let _ = trx.rollback().await; + let version = self.version.lock().clone(); + self.trx_client.gc(version).await.map_err(into_error)?; + //self.trx_client.cleanup_locks(BoundRange::range_from(TikvKey::from(vec![])), &ts, ResolveLocksOptions::default()).await.map_err(into_error)?; + let Some(backoff_duration) = backoff.next_delay_duration() else { + return Err(err); + }; + println!("backoff for {} secs with {} attempts", backoff_duration.as_secs_f32(), backoff.current_attempts()); + tokio::time::sleep(backoff_duration).await; + continue; + } + } + } - match op { - ValueOp::Set(value) => { - let value = value.resolve(&result)?; - put_chunked_value(&key, &value, &mut trx, false).await?; - } - ValueOp::AtomicAdd(by) => { - get_and_add(&mut trx, key, *by).await?; - } - ValueOp::AddAndGet(by) => { - let num = get_and_add(&mut trx, key, *by).await?; - result.push_counter_id(num); + } + + async fn write_trx(&self, trx: &mut Transaction, batch: &Batch) -> trc::Result { + let mut account_id = u32::MAX; + let mut collection = u8::MAX; + let mut document_id = u32::MAX; + let mut change_id = u64::MAX; + let mut result = AssignedIds::default(); + + for op in &batch.ops { + match op { + Operation::AccountId { + account_id: account_id_, + } => { + account_id = *account_id_; + } + Operation::Collection { + collection: collection_, + } => { + collection = *collection_; + } + Operation::DocumentId { + document_id: document_id_, + } => { + document_id = *document_id_; + } + Operation::ChangeId { + change_id: change_id_, + } => { + change_id = *change_id_; + } + Operation::Value { class, op } => { + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + println!("writing key: {:?}", key); + let do_chunk = !class.is_counter(collection); + + match op { + ValueOp::Set(value) => { + let value = value.resolve(&result)?; + if do_chunk { + put_chunked_value(&key, &value, trx, false).await?; + } else { + trx.put(key, value.as_ref()).await.map_err(into_error)?; } - ValueOp::Clear => { - delete_chunked_value(&key, &mut trx, false).await?; + } + ValueOp::AtomicAdd(by) => { + get_and_add(trx, key, *by).await?; + } + ValueOp::AddAndGet(by) => { + let num = get_and_add(trx, key, *by).await?; + result.push_counter_id(num); + } + ValueOp::Clear => { + if do_chunk { + delete_chunked_value(&key, trx, false).await?; + } else { + trx.delete(key).await.map_err(into_error)?; } } } - Operation::Index { field, key, set } => { - let key = IndexKey { + } + Operation::Index { field, key, set } => { + let key = IndexKey { + account_id, + collection, + document_id, + field: *field, + key, + }.serialize(0); + println!("writing index key: {:?}", key); + + if *set { + trx.put(key, &[]).await.map_err(into_error)?; + } else { + trx.delete(key).await.map_err(into_error)?; + } + } + Operation::Bitmap { class, set } => { + let assign_id = *set + && matches!(class, BitmapClass::DocumentIds) + && document_id == u32::MAX; + + if assign_id { + let mut begin = BitmapKey { account_id, collection, - document_id, - field: *field, - key, - }.serialize(0); - - if *set { - trx.put(key, &[]).await.map_err(into_error)?; - } else { - trx.delete(key).await.map_err(into_error)?; - } - } - Operation::Bitmap { class, set } => { - let assign_id = *set - && matches!(class, BitmapClass::DocumentIds) - && document_id == u32::MAX; - - if assign_id { - let mut begin = BitmapKey { - account_id, - collection, - class: 
BitmapClass::DocumentIds, - document_id: 0, - }.serialize(WITH_SUBSPACE); - let mut end = BitmapKey { - account_id, - collection, - class: BitmapClass::DocumentIds, - document_id: u32::MAX, - }.serialize(WITH_SUBSPACE); - end.push(u8::MIN); // Null byte to make the end inclusive - - - let key_len = begin.len(); - - let mut found_ids = RoaringBitmap::new(); - - 'outer: loop { - let mut keys = trx.scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE) - .await - .map_err(into_error)? - .peekable(); - - let mut count = 0; - while let Some(key) = keys.next() { - count += 1; - if key.len() == key_len { - let key_slice: &[u8] = key.as_ref().into(); - found_ids.insert(key_slice.deserialize_be_u32(key_len - U32_LEN)?); - } else { - break 'outer; - } + class: BitmapClass::DocumentIds, + document_id: 0, + }.serialize(WITH_SUBSPACE); + let mut end = BitmapKey { + account_id, + collection, + class: BitmapClass::DocumentIds, + document_id: u32::MAX, + }.serialize(WITH_SUBSPACE); + end.push(u8::MIN); // Null byte to make the end inclusive - if keys.peek().is_none() { - if count < MAX_SCAN_KEYS_SIZE { - break 'outer; - } else { - begin = key.into(); - begin.push(u8::MIN); // Null byte to make the beginning exclusive - continue 'outer; - } - } - } - // Empty - break; - } - document_id = found_ids.random_available_id(); - result.push_document_id(document_id); - } + let key_len = begin.len(); - let key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - - if *set { - let mut begin = key; - let mut end = class.serialize( - account_id, - collection, - document_id + 1, - WITH_SUBSPACE, - (&result).into(), - ); - end.push(u8::MIN); - - loop { - let keys: Vec = trx.scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE) - .await - .map_err(into_error)? - .collect(); - - if keys.len() < MAX_SCAN_KEYS_SIZE as usize { - trx.lock_keys(keys).await.map_err(into_error)?; - break; + let mut found_ids = RoaringBitmap::new(); + + 'outer: loop { + println!("scanning keys {:?} and {:?}", begin, end); + let mut keys = trx.scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE) + .await + .map_err(into_error)? 
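// NOTE: This scan uses the pagination pattern that recurs throughout the
// backend: fetch at most MAX_SCAN_KEYS_SIZE keys, and if a full page came
// back, restart the scan just past the last key seen. Pushing a null byte
// onto the previous last key turns the inclusive start bound into an
// effectively exclusive one, because no key sorts between `k` and
// `k ++ [0x00]`. A standalone sketch of the pattern (`scan_all_keys` is a
// hypothetical helper, not part of this patch):
//
//     async fn scan_all_keys(
//         trx: &mut Transaction,
//         mut begin: Vec<u8>,
//         end: Vec<u8>,
//     ) -> trc::Result<Vec<TikvKey>> {
//         let mut out = Vec::new();
//         loop {
//             let mut count = 0u32;
//             for key in trx
//                 .scan_keys((begin.clone(), end.clone()), MAX_SCAN_KEYS_SIZE)
//                 .await
//                 .map_err(into_error)?
//             {
//                 count += 1;
//                 out.push(key);
//             }
//             if count < MAX_SCAN_KEYS_SIZE {
//                 return Ok(out);
//             }
//             // Full page: continue from just past the last key.
//             begin = out.last().unwrap().clone().into();
//             begin.push(u8::MIN);
//         }
//     }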
+ .peekable(); + + let mut count = 0; + while let Some(key) = keys.next() { + count += 1; + let key_slice: &[u8] = key.as_ref().into(); + println!("found key {:?}", key_slice); + if key_slice.len() == key_len { + found_ids.insert(key_slice.deserialize_be_u32(key_len - U32_LEN)?); } else { - // Guaranteed to have the last value - begin = keys.last().unwrap().clone().into(); - trx.lock_keys(keys).await.map_err(into_error)?; - continue; + break 'outer; + } + + if keys.peek().is_none() { + if count < MAX_SCAN_KEYS_SIZE { + break 'outer; + } else { + begin = key.into(); + begin.push(u8::MIN); // Null byte to make the beginning exclusive + continue 'outer; + } } } - } else { - trx.delete(key).await.map_err(into_error)?; + // Empty + break; } - } - Operation::Log { set } => { - let key = LogKey { - account_id, - collection, - change_id, - }.serialize(WITH_SUBSPACE); - trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; + document_id = found_ids.random_available_id(); + println!("using document id: {} from found IDs: {:?}", document_id, found_ids); + result.push_document_id(document_id); } - Operation::AssertValue { - class, - assert_value, - } => { - let key = class.serialize( - account_id, - collection, - document_id, - WITH_SUBSPACE, - (&result).into(), - ); - - let matches = match get_chunked_value(&key, &mut trx).await { - Ok(Some(bytes)) => assert_value.matches(bytes.as_slice()), - Ok(None) => { - assert_value.is_none() - } - Err(_) => false, - }; - if !matches { - trx.rollback().await.map_err(into_error)?; - return Err(trc::StoreEvent::AssertValueFailed.into()); + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + + if *set { + trx.lock_keys([key.clone()]).await.map_err(into_error)?; + trx.put(key, &[]).await.map_err(into_error)?; + } else { + trx.delete(key).await.map_err(into_error)?; + } + } + Operation::Log { set } => { + let key = LogKey { + account_id, + collection, + change_id, + }.serialize(WITH_SUBSPACE); + + trx.put(key, set.resolve(&result)?.as_ref()).await.map_err(into_error)?; + } + Operation::AssertValue { + class, + assert_value, + } => { + let key = class.serialize( + account_id, + collection, + document_id, + WITH_SUBSPACE, + (&result).into(), + ); + + let matches = match get_chunked_value(&key, trx).await { + Ok(Some(bytes)) => assert_value.matches(bytes.as_slice()), + Ok(None) => { + assert_value.is_none() } + Err(_) => false, + }; + + if !matches { + trx.rollback().await.map_err(into_error)?; + return Err(trc::StoreEvent::AssertValueFailed.into()); } } } + } - if self.commit(trx, Some(&mut backoff)).await? { - return Ok(result) - } else { - continue; - } + if let Some(ts) = trx.commit().await.map_err(into_error)? { + let mut previous = self.version.lock(); + *previous = ts; + } + if ! result.counter_ids.is_empty() || ! 
result.document_ids.is_empty() { + println!("success with counters: [{:?}] and doc ids: [{:?}]", result.counter_ids, result.document_ids); } + Ok(result) } pub(crate) async fn purge_store(&self) -> trc::Result<()> { @@ -340,7 +354,7 @@ impl TikvStore { } pub(super) async fn write_trx_no_backoff(&self) -> trc::Result { - let write_trx_options = TransactionOptions::new_pessimistic() + let write_trx_options = TransactionOptions::new_optimistic() .drop_check(CheckLevel::Warn) .use_async_commit() .retry_options(RetryOptions::none()); From 71dc38a82c790441b7ad1ddfced33f1e04ef10f4 Mon Sep 17 00:00:00 2001 From: Alvin Peters Date: Thu, 29 Aug 2024 21:28:13 +1000 Subject: [PATCH 13/13] Add TiKV: Save work Passed all tests until Store query tests --- crates/store/src/backend/tikv/blob.rs | 2 +- crates/store/src/backend/tikv/main.rs | 6 ++-- crates/store/src/backend/tikv/mod.rs | 2 +- crates/store/src/backend/tikv/read.rs | 2 +- crates/store/src/backend/tikv/write.rs | 50 +++++++++++++++++--------- 5 files changed, 40 insertions(+), 22 deletions(-) diff --git a/crates/store/src/backend/tikv/blob.rs b/crates/store/src/backend/tikv/blob.rs index 4d0ca44bc..9a1e3fc8c 100644 --- a/crates/store/src/backend/tikv/blob.rs +++ b/crates/store/src/backend/tikv/blob.rs @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * SPDX-FileCopyrightText: 2024 Stalwart Labs Ltd * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ diff --git a/crates/store/src/backend/tikv/main.rs b/crates/store/src/backend/tikv/main.rs index fbed11e67..722c8fab1 100644 --- a/crates/store/src/backend/tikv/main.rs +++ b/crates/store/src/backend/tikv/main.rs @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * SPDX-FileCopyrightText: 2024 Stalwart Labs Ltd * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ @@ -34,7 +34,7 @@ impl TikvStore { let backoff_max_delay = config .property::((&prefix, "transaction.backoff-max-delay")) - .unwrap_or_else(|| Duration::from_millis(30000)); + .unwrap_or_else(|| Duration::from_millis(2000)); let max_attempts = config .property::((&prefix, "transaction.backoff-retry-limit")) @@ -87,7 +87,7 @@ impl TikvStore { let read_trx_options = TransactionOptions::new_optimistic() .drop_check(CheckLevel::None) - .retry_options(RetryOptions::none()) + .retry_options(RetryOptions::new(backoff.clone(), backoff.clone())) .read_only(); let current_timestamp = trx_client.current_timestamp().await.map_err(|err| { diff --git a/crates/store/src/backend/tikv/mod.rs b/crates/store/src/backend/tikv/mod.rs index 82a4bb62b..24a51fbcd 100644 --- a/crates/store/src/backend/tikv/mod.rs +++ b/crates/store/src/backend/tikv/mod.rs @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * SPDX-FileCopyrightText: 2024 Stalwart Labs Ltd * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ diff --git a/crates/store/src/backend/tikv/read.rs b/crates/store/src/backend/tikv/read.rs index 24e5a59cc..7f2f73b2e 100644 --- a/crates/store/src/backend/tikv/read.rs +++ b/crates/store/src/backend/tikv/read.rs @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020 Stalwart Labs Ltd + * SPDX-FileCopyrightText: 2024 Stalwart Labs Ltd * * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL */ diff --git a/crates/store/src/backend/tikv/write.rs b/crates/store/src/backend/tikv/write.rs index 350700553..22a6b2743 100644 --- a/crates/store/src/backend/tikv/write.rs +++ b/crates/store/src/backend/tikv/write.rs @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020 Stalwart 
+ * SPDX-FileCopyrightText: 2024 Stalwart Labs Ltd
  *
  * SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-SEL
  */
@@ -13,6 +13,7 @@ use roaring::RoaringBitmap;
 use tikv_client::TransactionOptions;
 use tikv_client::proto::kvrpcpb::{Assertion, Mutation, Op};
 use tikv_client::transaction::ResolveLocksOptions;
+use trc::Error;
 use crate::{
     backend::deserialize_i64_le,
     write::{
@@ -39,12 +40,19 @@ impl TikvStore {
                 Err(err) => {
                     let _ = trx.rollback().await;
                     let version = self.version.lock().clone();
-                    self.trx_client.gc(version).await.map_err(into_error)?;
-                    //self.trx_client.cleanup_locks(BoundRange::range_from(TikvKey::from(vec![])), &ts, ResolveLocksOptions::default()).await.map_err(into_error)?;
+                    // match self.trx_client.gc(version).await.map_err(into_error) {
+                    //     Ok(a) => {}
+                    //     Err(_) => {}
+                    // }
+                    //self.trx_client.cleanup_locks(BoundRange::range_from(TikvKey::from(vec![0])), &version, ResolveLocksOptions::default()).await.map_err(into_error)?;
+                    drop(version);
                     let Some(backoff_duration) = backoff.next_delay_duration() else {
+                        println!("giving up, error: {}", err);
                         return Err(err);
+
                     };
-                    println!("backoff for {} secs with {} attempts", backoff_duration.as_secs_f32(), backoff.current_attempts());
+                    println!("backing off because of error: {}", err);
+                    //println!("backoff for {} secs with {} attempts", backoff_duration.as_secs_f32(), backoff.current_attempts());
                     tokio::time::sleep(backoff_duration).await;
                     continue;
                 }
@@ -91,7 +99,7 @@ impl TikvStore {
                         WITH_SUBSPACE,
                         (&result).into(),
                     );
-                    println!("writing key: {:?}", key);
+                    //println!("writing key: {:?}", key);
                     let do_chunk = !class.is_counter(collection);
 
                     match op {
@@ -127,7 +135,7 @@ impl TikvStore {
                         field: *field,
                         key,
                     }.serialize(0);
-                    println!("writing index key: {:?}", key);
+                    //println!("writing index key: {:?}", key);
 
                     if *set {
                         trx.put(key, &[]).await.map_err(into_error)?;
@@ -161,7 +169,7 @@ impl TikvStore {
                     let mut found_ids = RoaringBitmap::new();
 
                     'outer: loop {
-                        println!("scanning keys {:?} and {:?}", begin, end);
+                        //println!("scanning keys {:?} and {:?}", begin, end);
                         let mut keys = trx.scan_keys((begin, end.clone()), MAX_SCAN_KEYS_SIZE)
                             .await
                             .map_err(into_error)?
@@ -171,7 +179,7 @@ impl TikvStore {
                         while let Some(key) = keys.next() {
                             count += 1;
                             let key_slice: &[u8] = key.as_ref().into();
-                            println!("found key {:?}", key_slice);
+                            //println!("found key {:?}", key_slice);
                             if key_slice.len() == key_len {
                                 found_ids.insert(key_slice.deserialize_be_u32(key_len - U32_LEN)?);
                             } else {
@@ -193,7 +201,7 @@ impl TikvStore {
                     }
 
                     document_id = found_ids.random_available_id();
-                    println!("using document id: {} from found IDs: {:?}", document_id, found_ids);
+                    //println!("using document id: {} from found IDs: {:?}", document_id, found_ids);
                     result.push_document_id(document_id);
 
                 }
@@ -206,7 +214,15 @@ impl TikvStore {
                     );
                     if *set {
-                        trx.lock_keys([key.clone()]).await.map_err(into_error)?;
+                        let first = key.clone();
+                        let second = class.serialize(
+                            account_id,
+                            collection,
+                            document_id + 1,
+                            WITH_SUBSPACE,
+                            (&result).into(),
+                        );
+                        trx.lock_keys([first, second]).await.map_err(into_error)?;
                         trx.put(key, &[]).await.map_err(into_error)?;
                     } else {
                         trx.delete(key).await.map_err(into_error)?;
                     }
@@ -249,13 +265,15 @@ impl TikvStore {
             }
         }
 
-        if let Some(ts) = trx.commit().await.map_err(into_error)? {
-            let mut previous = self.version.lock();
-            *previous = ts;
-        }
-        if ! result.counter_ids.is_empty() || ! result.document_ids.is_empty() {
-            println!("success with counters: [{:?}] and doc ids: [{:?}]", result.counter_ids, result.document_ids);
+        if let Some(current_ts) = trx.commit().await.map_err(into_error)? {
+            let mut previous_ts = self.version.lock();
+            if previous_ts.version() < current_ts.version() {
+                *previous_ts = current_ts;
+            }
         }
+        // if ! result.counter_ids.is_empty() || ! result.document_ids.is_empty() {
+        //     println!("success with counters: [{:?}] and doc ids: [{:?}]", result.counter_ids, result.document_ids);
+        // }
 
         Ok(result)
     }
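
Two techniques in this series are easier to follow outside the diff noise, so two short sketches follow. Both use hypothetical stand-in types (Backoff, try_commit, Timestamp, Store) rather than the real tikv_client and trc APIs; read them as illustrations of the shape of the code in write.rs, not as drop-in replacements.

First, the retry loop: on a failed write transaction, roll back, ask the backoff policy for the next delay, and give up once the policy is exhausted. A minimal, runnable sketch, assuming tokio:

use std::time::Duration;

// Stand-in for the backoff policy driven by write.rs: the delay doubles on
// every attempt, is capped at `max_delay`, and `None` signals exhaustion.
struct Backoff {
    delay: Duration,
    max_delay: Duration,
    attempts: u32,
    max_attempts: u32,
}

impl Backoff {
    fn next_delay_duration(&mut self) -> Option<Duration> {
        if self.attempts >= self.max_attempts {
            return None; // exhausted: the caller propagates the last error
        }
        self.attempts += 1;
        let current = self.delay;
        self.delay = (self.delay * 2).min(self.max_delay);
        Some(current)
    }
}

// Stand-in for committing an optimistic transaction that may hit a conflict.
async fn try_commit() -> Result<(), &'static str> {
    Err("write conflict")
}

#[tokio::main]
async fn main() {
    let mut backoff = Backoff {
        delay: Duration::from_millis(100),
        max_delay: Duration::from_millis(2000), // the cap PATCH 13 lowers to 2s
        attempts: 0,
        max_attempts: 5,
    };

    loop {
        match try_commit().await {
            Ok(()) => break,
            Err(err) => {
                // Mirrors write.rs: back off and retry, or give up when the
                // policy returns None and propagate the last error.
                let Some(delay) = backoff.next_delay_duration() else {
                    eprintln!("giving up, error: {}", err);
                    return;
                };
                eprintln!("backing off because of error: {}", err);
                tokio::time::sleep(delay).await;
            }
        }
    }
}

Second, the commit-timestamp guard added in PATCH 13. Overwriting the cached version unconditionally lets a slow transaction move it backwards; keeping a monotonic high-water mark does not. Timestamp below is a stand-in for the TiKV timestamp type, and the real code uses a non-poisoning mutex, hence no unwrap there:

use std::sync::Mutex;

// Stand-in for the TiKV commit timestamp; only `version()` ordering matters.
#[derive(Clone, Copy)]
struct Timestamp(u64);

impl Timestamp {
    fn version(&self) -> u64 {
        self.0
    }
}

struct Store {
    version: Mutex<Timestamp>,
}

impl Store {
    // Record a commit timestamp without ever moving the cached version
    // backwards: concurrent writers can finish out of order.
    fn record_commit(&self, current_ts: Timestamp) {
        let mut previous_ts = self.version.lock().unwrap();
        if previous_ts.version() < current_ts.version() {
            *previous_ts = current_ts;
        }
    }
}

fn main() {
    let store = Store { version: Mutex::new(Timestamp(10)) };
    store.record_commit(Timestamp(12)); // advances the high-water mark
    store.record_commit(Timestamp(11)); // older: ignored
    assert_eq!(store.version.lock().unwrap().version(), 12);
}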