From 15600c97026a311f383ec07c611f43e22648f7a1 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 21 Jan 2025 12:20:15 -0500 Subject: [PATCH] scx_prev: a simple scheduler tested on OLTP workloads A FIFO-only variation on scx_simple with CPU selection that prioritizes an idle previous CPU over a fully idle core (as is done in scx_simple and scx_rusty). scx_prev outperforms a few other schedulers on OLTP workloads run on systems with relatively flat topology (i.e. non-NUMA, single LLC) by changing CPU selection as above and by taking advantage of the more aggressive work conservation (i.e. idle balancing) that comes with sched_ext by default. It's far from being a full-fledged scheduler, but it demonstrates how a small change to an existing scheduler can improve performance in a real application. Notes: - AMD EPYC 7J13 (16-CPU VM) server running v6.12-based UEK-next kernel, scx (688bffcd "Merge pull request #1192 from devnexen/code_simpl3"), and MySQL Community Edition 8.4[0] - AMD EPYC 7551 (128-CPU BM) client running BMK[1] (a sysbench-based BenchMark Kit) - Each data point in the table below represents the average of ten, one-minute runs done after a three-minute warmup. The server is rebooted between each scheduler. - "cli" means the number of database clients. - Each %diff column is relative to eevdf. 
Representative BMK testcase: sb11-OLTP_RO_10M_8tab-uniform-ps-notrx.sh cli eevdf (std%) rusty (std%) %diff simple (std%) %diff prev (std%) %diff --- ------------ ------------ ----- ------------- ----- ----------- ----- throughput 16 4140 ( 1%) 4224 ( 1%) ( 2%) 4276 ( 2%) ( 3%) 4263 ( 1%) ( 3%) 32 7382 ( 1%) 7259 ( 1%) ( -2%) 7314 ( 1%) ( -1%) 7919 ( 1%) ( 7%) 48 9015 ( 0%) 9644 ( 0%) ( 7%) 10055 ( 0%) ( 12%) 10411 ( 1%) ( 15%) 64 9765 ( 1%) 9601 ( 0%) ( -2%) 10214 ( 0%) ( 5%) 10481 ( 0%) ( 7%) average latency 16 4 ( 1%) 4 ( 1%) ( -2%) 4 ( 2%) ( -3%) 4 ( 1%) ( -3%) 32 4 ( 1%) 4 ( 1%) ( 2%) 4 ( 1%) ( 1%) 4 ( 1%) ( -7%) 48 5 ( 0%) 5 ( 0%) ( -7%) 5 ( 0%) (-10%) 5 ( 1%) (-13%) 64 7 ( 1%) 7 ( 0%) ( 2%) 6 ( 0%) ( -4%) 6 ( 0%) ( -7%) 95p latency 16 4 ( 3%) 4 ( 2%) ( -4%) 4 ( 4%) ( -1%) 4 ( 4%) ( -7%) 32 5 ( 2%) 5 ( 1%) ( 1%) 5 ( 2%) ( 1%) 4 ( 2%) (-11%) 48 7 ( 1%) 6 ( 1%) (-16%) 5 ( 1%) (-24%) 5 ( 1%) (-26%) 64 9 ( 3%) 8 ( 0%) (-12%) 7 ( 0%) (-26%) 7 ( 1%) (-26%) In the read-only workload, prev consistently outperforms with equal or better throughput and latency across the board. [0] https://github.com/mysql/mysql-server/tree/8.4 [1] http://dimitrik.free.fr/blog/posts/mysql-perf-bmk-kit.html Signed-off-by: Daniel Jordan --- scheds/c/README.md | 21 ++++++++ scheds/c/meson.build | 2 +- scheds/c/scx_prev.bpf.c | 79 ++++++++++++++++++++++++++++ scheds/c/scx_prev.c | 112 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 scheds/c/scx_prev.bpf.c create mode 100644 scheds/c/scx_prev.c diff --git a/scheds/c/README.md b/scheds/c/README.md index 07a448af7..cc2e1a459 100644 --- a/scheds/c/README.md +++ b/scheds/c/README.md @@ -164,6 +164,27 @@ simple scheduling policy. 
-------------------------------------------------------------------------------- +## scx_prev + +### Overview + +A variation on scx_simple with CPU selection that prioritizes an idle previous +CPU over finding a fully idle core (as is done in scx_simple and scx_rusty). + +### Typical Use Case + +This scheduler outperforms the in-kernel fair class, scx_simple, and scx_rusty +on OLTP workloads run on systems with simple topology (i.e. non-NUMA, single +LLC). + +### Production Ready? + +scx_prev has not been tested in a production environment, but given its +similarity to scx_simple, it might be production ready for specific workloads +on hardware with simple topology. + +-------------------------------------------------------------------------------- + ## scx_userland ### Overview diff --git a/scheds/c/meson.build b/scheds/c/meson.build index a1e912b01..4879526f5 100644 --- a/scheds/c/meson.build +++ b/scheds/c/meson.build @@ -1,5 +1,5 @@ c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest', - 'scx_flatcg', 'scx_pair'] + 'scx_flatcg', 'scx_pair', 'scx_prev'] c_scheds_lib = ['scx_sdt'] diff --git a/scheds/c/scx_prev.bpf.c b/scheds/c/scx_prev.bpf.c new file mode 100644 index 000000000..9c7d210a7 --- /dev/null +++ b/scheds/c/scx_prev.bpf.c @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A variation on scx_simple with CPU selection that prioritizes an idle + * previous CPU over finding a fully idle core (as is done in scx_simple and + * scx_rusty). + * + * Outperforms the in-kernel fair class (v6.12), scx_simple, and scx_rusty on + * OLTP workloads run on systems with simple topology (i.e. non-NUMA, single + * LLC). + * + * Copyright (c) 2025, Oracle and/or its affiliates. 
+ * Copyright (c) 2025, Daniel Jordan <daniel.m.jordan@oracle.com>
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u64));
+	__uint(max_entries, 4); /* [local, select_fail, prev_cpu, idle_cpu] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+	if (cnt_p)
+		(*cnt_p)++;
+}
+
+s32 BPF_STRUCT_OPS(prev_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	s32 cpu;
+
+	if (p->nr_cpus_allowed == 1) {
+		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			cpu = prev_cpu;
+			goto insert;
+		} else {
+			return prev_cpu;
+		}
+	}
+
+	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+		stat_inc(2); /* prev_cpu */
+		cpu = prev_cpu;
+		goto insert;
+	}
+
+	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+	if (cpu >= 0) {
+		stat_inc(3); /* idle_cpu */
+		goto insert;
+	}
+
+	stat_inc(1); /* select_fail */
+
+	return prev_cpu;
+
+insert:
+	stat_inc(0); /* local */
+	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(prev_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(prev_ops,
+	       .select_cpu		= (void *)prev_select_cpu,
+	       .exit			= (void *)prev_exit,
+	       .name			= "prev"
+);
diff --git a/scheds/c/scx_prev.c b/scheds/c/scx_prev.c
new file mode 100644
index 000000000..291684dad
--- /dev/null
+++ b/scheds/c/scx_prev.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ * Copyright (c) 2025, Daniel Jordan <daniel.m.jordan@oracle.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+
+#include "scx_prev.bpf.skel.h"
+
+const char help_fmt[] =
+"A variation on scx_simple with CPU selection that prioritizes an idle\n"
+"previous CPU over finding a fully idle core.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-i sec] [-v]\n"
+"\n"
+"  -h            Display this help and exit\n"
+"  -i            Sampling interval for statistics in seconds\n"
+"  -v            Print libbpf debug messages\n";
+
+static bool verbose;
+static unsigned stat_interval = 1;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int unused)
+{
+	exit_req = 1;
+}
+
+static void read_stats(struct scx_prev *skel, __u64 *stats)
+{
+	int nr_cpus = libbpf_num_possible_cpus();
+	__u64 cnts[4][nr_cpus];
+	__u32 idx;
+
+	memset(stats, 0, sizeof(stats[0]) * 4);
+
+	for (idx = 0; idx < 4; idx++) {
+		int ret, cpu;
+
+		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+					  &idx, cnts[idx]);
+		if (ret < 0)
+			continue;
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			stats[idx] += cnts[idx][cpu];
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_prev *skel;
+	struct bpf_link *link;
+	__u32 opt;
+	__u64 ecode;
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+restart:
+	skel = SCX_OPS_OPEN(prev_ops, scx_prev);
+
+	while ((opt = getopt(argc, argv, "hi:v")) != -1) {
+		switch (opt) {
+		case 'i':
+			stat_interval = strtoull(optarg, NULL, 0);
+			if (!stat_interval)
+				stat_interval = 1;
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_OPS_LOAD(skel, prev_ops, scx_prev, uei);
+	link = SCX_OPS_ATTACH(skel, prev_ops, scx_prev);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		__u64 stats[4];
+
+		read_stats(skel, stats);
+		printf("local=%llu select_fail=%llu prev_cpu=%llu idle_cpu=%llu\n",
+		       stats[0], stats[1], stats[2], stats[3]);
+		fflush(stdout);
+		sleep(stat_interval);
+	}
+
+	bpf_link__destroy(link);
+	ecode = UEI_REPORT(skel, uei);
+	scx_prev__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
+	return 0;
+}