diff --git a/scheds/c/README.md b/scheds/c/README.md
index 07a448af7..cc2e1a459 100644
--- a/scheds/c/README.md
+++ b/scheds/c/README.md
@@ -164,6 +164,27 @@ simple scheduling policy.
 
 --------------------------------------------------------------------------------
 
+## scx_prev
+
+### Overview
+
+A variation on scx_simple with CPU selection that prioritizes an idle previous
+CPU over finding a fully idle core (as is done in scx_simple and scx_rusty).
+
+### Typical Use Case
+
+This scheduler outperforms the in-kernel fair class, scx_simple, and scx_rusty
+on OLTP workloads run on systems with simple topology (i.e. non-NUMA, single
+LLC).
+
+### Production Ready?
+
+scx_prev has not been tested in a production environment, but given its
+similarity to scx_simple, it might be production ready for specific workloads
+on hardware with simple topology.
+
+--------------------------------------------------------------------------------
+
 ## scx_userland
 
 ### Overview
diff --git a/scheds/c/meson.build b/scheds/c/meson.build
index a1e912b01..4879526f5 100644
--- a/scheds/c/meson.build
+++ b/scheds/c/meson.build
@@ -1,5 +1,5 @@
 c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest',
-            'scx_flatcg', 'scx_pair']
+            'scx_flatcg', 'scx_pair', 'scx_prev']
 
 c_scheds_lib = ['scx_sdt']
 
diff --git a/scheds/c/scx_prev.bpf.c b/scheds/c/scx_prev.bpf.c
new file mode 100644
index 000000000..9c7d210a7
--- /dev/null
+++ b/scheds/c/scx_prev.bpf.c
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A variation on scx_simple with CPU selection that prioritizes an idle
+ * previous CPU over finding a fully idle core (as is done in scx_simple and
+ * scx_rusty).
+ *
+ * Outperforms the in-kernel fair class (v6.12), scx_simple, and scx_rusty on
+ * OLTP workloads run on systems with simple topology (i.e. non-NUMA, single
+ * LLC).
+ *
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ * Copyright (c) 2025, Daniel Jordan
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u64));
+	__uint(max_entries, 4);		/* [local, select_fail, prev_cpu, idle_cpu] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+	u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+	if (cnt_p)
+		(*cnt_p)++;
+}
+
+s32 BPF_STRUCT_OPS(prev_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	s32 cpu;
+
+	if (p->nr_cpus_allowed == 1) {
+		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			cpu = prev_cpu;
+			goto insert;
+		} else {
+			return prev_cpu;
+		}
+	}
+
+	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+		stat_inc(2);	/* prev_cpu */
+		cpu = prev_cpu;
+		goto insert;
+	}
+
+	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+	if (cpu >= 0) {
+		stat_inc(3);	/* idle_cpu */
+		goto insert;
+	}
+
+	stat_inc(1);	/* select_fail */
+
+	return prev_cpu;
+
+insert:
+	stat_inc(0);	/* local */
+	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(prev_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(prev_ops,
+	       .select_cpu		= (void *)prev_select_cpu,
+	       .exit			= (void *)prev_exit,
+	       .name			= "prev"
+);
diff --git a/scheds/c/scx_prev.c b/scheds/c/scx_prev.c
new file mode 100644
index 000000000..291684dad
--- /dev/null
+++ b/scheds/c/scx_prev.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates.
+ * Copyright (c) 2025, Daniel Jordan
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+
+#include "scx_prev.bpf.skel.h"
+
+const char help_fmt[] =
+"A variation on scx_simple with CPU selection that prioritizes an idle\n"
+"previous CPU over finding a fully idle core.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-i sec] [-v]\n"
+"\n"
+"  -h            Display this help and exit\n"
+"  -i            Sampling interval for statistics in seconds\n"
+"  -v            Print libbpf debug messages\n";
+
+static bool verbose;
+static unsigned stat_interval = 1;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int unused)
+{
+	exit_req = 1;
+}
+
+static void read_stats(struct scx_prev *skel, __u64 *stats)
+{
+	int nr_cpus = libbpf_num_possible_cpus();
+	__u64 cnts[4][nr_cpus];
+	__u32 idx;
+
+	memset(stats, 0, sizeof(stats[0]) * 4);
+
+	for (idx = 0; idx < 4; idx++) {
+		int ret, cpu;
+
+		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+					  &idx, cnts[idx]);
+		if (ret < 0)
+			continue;
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			stats[idx] += cnts[idx][cpu];
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_prev *skel;
+	struct bpf_link *link;
+	__u32 opt;
+	__u64 ecode;
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+restart:
+	skel = SCX_OPS_OPEN(prev_ops, scx_prev);
+
+	while ((opt = getopt(argc, argv, "hi:v")) != -1) {
+		switch (opt) {
+		case 'i':
+			stat_interval = strtoull(optarg, NULL, 0);
+			if (!stat_interval)
+				stat_interval = 1;
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_OPS_LOAD(skel, prev_ops, scx_prev, uei);
+	link = SCX_OPS_ATTACH(skel, prev_ops, scx_prev);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		__u64 stats[4];
+
+		read_stats(skel, stats);
+		printf("local=%llu select_fail=%llu prev_cpu=%llu idle_cpu=%llu\n",
+		       stats[0], stats[1], stats[2], stats[3]);
+		fflush(stdout);
+		sleep(stat_interval);
+	}
+
+	bpf_link__destroy(link);
+	ecode = UEI_REPORT(skel, uei);
+	scx_prev__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
+	return 0;
+}
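
Usage sketch, assuming a meson build directory named "build" (the directory
name, the target path, and whether root is needed to load the scheduler all
depend on your local setup):

    $ meson compile -C build scx_prev
    $ sudo ./build/scheds/c/scx_prev -i 5
    local=... select_fail=... prev_cpu=... idle_cpu=...

Each sampled line sums the per-CPU counters from the stats map: "prev_cpu"
counts wakeups that found the task's previous CPU idle, "idle_cpu" counts
fallbacks to some other idle CPU, and "select_fail" counts wakeups for which
no idle CPU was found. "local" counts every wakeup inserted directly into the
local DSQ; it also covers idle previous-CPU hits for tasks pinned to a single
CPU, so it can exceed prev_cpu + idle_cpu.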