Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scx_prev: a simple scheduler tested on OLTP workloads #1275

Merged
merged 1 commit into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions scheds/c/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,27 @@ simple scheduling policy.

--------------------------------------------------------------------------------

## scx_prev

### Overview

A variation on scx_simple with CPU selection that prioritizes an idle previous
CPU over finding a fully idle core (as is done in scx_simple and scx_rusty).

### Typical Use Case

This scheduler outperforms the in-kernel fair class, scx_simple, and scx_rusty
on OLTP workloads run on systems with simple topology (i.e. non-NUMA, single
LLC).

### Production Ready?

scx_prev has not been tested in a production environment, but given its
similarity to scx_simple, it might be production ready for specific workloads
on hardware with simple topology.

--------------------------------------------------------------------------------

## scx_userland

### Overview
Expand Down
2 changes: 1 addition & 1 deletion scheds/c/meson.build
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
c_scheds = ['scx_simple', 'scx_qmap', 'scx_central', 'scx_userland', 'scx_nest',
'scx_flatcg', 'scx_pair']
'scx_flatcg', 'scx_pair', 'scx_prev']

c_scheds_lib = ['scx_sdt']

Expand Down
79 changes: 79 additions & 0 deletions scheds/c/scx_prev.bpf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* A variation on scx_simple with CPU selection that prioritizes an idle
* previous CPU over finding a fully idle core (as is done in scx_simple and
* scx_rusty).
*
* Outperforms the in-kernel fair class (v6.12), scx_simple, and scx_rusty on
* OLTP workloads run on systems with simple topology (i.e. non-NUMA, single
* LLC).
*
* Copyright (c) 2025, Oracle and/or its affiliates.
* Copyright (c) 2025, Daniel Jordan <[email protected]>
*/
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

/*
 * Per-CPU event counters, summed in userspace by the frontend's read_stats():
 *   0 = local       (task inserted directly into a local DSQ)
 *   1 = select_fail (no idle CPU found; fell back to prev_cpu)
 *   2 = prev_cpu    (previous CPU was idle and reused)
 *   3 = idle_cpu    (some other idle CPU was picked)
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u64));
	__uint(max_entries, 4); /* [local, select_fail, prev_cpu, idle_cpu] */
} stats SEC(".maps");

/* Bump the per-CPU counter at slot @idx in the stats map, if it exists. */
static void stat_inc(u32 idx)
{
	u64 *cnt = bpf_map_lookup_elem(&stats, &idx);

	if (!cnt)
		return;
	*cnt += 1;
}

/*
 * Pick a CPU for a waking task, preferring its idle previous CPU over any
 * other idle CPU.  Unlike scx_simple/scx_rusty, no attempt is made to find a
 * fully idle core first: on simple topologies (single LLC, non-NUMA) reusing
 * the cache-hot previous CPU wins for OLTP-style workloads.
 *
 * Note: no special case for p->nr_cpus_allowed == 1 is needed — the kernel
 * skips ops.select_cpu() entirely when a task can only run on one CPU, so
 * such a branch would be dead code.
 *
 * Returns the CPU the task should run on; when an idle CPU is claimed, the
 * task is also inserted directly into the local DSQ with the default slice.
 */
s32 BPF_STRUCT_OPS(prev_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	s32 cpu;

	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
		stat_inc(2); /* prev_cpu */
		cpu = prev_cpu;
		goto insert;
	}

	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0) {
		stat_inc(3); /* idle_cpu */
		goto insert;
	}

	stat_inc(1); /* select_fail */

	/* Nothing idle; stay on the previous CPU and let dispatch handle it. */
	return prev_cpu;

insert:
	/* Claimed an idle CPU: queue the task on its local DSQ directly. */
	stat_inc(0); /* local */
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}

/* Record exit info so userspace can report it and decide whether to restart. */
void BPF_STRUCT_OPS(prev_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

/*
 * Register the scheduler: only CPU selection and exit recording are
 * overridden; every other callback uses the sched_ext default behavior.
 */
SCX_OPS_DEFINE(prev_ops,
	       .select_cpu = (void *)prev_select_cpu,
	       .exit = (void *)prev_exit,
	       .name = "prev"
);
112 changes: 112 additions & 0 deletions scheds/c/scx_prev.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2025, Oracle and/or its affiliates.
* Copyright (c) 2025, Daniel Jordan <[email protected]>
*/
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>

#include "scx_prev.bpf.skel.h"

/* Usage text; printed with the program's basename substituted for %s. */
const char help_fmt[] =
"A variation on scx_simple with CPU selection that prioritizes an idle\n"
"previous CPU over finding a fully idle core.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-i sec] [-v]\n"
"\n"
"  -h            Display this help and exit\n"
"  -i            Sampling interval for statistics in seconds\n"
"  -v            Print libbpf debug messages\n";

/* Runtime options, set while parsing the command line in main(). */
static bool verbose;			/* -v: show libbpf debug output */
static unsigned stat_interval = 1;	/* -i: seconds between stat lines */
/* Set to 1 by the signal handler; volatile so the main loop re-reads it. */
static volatile int exit_req;

/*
 * libbpf log callback: forward everything to stderr, except debug-level
 * messages, which are shown only when -v was given.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);
	return 0;
}

/* SIGINT/SIGTERM handler: request a clean shutdown of the main loop. */
static void sigint_handler(int unused)
{
	exit_req = 1;
}

/*
 * Sum the per-CPU counters from the BPF stats map into stats[0..3]
 * ([local, select_fail, prev_cpu, idle_cpu]).
 *
 * @skel: loaded scx_prev skeleton (source of the stats map fd).
 * @stats: caller-provided array of 4 __u64s; always fully zeroed, and left
 *         at zero for any slot whose lookup fails.
 */
static void read_stats(struct scx_prev *skel, __u64 *stats)
{
	int nr_cpus = libbpf_num_possible_cpus();
	__u32 idx;

	memset(stats, 0, sizeof(stats[0]) * 4);

	/*
	 * libbpf_num_possible_cpus() returns a negative errno on failure;
	 * bail out before using it as a VLA dimension — a non-positive VLA
	 * size is undefined behavior.
	 */
	if (nr_cpus <= 0)
		return;

	__u64 cnts[4][nr_cpus];

	for (idx = 0; idx < 4; idx++) {
		int ret, cpu;

		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
					  &idx, cnts[idx]);
		if (ret < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += cnts[idx][cpu];
	}
}

/*
 * Load and attach the scx_prev BPF scheduler, then print its counters every
 * stat_interval seconds until interrupted (SIGINT/SIGTERM) or the scheduler
 * exits on its own.  Restarts from scratch when the exit code requests it.
 */
int main(int argc, char **argv)
{
	struct scx_prev *skel;
	struct bpf_link *link;
	__u32 opt;
	__u64 ecode;

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);
restart:
	/* Reopen the skeleton on every (re)start before touching options. */
	skel = SCX_OPS_OPEN(prev_ops, scx_prev);

	while ((opt = getopt(argc, argv, "hi:v")) != -1) {
		switch (opt) {
		case 'i':
			stat_interval = strtoull(optarg, NULL, 0);
			if (!stat_interval)
				stat_interval = 1;	/* 0/unparsable -> 1s */
			break;
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			return opt != 'h';	/* 0 for -h, 1 for bad option */
		}
	}

	SCX_OPS_LOAD(skel, prev_ops, scx_prev, uei);
	link = SCX_OPS_ATTACH(skel, prev_ops, scx_prev);

	/* Poll and print the counters until shutdown or scheduler exit. */
	while (!exit_req && !UEI_EXITED(skel, uei)) {
		__u64 stats[4];

		read_stats(skel, stats);
		printf("local=%llu select_fail=%llu prev_cpu=%llu idle_cpu=%llu\n",
		       stats[0], stats[1], stats[2], stats[3]);
		fflush(stdout);
		sleep(stat_interval);
	}

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);
	scx_prev__destroy(skel);

	/*
	 * NOTE(review): optind is not reset before jumping back, so getopt
	 * returns -1 immediately on restart and options keep their values
	 * from the first pass — presumably intentional; confirm.
	 */
	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}