Pkhuong/cold cache #12

Open · wants to merge 2 commits into base: master
111 changes: 111 additions & 0 deletions bench/notebooks/bench_synthetic_sizes.md
@@ -0,0 +1,111 @@
---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.2'
      jupytext_version: 1.6.0
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

```python
# Latency test for synthetic size distribution

from collections import defaultdict
import math
import random
import umash_bench
import umash_traces
from exact_test import *
import plotly.express as px
```

```python
INPUT_SIZES = [513] * 10000  # Worst case for I$ misses with an implementation that special-cases <= 512 bytes

random.shuffle(INPUT_SIZES)

def size_bucket(size):
    return size
```
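
`size_bucket` is the identity mapping here, since every input is 513 bytes; with a mixed size distribution one could instead group nearby sizes so the plots and tests aggregate them. A hypothetical variant, sketched for illustration only (not used elsewhere in this notebook):

```python
def size_bucket_rounded(size, bucket=64):
    """Group input sizes into `bucket`-byte classes (e.g., 513..576 -> 576)."""
    return bucket * math.ceil(size / bucket)
```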

```python
# Gather the raw data for the two revisions we want to compare
TEST = "WIP" # Or an actual commit ref
BASELINE = "HEAD" # Or any other commit ref
CFLAGS = None
CC = None
FLUSH_LEVEL = 3  # 3: fully flush the code out of cache
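# flush_code levels (see bench/runner.h): 0 keeps code hot, 1 prefetches it back into L2, 2 into L3, >= 3 leaves it fully flushed.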
results = umash_bench.compare_inputs(INPUT_SIZES,
                                     current=TEST,
                                     baseline=BASELINE,
                                     cflags=CFLAGS,
                                     cc=CC,
                                     min_count=20000 * len(set(INPUT_SIZES)),
                                     options={'flush_code': FLUSH_LEVEL})

TEST, BASELINE = results.keys() # Convert to the actual keys: HEAD etc. are normalised to SHAs

regrouped = dict()
for k, rev_results in results.items():
    regrouped_results = defaultdict(list)
    regrouped[k] = regrouped_results
    for size, timings in rev_results.items():
        regrouped_results[size_bucket(size)] += timings

regrouped_keys = sorted(regrouped[TEST].keys())
```

```python
# Summarise the range of latencies (in RDTSC cycles) for the two revisions and input size classes
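# Printed columns: size, total samples, samples < 100 cycles, fraction < 100 cycles, and (min, max) in cycles.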
for label, values in regrouped.items():
    print(label)
    for i in regrouped_keys:
        total = len(values[i])
        kept = sum(x < 100 for x in values[i])
        print("\t%s: %i %i %f (%i %i)" % (i, total, kept, kept / total, min(values[i]), max(values[i])))
```

```python
# Visualise the two latency distributions for each input size
for sz in regrouped_keys:
    test = list(regrouped[TEST][sz])
    baseline = list(regrouped[BASELINE][sz])
    random.shuffle(test)
    random.shuffle(baseline)
    test = test[:5000]
    baseline = baseline[:5000]
    fig = px.histogram(dict(Test=test, Baseline=baseline),
                       title="Latency for input size = %s" % sz,
                       histnorm='probability density',
                       nbins=max(test + baseline),
                       barmode="overlay",
                       opacity=0.5,
                       marginal="box")
    fig.update_xaxes(range=(0, 100 + min(test + baseline) + 50 * (sz // 100)))
    fig.show()
```

```python
# Run an exact permutation test for each input size to see if any difference is worth looking at.
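# Statistics compared: mean, "lte" probability, and the 99th percentile, the latter also with a 5-cycle handicap for the test revision (q99_sa).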
stats = [(i,
          exact_test(a=regrouped[TEST][i][:20000],  # We don't need too many data points
                     b=regrouped[BASELINE][i][:20000],
                     eps=1e-4,
                     statistics=[
                         mean("mean", .5e-3),
                         lte_prob("lte"),
                         q99("q99"),
                         q99("q99_sa", a_offset=5)  # Compare against q99 w/ A 5 cycles slower than B
                     ])
          ) for i in regrouped_keys]
```

```python
stats
```
72 changes: 68 additions & 4 deletions bench/runner.c
@@ -3,6 +3,7 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>

#include "umash.h"

@@ -31,6 +32,45 @@ setup_params(void)
	return;
}

static inline void
cpuid_barrier(void)
{
	asm volatile("cpuid\n\t" ::: "%rax", "%rdx", "%rbx", "%rcx", "memory", "cc");
	return;
}

static __attribute__((__noinline__)) void
flush_code(int level)
{
	uintptr_t begin = (uintptr_t)ID(__start_umash_code);
	uintptr_t end = (uintptr_t)ID(__stop_umash_code);

	if (level == 0)
		return;

	for (uintptr_t addr = begin; addr < end; addr += 64)
		_mm_clflush((void *)addr);

	cpuid_barrier();

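	/*
	 * Levels 1 and 2 prefetch the flushed code back toward L2 (T1 hint)
	 * or L3 (T2 hint); any higher level leaves the code fully evicted.
	 */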
	switch (level) {
	case 1:
		for (uintptr_t addr = begin; addr < end; addr += 64)
			_mm_prefetch((void *)addr, _MM_HINT_T1);
		break;
	case 2:
		for (uintptr_t addr = begin; addr < end; addr += 64)
			_mm_prefetch((void *)addr, _MM_HINT_T2);
		break;
	default:
		return;
	}

	cpuid_barrier();
	return;
}

/*
 * We use different instruction sequences for the beginning and end
 * of the timed sequence because that's what Intel recommends.
@@ -140,15 +180,36 @@ uint64_t ID(umash_bench_fp_aggregate)(
	return end - begin;
}

void ID(umash_bench_individual)(const struct bench_individual_options *options,
static struct bench_individual_options
normalize_options(const struct bench_individual_options *options)
{
	struct bench_individual_options ret = {
		.size = sizeof(ret),
	};

	if (options == NULL)
		return ret;

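	/*
	 * Copy at most the struct size the caller declared, so callers built
	 * against an older (smaller) struct keep working; fields they do not
	 * know about retain their zero defaults.
	 */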
	if (options->size < ret.size) {
		memcpy(&ret, options, options->size);
	} else {
		memcpy(&ret, options, sizeof(ret));
	}

	ret.size = sizeof(ret);
	return ret;
}

void ID(umash_bench_individual)(const struct bench_individual_options *options_ptr,
    uint64_t *restrict timings, const size_t *input_len, size_t num_trials,
    size_t max_len)
{
	struct bench_individual_options options;
	size_t bufsz = ALLOC_ALIGNMENT * (1 + (max_len + JITTER_MASK) / ALLOC_ALIGNMENT);
	char *buf;
	uint64_t seed = 0;

	(void)options;
	options = normalize_options(options_ptr);
	if (posix_memalign((void *)&buf, ALLOC_ALIGNMENT, bufsz) != 0)
		assert(0 && "Failed to allocate buffer.");

@@ -158,6 +219,7 @@ void ID(umash_bench_individual)(const struct bench_individual_options *options,
		uint64_t begin, end;
		uint64_t hash;

		flush_code(options.flush_code);
		begin = get_ticks_begin(&seed);
		seed += begin;

@@ -174,15 +236,16 @@ void ID(umash_bench_individual)(const struct bench_individual_options *options,
	return;
}

void ID(umash_bench_fp_individual)(const struct bench_individual_options *options,
void ID(umash_bench_fp_individual)(const struct bench_individual_options *options_ptr,
    uint64_t *restrict timings, const size_t *input_len, size_t num_trials,
    size_t max_len)
{
	struct bench_individual_options options;
	size_t bufsz = ALLOC_ALIGNMENT * (1 + (max_len + JITTER_MASK) / ALLOC_ALIGNMENT);
	char *buf;
	uint64_t seed = 0;

	(void)options;
	options = normalize_options(options_ptr);
	if (posix_memalign((void *)&buf, ALLOC_ALIGNMENT, bufsz) != 0)
		assert(0 && "Failed to allocate buffer.");

@@ -193,6 +256,7 @@ void ID(umash_bench_fp_individual)(const struct bench_individual_options *option
		uint64_t begin, end;
		uint64_t hash;

		flush_code(options.flush_code);
		begin = get_ticks_begin(&seed);
		seed += begin;

11 changes: 11 additions & 0 deletions bench/runner.h
@@ -20,6 +20,17 @@
*/
struct bench_individual_options {
	size_t size; /* sizeof(struct bench_individual_options) */

	/*
	 * When `flush_code` is non-zero, the benchmarking loop
	 * CLFLUSHes out all umash code before every hash call.
	 *
	 * flush_code = 0: keep code hot in cache
	 * flush_code = 1: try to move code back into L2
	 * flush_code = 2: try to move code back into L3
	 * flush_code >= 3: leave code fully flushed out.
	 */
	int flush_code;
};
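
For context, the knob is driven from Python: `compare_inputs` copies the `options` dict field by field onto this struct (see `t/umash_bench.py` below). A minimal usage sketch, assuming `umash_bench` is importable as in the notebook above:

```python
import umash_bench

# Fully flush the UMASH code out of cache before every timed hash call.
results = umash_bench.compare_inputs(
    [513] * 10000,
    current="WIP",    # revision under test
    baseline="HEAD",  # comparison revision
    options={"flush_code": 3},
)
```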

/*
9 changes: 6 additions & 3 deletions t/umash_bench.py
Expand Up @@ -46,6 +46,7 @@ def compare_inputs(
    block_size=128,
    min_count=100000,
    runner="umash_bench_individual",
    options={},
):
    """Compares the performance of two implementations for input sizes in `length_arguments`.

@@ -64,9 +65,11 @@ def make_options(target_ffi):
    timings = ffi.new("uint64_t[]", block_size)

    def make_options(target_ffi):
        options = target_ffi.new("struct bench_individual_options *")
        options.size = target_ffi.sizeof("struct bench_individual_options")
        return options
        ret = target_ffi.new("struct bench_individual_options *")
        ret.size = target_ffi.sizeof("struct bench_individual_options")
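        # Copy any caller-supplied option fields (e.g., flush_code) onto the FFI struct.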
        for field, value in options.items():
            setattr(ret, field, value)
        return ret

    implementations = [
        (0, getattr(current_lib, runner), make_options(ffi), defaultdict(list)),