diff --git a/bench/notebooks/bench_synthetic_sizes.md b/bench/notebooks/bench_synthetic_sizes.md
new file mode 100644
index 0000000..8ff0569
--- /dev/null
+++ b/bench/notebooks/bench_synthetic_sizes.md
@@ -0,0 +1,111 @@
+---
+jupyter:
+  jupytext:
+    formats: ipynb,md
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.2'
+    jupytext_version: 1.6.0
+  kernelspec:
+    display_name: Python 3
+    language: python
+    name: python3
+---
+
+```python
+# Latency test for a synthetic input size distribution.
+
+from collections import defaultdict
+import math
+import random
+import umash_bench
+import umash_traces
+from exact_test import *
+import plotly.express as px
+```
+
+```python
+# Worst case for I$ misses with an implementation that special-cases <= 512 bytes.
+INPUT_SIZES = [513] * 10000
+
+random.shuffle(INPUT_SIZES)
+
+def size_bucket(size):
+    return size
+```
+
+```python
+# Gather the raw data for the two revisions we want to compare.
+TEST = "WIP"  # Or an actual commit ref
+BASELINE = "HEAD"  # Or any other commit ref
+CFLAGS = None
+CC = None
+FLUSH_LEVEL = 3  # 3: leave the code fully flushed out of cache
+results = umash_bench.compare_inputs(INPUT_SIZES,
+                                     current=TEST,
+                                     baseline=BASELINE,
+                                     cflags=CFLAGS,
+                                     cc=CC,
+                                     min_count=20000 * len(set(INPUT_SIZES)),
+                                     options={'flush_code': FLUSH_LEVEL})
+
+TEST, BASELINE = results.keys()  # Convert to the actual keys: HEAD etc. are normalised to SHAs.
+
+regrouped = dict()
+for label, timings_by_size in results.items():
+    regrouped_results = defaultdict(list)
+    regrouped[label] = regrouped_results
+    for size, timings in timings_by_size.items():
+        regrouped_results[size_bucket(size)] += timings
+
+regrouped_keys = sorted(regrouped[TEST].keys())
+```
+
+```python
+# Summarise the range of latencies (in RDTSC cycles) for the two revisions and input size classes.
+for label, values in regrouped.items():
+    print(label)
+    for i in regrouped_keys:
+        total = len(values[i])
+        kept = sum(x < 100 for x in values[i])
+        print("\t%s: %i %i %f (%i %i)" %
+              (i, total, kept, kept / total, min(values[i]), max(values[i])))
+```
+
+```python
+# Visualise the two latency distributions for each input size.
+for sz in regrouped_keys:
+    test = list(regrouped[TEST][sz])
+    baseline = list(regrouped[BASELINE][sz])
+    random.shuffle(test)
+    random.shuffle(baseline)
+    test = test[:5000]
+    baseline = baseline[:5000]
+    fig = px.histogram(dict(Test=test, Baseline=baseline),
+                       title="Latency for input size = %s" % sz,
+                       histnorm='probability density',
+                       nbins=max(test + baseline),
+                       barmode="overlay",
+                       opacity=0.5,
+                       marginal="box")
+    fig.update_xaxes(range=(0, 100 + min(test + baseline) + 50 * (sz // 100)))
+    fig.show()
+```
+
+```python
+# Run an exact permutation test for each input size to see if any difference is worth looking at.
+stats = [(i,
+          exact_test(a=regrouped[TEST][i][:20000],  # We don't need too many data points
+                     b=regrouped[BASELINE][i][:20000],
+                     eps=1e-4,
+                     statistics=[
+                         mean("mean", .5e-3),
+                         lte_prob("lte"),
+                         q99("q99"),
+                         q99("q99_sa", a_offset=5),  # Compare against q99 w/ A 5 cycles slower than B
+                     ]))
+         for i in regrouped_keys]
+```
+
+```python
+stats
+```
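A note on the notebook above: `size_bucket` is the identity function here because the synthetic distribution contains a single size (513 bytes), so every "bucket" holds exactly one size. For distributions with many distinct sizes, a coarser bucketing keeps enough timings per class for the summary and the permutation test. The sketch below is one hypothetical replacement (round each size up to a power of two); it is an illustration, not part of the diff.

```python
def size_bucket(size):
    """Hypothetical coarser bucketing: round each input size up to the
    next power of two so that rare sizes still pool enough timings."""
    if size <= 0:
        return 0
    return 1 << (size - 1).bit_length()

# 500 and 513 land in different buckets, keeping the <= 512-byte
# special case separate from the general path.
assert [size_bucket(s) for s in (1, 3, 500, 513)] == [1, 4, 512, 1024]
```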
diff --git a/bench/runner.c b/bench/runner.c
index ed8923a..37c0bae 100644
--- a/bench/runner.c
+++ b/bench/runner.c
@@ -3,6 +3,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include <x86intrin.h>
 
 #include "umash.h"
 
@@ -31,6 +32,45 @@ setup_params(void)
 	return;
 }
 
+static inline void
+cpuid_barrier(void)
+{
+
+	asm volatile("cpuid\n\t" ::: "%rax", "%rdx", "%rbx", "%rcx", "memory", "cc");
+	return;
+}
+
+static __attribute__((__noinline__)) void
+flush_code(int level)
+{
+	uintptr_t begin = (uintptr_t)ID(__start_umash_code);
+	uintptr_t end = (uintptr_t)ID(__stop_umash_code);
+
+	if (level == 0)
+		return;
+
+	for (uintptr_t addr = begin; addr < end; addr += 64)
+		_mm_clflush((void *)addr);
+
+	cpuid_barrier();
+
+	switch (level) {
+	case 1:
+		for (uintptr_t addr = begin; addr < end; addr += 64)
+			_mm_prefetch((void *)addr, _MM_HINT_T1);
+		break;
+	case 2:
+		for (uintptr_t addr = begin; addr < end; addr += 64)
+			_mm_prefetch((void *)addr, _MM_HINT_T2);
+		break;
+	default:
+		return;
+	}
+
+	cpuid_barrier();
+	return;
+}
+
 /*
  * We use different instruction sequences for the beginning and end
  * of the timed sequence because that's what Intel recommends.
@@ -140,15 +180,36 @@ uint64_t ID(umash_bench_fp_aggregate)(
 	return end - begin;
 }
 
-void ID(umash_bench_individual)(const struct bench_individual_options *options,
+static struct bench_individual_options
+normalize_options(const struct bench_individual_options *options)
+{
+	struct bench_individual_options ret = {
+		.size = sizeof(ret),
+	};
+
+	if (options == NULL)
+		return ret;
+
+	if (options->size < ret.size) {
+		memcpy(&ret, options, options->size);
+	} else {
+		memcpy(&ret, options, sizeof(ret));
+	}
+
+	ret.size = sizeof(ret);
+	return ret;
+}
+
+void ID(umash_bench_individual)(const struct bench_individual_options *options_ptr,
     uint64_t *restrict timings, const size_t *input_len, size_t num_trials,
     size_t max_len)
 {
+	struct bench_individual_options options;
 	size_t bufsz = ALLOC_ALIGNMENT * (1 + (max_len + JITTER_MASK) / ALLOC_ALIGNMENT);
 	char *buf;
 	uint64_t seed = 0;
 
-	(void)options;
+	options = normalize_options(options_ptr);
 	if (posix_memalign((void *)&buf, ALLOC_ALIGNMENT, bufsz) != 0)
 		assert(0 && "Failed to allocate buffer.");
 
@@ -158,6 +219,7 @@ void ID(umash_bench_individual)(const struct bench_individual_options *options,
 		uint64_t begin, end;
 		uint64_t hash;
 
+		flush_code(options.flush_code);
 		begin = get_ticks_begin(&seed);
 		seed += begin;
 
@@ -174,15 +236,16 @@ void ID(umash_bench_individual)(const struct bench_individual_options *options,
 	return;
 }
 
-void ID(umash_bench_fp_individual)(const struct bench_individual_options *options,
+void ID(umash_bench_fp_individual)(const struct bench_individual_options *options_ptr,
     uint64_t *restrict timings, const size_t *input_len, size_t num_trials,
     size_t max_len)
 {
+	struct bench_individual_options options;
 	size_t bufsz = ALLOC_ALIGNMENT * (1 + (max_len + JITTER_MASK) / ALLOC_ALIGNMENT);
 	char *buf;
 	uint64_t seed = 0;
 
-	(void)options;
+	options = normalize_options(options_ptr);
 	if (posix_memalign((void *)&buf, ALLOC_ALIGNMENT, bufsz) != 0)
 		assert(0 && "Failed to allocate buffer.");
 
@@ -193,6 +256,7 @@ void ID(umash_bench_fp_individual)(const struct bench_individual_options *options,
 		uint64_t begin, end;
 		uint64_t hash;
 
+		flush_code(options.flush_code);
 		begin = get_ticks_begin(&seed);
 		seed += begin;
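`normalize_options` is the usual size-prefixed struct pattern for ABI stability: the runner copies `min(options->size, sizeof(ret))` bytes, so a caller built against an older (smaller) struct leaves the new fields zero-initialized, and a caller built against a newer (larger) struct simply has its extra fields ignored. Below is a rough Python model of that copy rule, with ctypes standing in for the C struct; this is an illustration of the scheme only, not an exported API.

```python
import ctypes

class BenchIndividualOptions(ctypes.Structure):
    # Mirrors the current struct bench_individual_options layout.
    _fields_ = [("size", ctypes.c_size_t), ("flush_code", ctypes.c_int)]

def normalize_options(raw):
    """Copy min(caller's size, our size) bytes; unknown fields stay zero."""
    ret = BenchIndividualOptions()  # zero-filled, like the C initializer
    if raw is not None:
        # The leading size_t records how large the caller's struct is
        # (little-endian, as on x86).
        caller_size = int.from_bytes(raw[:ctypes.sizeof(ctypes.c_size_t)], "little")
        ctypes.memmove(ctypes.addressof(ret), raw,
                       min(caller_size, ctypes.sizeof(ret), len(raw)))
    ret.size = ctypes.sizeof(ret)  # always stamp our own size back
    return ret

# A caller built when the struct only held `size` still works:
old_style = bytes(ctypes.c_size_t(ctypes.sizeof(ctypes.c_size_t)))
assert normalize_options(old_style).flush_code == 0
```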
diff --git a/bench/runner.h b/bench/runner.h
index 7d99ee6..8f5da33 100644
--- a/bench/runner.h
+++ b/bench/runner.h
@@ -20,6 +20,17 @@
  */
 struct bench_individual_options {
 	size_t size; /* sizeof(struct bench_individual_options) */
+
+	/*
+	 * When `flush_code` is non-zero, the benchmarking loop
+	 * CLFLUSHes all of the umash code before every hash call.
+	 *
+	 * flush_code = 0: keep the code hot in cache
+	 * flush_code = 1: try to move the code back into L2
+	 * flush_code = 2: try to move the code back into L3
+	 * flush_code >= 3: leave the code fully flushed out
+	 */
+	int flush_code;
 };
 
 /*
diff --git a/t/umash_bench.py b/t/umash_bench.py
index 135d7c1..ac9cf0a 100644
--- a/t/umash_bench.py
+++ b/t/umash_bench.py
@@ -46,6 +46,7 @@ def compare_inputs(
     block_size=128,
     min_count=100000,
     runner="umash_bench_individual",
+    options={},
 ):
     """Compares the performance of two implementations for input
     sizes in `length_arguments`.
@@ -64,9 +65,11 @@
     timings = ffi.new("uint64_t[]", block_size)
 
     def make_options(target_ffi):
-        options = target_ffi.new("struct bench_individual_options *")
-        options.size = target_ffi.sizeof("struct bench_individual_options")
-        return options
+        ret = target_ffi.new("struct bench_individual_options *")
+        ret.size = target_ffi.sizeof("struct bench_individual_options")
+        for field, value in options.items():
+            setattr(ret, field, value)
+        return ret
 
     implementations = [
         (0, getattr(current_lib, runner), make_options(ffi), defaultdict(list)),
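End to end, the new knob threads from the notebook's `options={'flush_code': FLUSH_LEVEL}` dict through `make_options` into the C runner, where `flush_code(options.flush_code)` runs before every timed call. A minimal usage sketch, assuming the revisions, sizes, and counts below are placeholders; any key in `options` must name a field of `struct bench_individual_options`:

```python
import umash_bench

# Benchmark a few input sizes with the umash code demoted to L3 before
# each timed call (flush_code = 2), comparing HEAD against its parent.
results = umash_bench.compare_inputs(
    [8, 64, 513],                  # placeholder input sizes, in bytes
    current="HEAD",                # placeholder revision
    baseline="HEAD~1",             # placeholder revision
    min_count=10000,
    options={"flush_code": 2},
)

# Results are keyed by (normalised) revision, then by input size.
for revision, timings_by_size in results.items():
    for size, timings in sorted(timings_by_size.items()):
        print(revision, size, "min cycles:", min(timings))
```

One design note: the mutable default `options={}` is safe only because `make_options` reads the dict without mutating it, and passing `options=None` would fail since `options.items()` is called unconditionally.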