Pkhuong/cold cache #12

Open · wants to merge 2 commits into base: master
111 changes: 111 additions & 0 deletions bench/notebooks/bench_synthetic_sizes.md
@@ -0,0 +1,111 @@
---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.2'
      jupytext_version: 1.6.0
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

```python
# Latency test for synthetic size distribution

from collections import defaultdict
import math
import random
import umash_bench
import umash_traces
from exact_test import *
import plotly.express as px
```

```python
INPUT_SIZES = [513] * 10000  # Worst case for I$ misses with an implementation that special-cases <= 512 bytes

random.shuffle(INPUT_SIZES)

def size_bucket(size):
    return size
```
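
`size_bucket` is the identity mapping here, since every input is 513 bytes; with a mixed size distribution one could instead group nearby sizes so the plots and tests aggregate them. A hypothetical variant, sketched for illustration only (not used elsewhere in this notebook):

```python
def size_bucket_rounded(size, bucket=64):
    """Group input sizes into `bucket`-byte classes (e.g., 513..576 -> 576)."""
    return bucket * math.ceil(size / bucket)
```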

```python
# Gather the raw data for the two revisions we want to compare
TEST = "WIP" # Or an actual commit ref
BASELINE = "HEAD" # Or any other commit ref
CFLAGS = None
CC = None
FLUSH_LEVEL = 3  # 3: fully flush the code out of cache
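# flush_code levels (see bench/runner.h): 0 keeps code hot, 1 prefetches it back into L2, 2 into L3, >= 3 leaves it fully flushed.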
results = umash_bench.compare_inputs(INPUT_SIZES,
                                     current=TEST,
                                     baseline=BASELINE,
                                     cflags=CFLAGS,
                                     cc=CC,
                                     min_count=20000 * len(set(INPUT_SIZES)),
                                     options={'flush_code': FLUSH_LEVEL})

TEST, BASELINE = results.keys() # Convert to the actual keys: HEAD etc. are normalised to SHAs

regrouped = dict()
for k, rev_results in results.items():
    regrouped_results = defaultdict(list)
    regrouped[k] = regrouped_results
    for size, timings in rev_results.items():
        regrouped_results[size_bucket(size)] += timings

regrouped_keys = sorted(regrouped[TEST].keys())
```

```python
# Summarise the range of latencies (in RDTSC cycles) for the two revisions and input size classes
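# Printed columns: size, total samples, samples < 100 cycles, fraction < 100 cycles, and (min, max) in cycles.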
for label, values in regrouped.items():
    print(label)
    for i in regrouped_keys:
        total = len(values[i])
        kept = sum(x < 100 for x in values[i])
        print("\t%s: %i %i %f (%i %i)" % (i, total, kept, kept / total, min(values[i]), max(values[i])))
```

```python
# Visualise the two latency distributions for each input size
for sz in regrouped_keys:
    test = list(regrouped[TEST][sz])
    baseline = list(regrouped[BASELINE][sz])
    random.shuffle(test)
    random.shuffle(baseline)
    test = test[:5000]
    baseline = baseline[:5000]
    fig = px.histogram(dict(Test=test, Baseline=baseline),
                       title="Latency for input size = %s" % sz,
                       histnorm='probability density',
                       nbins=max(test + baseline),
                       barmode="overlay",
                       opacity=0.5,
                       marginal="box")
    fig.update_xaxes(range=(0, 100 + min(test + baseline) + 50 * (sz // 100)))
    fig.show()
```

```python
# Run an exact permutation test for each input size to see if any difference is worth looking at.
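# Statistics compared: mean, "lte" probability, and the 99th percentile, the latter also with a 5-cycle handicap for the test revision (q99_sa).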
stats = [(i,
          exact_test(a=regrouped[TEST][i][:20000],  # We don't need too many data points
                     b=regrouped[BASELINE][i][:20000],
                     eps=1e-4,
                     statistics=[
                         mean("mean", .5e-3),
                         lte_prob("lte"),
                         q99("q99"),
                         q99("q99_sa", a_offset=5)  # Compare against q99 w/ A 5 cycles slower than B
                     ])
          ) for i in regrouped_keys]
```

```python
stats
```
72 changes: 68 additions & 4 deletions bench/runner.c
@@ -3,6 +3,7 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>

#include "umash.h"

@@ -31,6 +32,45 @@ setup_params(void)
	return;
}

static inline void
cpuid_barrier(void)
{
	asm volatile("cpuid\n\t" ::: "%rax", "%rdx", "%rbx", "%rcx", "memory", "cc");
	return;
}

static __attribute__((__noinline__)) void
flush_code(int level)
{
	uintptr_t begin = (uintptr_t)ID(__start_umash_code);
	uintptr_t end = (uintptr_t)ID(__stop_umash_code);

	if (level == 0)
		return;

	for (uintptr_t addr = begin; addr < end; addr += 64)
		_mm_clflush((void *)addr);

	cpuid_barrier();

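	/*
	 * Levels 1 and 2 prefetch the flushed code back toward L2 (T1 hint)
	 * or L3 (T2 hint); any higher level leaves the code fully evicted.
	 */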
	switch (level) {
	case 1:
		for (uintptr_t addr = begin; addr < end; addr += 64)
			_mm_prefetch((void *)addr, _MM_HINT_T1);
		break;
	case 2:
		for (uintptr_t addr = begin; addr < end; addr += 64)
			_mm_prefetch((void *)addr, _MM_HINT_T2);
		break;
	default:
		return;
	}

	cpuid_barrier();
	return;
}

/*
 * We use different instruction sequences for the beginning and end
 * of the timed sequence because that's what Intel recommends.
@@ -140,15 +180,36 @@ uint64_t ID(umash_bench_fp_aggregate)(
	return end - begin;
}

void ID(umash_bench_individual)(const struct bench_individual_options *options,
static struct bench_individual_options
normalize_options(const struct bench_individual_options *options)
{
	struct bench_individual_options ret = {
		.size = sizeof(ret),
	};

	if (options == NULL)
		return ret;

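	/*
	 * Copy at most the struct size the caller declared, so callers built
	 * against an older (smaller) struct keep working; fields they do not
	 * know about retain their zero defaults.
	 */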
	if (options->size < ret.size) {
		memcpy(&ret, options, options->size);
	} else {
		memcpy(&ret, options, sizeof(ret));
	}

	ret.size = sizeof(ret);
	return ret;
}

void ID(umash_bench_individual)(const struct bench_individual_options *options_ptr,
    uint64_t *restrict timings, const size_t *input_len, size_t num_trials,
    size_t max_len)
{
	struct bench_individual_options options;
	size_t bufsz = ALLOC_ALIGNMENT * (1 + (max_len + JITTER_MASK) / ALLOC_ALIGNMENT);
	char *buf;
	uint64_t seed = 0;

	(void)options;
	options = normalize_options(options_ptr);
	if (posix_memalign((void *)&buf, ALLOC_ALIGNMENT, bufsz) != 0)
		assert(0 && "Failed to allocate buffer.");

@@ -158,6 +219,7 @@ void ID(umash_bench_individual)(const struct bench_individual_options *options,
		uint64_t begin, end;
		uint64_t hash;

		flush_code(options.flush_code);
		begin = get_ticks_begin(&seed);
		seed += begin;

@@ -174,15 +236,16 @@ void ID(umash_bench_individual)(const struct bench_individual_options *options,
	return;
}

void ID(umash_bench_fp_individual)(const struct bench_individual_options *options,
void ID(umash_bench_fp_individual)(const struct bench_individual_options *options_ptr,
    uint64_t *restrict timings, const size_t *input_len, size_t num_trials,
    size_t max_len)
{
	struct bench_individual_options options;
	size_t bufsz = ALLOC_ALIGNMENT * (1 + (max_len + JITTER_MASK) / ALLOC_ALIGNMENT);
	char *buf;
	uint64_t seed = 0;

	(void)options;
	options = normalize_options(options_ptr);
	if (posix_memalign((void *)&buf, ALLOC_ALIGNMENT, bufsz) != 0)
		assert(0 && "Failed to allocate buffer.");

@@ -193,6 +256,7 @@ void ID(umash_bench_fp_individual)(const struct bench_individual_options *option
		uint64_t begin, end;
		uint64_t hash;

		flush_code(options.flush_code);
		begin = get_ticks_begin(&seed);
		seed += begin;

11 changes: 11 additions & 0 deletions bench/runner.h
@@ -20,6 +20,17 @@
*/
struct bench_individual_options {
	size_t size; /* sizeof(struct bench_individual_options) */

	/*
	 * When `flush_code` is non-zero, the benchmarking loop
	 * CLFLUSHes out all umash code before every hash call.
	 *
	 * flush_code = 0: keep code hot in cache
	 * flush_code = 1: try to move code back into L2
	 * flush_code = 2: try to move code back into L3
	 * flush_code >= 3: leave code fully flushed out.
	 */
	int flush_code;
};
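
For context, the knob is driven from Python: `compare_inputs` copies the `options` dict field by field onto this struct (see `t/umash_bench.py` below). A minimal usage sketch, assuming `umash_bench` is importable as in the notebook above:

```python
import umash_bench

# Fully flush the UMASH code out of cache before every timed hash call.
results = umash_bench.compare_inputs(
    [513] * 10000,
    current="WIP",    # revision under test
    baseline="HEAD",  # comparison revision
    options={"flush_code": 3},
)
```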

/*
9 changes: 6 additions & 3 deletions t/umash_bench.py
Expand Up @@ -46,6 +46,7 @@ def compare_inputs(
    block_size=128,
    min_count=100000,
    runner="umash_bench_individual",
    options={},
):
    """Compares the performance of two implementations for input sizes in `length_arguments`.

@@ -64,9 +65,11 @@ def make_options(target_ffi):
    timings = ffi.new("uint64_t[]", block_size)

    def make_options(target_ffi):
        options = target_ffi.new("struct bench_individual_options *")
        options.size = target_ffi.sizeof("struct bench_individual_options")
        return options
        ret = target_ffi.new("struct bench_individual_options *")
        ret.size = target_ffi.sizeof("struct bench_individual_options")
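        # Copy any caller-supplied option fields (e.g., flush_code) onto the FFI struct.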
        for field, value in options.items():
            setattr(ret, field, value)
        return ret

    implementations = [
        (0, getattr(current_lib, runner), make_options(ffi), defaultdict(list)),