Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache misses: explore prefetching to address #446 #447

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions PLANNING.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ Other tracks are stretch goals, contributions towards them are accepted.

- ARM assembly
- Finish the Nvidia GPU code generator up to MSM
- Implement a backend for prime moduli of special form with fast reduction
that don't need Montgomery form
- Implement a backend for Solinas primes like P256
- Implement an unsaturated finite fields backend for Risc-V, WASM, WebGPU, AMD GPU, Apple Metal, Vulkan, ...
- ideally in LLVM IR so that pristine Risc-V assembly can be generated
and used in zkVMs without any risk of C stdlib or syscalls being used
and without depending on the Nim compiler at build time.
- Introduce batchAffine_vartime
- Optimize square_repeated in assembly for Montgomery and Crandall/Pseudo-Mersenne primes

### User Experience track

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bench_blueprint.nim
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ proc warmup*() =
let stop = cpuTime()
echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"

warmup()
# warmup()

when defined(gcc):
echo "\nCompiled with GCC"
Expand Down
70 changes: 35 additions & 35 deletions benchmarks/bench_ec_g1.nim
Original file line number Diff line number Diff line change
Expand Up @@ -26,54 +26,54 @@ import
# ############################################################


const Iters = 10_000
const Iters = 10_000_000
const MulIters = 100
const AvailableCurves = [
# P224,
BN254_Nogami,
BN254_Snarks,
# BN254_Nogami,
# BN254_Snarks,
# Edwards25519,
# P256,
Secp256k1,
Pallas,
Vesta,
BLS12_377,
BLS12_381,
# Pallas,
# Vesta,
# BLS12_377,
# BLS12_381,
]

proc main() =
separator()
staticFor i, 0, AvailableCurves.len:
const curve = AvailableCurves[i]
addBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
separator()
affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
separator()
for numPoints in [10, 100, 1000, 10000]:
let batchIters = max(1, Iters div numPoints)
affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
separator()
for numPoints in [10, 100, 1000, 10000]:
let batchIters = max(1, Iters div numPoints)
affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
separator()
for numPoints in [10, 100, 1000, 10000]:
let batchIters = max(1, Iters div numPoints)
affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
separator()
for numPoints in [10, 100, 1000, 10000]:
let batchIters = max(1, Iters div numPoints)
affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
separator()
# addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
# addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
# mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
# mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
# mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
# doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
# doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
# doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
# separator()
# affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
# affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
# separator()
# for numPoints in [10, 100, 1000, 10000]:
# let batchIters = max(1, Iters div numPoints)
# affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
# separator()
# for numPoints in [10, 100, 1000, 10000]:
# let batchIters = max(1, Iters div numPoints)
# affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
# separator()
# for numPoints in [10, 100, 1000, 10000]:
# let batchIters = max(1, Iters div numPoints)
# affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
# separator()
# for numPoints in [10, 100, 1000, 10000]:
# let batchIters = max(1, Iters div numPoints)
# affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
# separator()
separator()

main()
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/bench_elliptic_parallel_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ export bench_elliptic_template
#
# ############################################################

proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) =
proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) {.noinline.} =
var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numInputs)

for i in 0 ..< numInputs:
Expand All @@ -59,7 +59,7 @@ type BenchMsmContext*[EC] = object
coefs: seq[BigInt[64]] # seq[getBigInt(EC.getName(), kScalarField)]
points: seq[affine(EC)]

proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] =
proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] {.noinline.} =
result.tp = Threadpool.new()
let maxNumInputs = inputSizes.max()

Expand Down Expand Up @@ -103,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
let stop = getMonotime()
stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n"

proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int) =
proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int) {.noinline.} =
const bits = 64 # EC.getScalarField().bits()
type ECaff = affine(EC)

Expand Down
42 changes: 21 additions & 21 deletions benchmarks/bench_elliptic_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func `+=`[F; G: static Subgroup](P: var EC_ShortW_JacExt[F, G], Q: EC_ShortW_Jac
func `+=`[F; G: static Subgroup](P: var EC_ShortW_JacExt[F, G], Q: EC_ShortW_Aff[F, G]) {.inline.}=
P.mixedSum_vartime(P, Q)

proc addBench*(EC: typedesc, iters: int) =
proc addBench*(EC: typedesc, iters: int) {.noinline.} =
var r {.noInit.}: EC
let P = rng.random_unsafe(EC)
let Q = rng.random_unsafe(EC)
Expand All @@ -84,11 +84,11 @@ proc addBench*(EC: typedesc, iters: int) =
block:
bench("EC Add " & $EC.G, EC, iters):
r.sum(P, Q)
block:
bench("EC Add vartime " & $EC.G, EC, iters):
r.sum_vartime(P, Q)
# block:
# bench("EC Add vartime " & $EC.G, EC, iters):
# r.sum_vartime(P, Q)

proc mixedAddBench*(EC: typedesc, iters: int) =
proc mixedAddBench*(EC: typedesc, iters: int) {.noinline.} =
var r {.noInit.}: EC
let P = rng.random_unsafe(EC)
let Q = rng.random_unsafe(EC)
Expand All @@ -106,25 +106,25 @@ proc mixedAddBench*(EC: typedesc, iters: int) =
bench("EC Mixed Addition vartime " & $EC.G, EC, iters):
r.mixedSum_vartime(P, Qaff)

proc doublingBench*(EC: typedesc, iters: int) =
proc doublingBench*(EC: typedesc, iters: int) {.noinline.} =
var r {.noInit.}: EC
let P = rng.random_unsafe(EC)
bench("EC Double " & $EC.G, EC, iters):
r.double(P)

proc affFromProjBench*(EC: typedesc, iters: int) =
proc affFromProjBench*(EC: typedesc, iters: int) {.noinline.} =
var r {.noInit.}: EC_ShortW_Aff[EC.F, EC.G]
let P = rng.random_unsafe(EC)
bench("EC Projective to Affine " & $EC.G, EC, iters):
r.affine(P)

proc affFromJacBench*(EC: typedesc, iters: int) =
proc affFromJacBench*(EC: typedesc, iters: int) {.noinline.} =
var r {.noInit.}: EC_ShortW_Aff[EC.F, EC.G]
let P = rng.random_unsafe(EC)
bench("EC Jacobian to Affine " & $EC.G, EC, iters):
r.affine(P)

proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} =
var r = newSeq[affine(EC)](numPoints)
var points = newSeq[EC](numPoints)

Expand All @@ -139,7 +139,7 @@ proc affFromProjBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, ite
for i in 0 ..< numPoints:
r[i].affine(points[i])

proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} =
var r = newSeq[affine(EC)](numPoints)
var points = newSeq[EC](numPoints)

Expand All @@ -154,7 +154,7 @@ proc affFromJacBatchBench*(EC: typedesc, numPoints: int, useBatching: bool, iter
for i in 0 ..< numPoints:
r[i].affine(points[i])

proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) =
proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -165,7 +165,7 @@ proc scalarMulGenericBench*(EC: typedesc, bits, window: static int, iters: int)
r = P
r.scalarMulGeneric(exponent, window)

proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) =
proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -176,7 +176,7 @@ proc scalarMulEndo*(EC: typedesc, bits: static int, iters: int) =
r = P
r.scalarMulEndo(exponent)

proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) =
proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -190,7 +190,7 @@ proc scalarMulEndoWindow*(EC: typedesc, bits: static int, iters: int) =
else:
{.error: "Not implemented".}

proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) =
proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -201,7 +201,7 @@ proc scalarMulVartimeDoubleAddBench*(EC: typedesc, bits: static int, iters: int)
r = P
r.scalarMul_doubleAdd_vartime(exponent)

proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) =
proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -212,7 +212,7 @@ proc scalarMulVartimeMinHammingWeightRecodingBench*(EC: typedesc, bits: static i
r = P
r.scalarMul_jy00_vartime(exponent)

proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -223,7 +223,7 @@ proc scalarMulVartimeWNAFBench*(EC: typedesc, bits, window: static int, iters: i
r = P
r.scalarMul_wNAF_vartime(exponent, window)

proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -234,14 +234,14 @@ proc scalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iter
r = P
r.scalarMulEndo_wNAF_vartime(exponent, window)

proc subgroupCheckBench*(EC: typedesc, iters: int) =
proc subgroupCheckBench*(EC: typedesc, iters: int) {.noinline.} =
var P = rng.random_unsafe(EC)
P.clearCofactor()

bench("Subgroup check", EC, iters):
discard P.isInSubgroup()

proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) =
proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: static int, iters: int) {.noinline.} =
var r {.noInit.}: EC
var P = rng.random_unsafe(EC)
P.clearCofactor()
Expand All @@ -253,7 +253,7 @@ proc subgroupCheckScalarMulVartimeEndoWNAFBench*(EC: typedesc, bits, window: sta
discard r.isInSubgroup()
r.scalarMulEndo_wNAF_vartime(exponent, window)

proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) =
proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int) {.noinline.} =
var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numPoints)

for i in 0 ..< numPoints:
Expand All @@ -271,7 +271,7 @@ proc multiAddBench*(EC: typedesc, numPoints: int, useBatching: bool, iters: int)
r += points[i]


proc msmBench*(EC: typedesc, numPoints: int, iters: int) =
proc msmBench*(EC: typedesc, numPoints: int, iters: int) {.noinline.} =
const bits = EC.getScalarField().bits()
var points = newSeq[EC_ShortW_Aff[EC.F, EC.G]](numPoints)
var scalars = newSeq[BigInt[bits]](numPoints)
Expand Down
Loading
Loading