From b7f9e1017abd82141c625054ffa3a21f116a4eca Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy
Date: Wed, 31 Jul 2024 07:25:54 +0200
Subject: [PATCH] Explore prefetching to try to fix #446

---
# NOTE(review): exploratory patch. Besides the new prefetcht0/prefetchw
# emitters and their call sites, it carries benchmark-only scaffolding that
# presumably should not be merged as-is:
#  - benchmarks/*: warmup() is disabled, Iters goes from 10_000 to
#    10_000_000, and every benchmark except the Secp256k1 EC_ShortW_Prj
#    addition is commented out.
#  - limbs_crandall.nim: the runtime hasAdx() test is commented out in four
#    functions, so the *_asm_adx procs are called unconditionally whenever
#    `UseASM_X86_64 and a.len in {3..6}` holds. Presumably this faults on
#    x86-64 CPUs without ADX/BMI2 -- restore the dispatch before merging.
#  - prefetchw is emitted without any CPU-feature guard -- TODO confirm it is
#    safe on every targeted CPU.
# NOTE(review): the line structure below (line breaks, indentation, blank
# context lines) was rebuilt from the hunk headers' line counts. Runs of
# spaces inside string literals (e.g. "EC Add ") may have been wider in the
# original -- verify against the repository before applying.
 benchmarks/bench_blueprint.nim                |  2 +-
 benchmarks/bench_ec_g1.nim                    | 70 +++++++++----------
 benchmarks/bench_elliptic_template.nim        |  6 +-
 .../assembly/limbs_asm_modular_x86.nim        | 17 +++++
 .../math/arithmetic/limbs_crandall.nim        | 24 +++----
 .../isa_x86/macro_assembler_x86_att.nim       | 12 ++++
 .../isa_x86/macro_assembler_x86_intel.nim     | 25 +++++++
 7 files changed, 105 insertions(+), 51 deletions(-)

diff --git a/benchmarks/bench_blueprint.nim b/benchmarks/bench_blueprint.nim
index 4bfe4dd2f..46cb08d0f 100644
--- a/benchmarks/bench_blueprint.nim
+++ b/benchmarks/bench_blueprint.nim
@@ -41,7 +41,7 @@ proc warmup*() =
   let stop = cpuTime()
   echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 
-warmup()
+# warmup()
 
 when defined(gcc):
   echo "\nCompiled with GCC"
diff --git a/benchmarks/bench_ec_g1.nim b/benchmarks/bench_ec_g1.nim
index 0b857337b..ecd7b3fa2 100644
--- a/benchmarks/bench_ec_g1.nim
+++ b/benchmarks/bench_ec_g1.nim
@@ -26,19 +26,19 @@ import
 # ############################################################
 
 
-const Iters = 10_000
+const Iters = 10_000_000
 const MulIters = 100
 const AvailableCurves = [
   # P224,
-  BN254_Nogami,
-  BN254_Snarks,
+  # BN254_Nogami,
+  # BN254_Snarks,
   # Edwards25519,
   # P256,
   Secp256k1,
-  Pallas,
-  Vesta,
-  BLS12_377,
-  BLS12_381,
+  # Pallas,
+  # Vesta,
+  # BLS12_377,
+  # BLS12_381,
 ]
 
 proc main() =
@@ -46,34 +46,34 @@ proc main() =
   staticFor i, 0, AvailableCurves.len:
     const curve = AvailableCurves[i]
     addBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
-    addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
-    addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
-    mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
-    mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
-    mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
-    doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
-    doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
-    doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
-    separator()
-    affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
-    affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
-    separator()
+    # addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
+    # addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
+    # mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
+    # mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
+    # mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
+    # doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
+    # doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
+    # doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
+    # separator()
+    # affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
+    # affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
+    # separator()
     separator()
 
 main()
diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim
index cbd20624b..1ae11c274 100644
--- a/benchmarks/bench_elliptic_template.nim
+++ b/benchmarks/bench_elliptic_template.nim
@@ -84,9 +84,9 @@ proc addBench*(EC: typedesc, iters: int) {.noinline.} =
   block:
     bench("EC Add " & $EC.G, EC, iters):
       r.sum(P, Q)
-  block:
-    bench("EC Add vartime " & $EC.G, EC, iters):
-      r.sum_vartime(P, Q)
+  # block:
+  #   bench("EC Add vartime " & $EC.G, EC, iters):
+  #     r.sum_vartime(P, Q)
 
 proc mixedAddBench*(EC: typedesc, iters: int) {.noinline.} =
   var r {.noInit.}: EC
diff --git a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
index a68830d94..671497161 100644
--- a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
+++ b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
@@ -41,6 +41,9 @@ proc finalSubNoOverflowImpl*(
   if not a_in_scratch:
     ctx.mov scratch[0], a[0]
   ctx.sub scratch[0], M[0]
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
   for i in 1 ..< N:
     if not a_in_scratch:
       ctx.mov scratch[i], a[i]
@@ -75,6 +78,9 @@ proc finalSubMayOverflowImpl*(
   if not a_in_scratch:
     ctx.mov scratch[0], a[0]
   ctx.sub scratch[0], M[0]
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
   for i in 1 ..< N:
     if not a_in_scratch:
       ctx.mov scratch[i], a[i]
@@ -156,6 +162,9 @@ macro addmod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[
   # Addition
   ctx.add u[0], b[0]
   ctx.mov v[0], u[0]
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetcht0 M
   for i in 1 ..< N:
     ctx.adc u[i], b[i]
     # Interleaved copy in a second buffer as well
@@ -214,6 +223,10 @@ macro submod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[
   let underflowed = b.reuseRegister()
   ctx.sbb underflowed, underflowed
 
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
+
   # Now mask the adder, with 0 or the modulus limbs
   for i in 0 ..< N:
     ctx.`and` v[i], underflowed
@@ -264,6 +277,10 @@ macro negmod_gen[N: static int](r_PIR: var Limbs[N], a_MEM, M_MEM: Limbs[N]): un
     ctx.mov u[i], M[i]
     ctx.sbb u[i], a[i]
 
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
+
   # Deal with a == 0
   ctx.mov isZero, a[0]
   for i in 1 ..< N:
diff --git a/constantine/math/arithmetic/limbs_crandall.nim b/constantine/math/arithmetic/limbs_crandall.nim
index 83915d6c6..d3e4f292a 100644
--- a/constantine/math/arithmetic/limbs_crandall.nim
+++ b/constantine/math/arithmetic/limbs_crandall.nim
@@ -189,10 +189,10 @@ func mulCranPartialReduce[N: static int](
         m: static int, c: static SecretWord) {.inline.} =
   when UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.mulCranPartialReduce_asm_adx(a, b, m, c)
-    else:
-      r.mulCranPartialReduce_asm(a, b, m, c)
+    # else:
+    #   r.mulCranPartialReduce_asm(a, b, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.prod(a, b)
@@ -208,10 +208,10 @@ func mulCran*[N: static int](
     r.mulCranPartialReduce(a, b, m, c)
   elif UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.mulCran_asm_adx(a, b, p, m, c)
-    else:
-      r.mulCran_asm(a, b, p, m, c)
+    # else:
+    #   r.mulCran_asm(a, b, p, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.prod(a, b)
@@ -224,10 +224,10 @@ func squareCranPartialReduce[N: static int](
         m: static int, c: static SecretWord) {.inline.} =
   when UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.squareCranPartialReduce_asm_adx(a, m, c)
-    else:
-      r.squareCranPartialReduce_asm(a, m, c)
+    # else:
+    #   r.squareCranPartialReduce_asm(a, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.square(a)
@@ -243,10 +243,10 @@ func squareCran*[N: static int](
     r.squareCranPartialReduce(a, m, c)
   elif UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.squareCran_asm_adx(a, p, m, c)
-    else:
-      r.squareCran_asm(a, p, m, c)
+    # else:
+    #   r.squareCran_asm(a, p, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.square(a)
diff --git a/constantine/platforms/isa_x86/macro_assembler_x86_att.nim b/constantine/platforms/isa_x86/macro_assembler_x86_att.nim
index e95f3eafa..a406aae92 100644
--- a/constantine/platforms/isa_x86/macro_assembler_x86_att.nim
+++ b/constantine/platforms/isa_x86/macro_assembler_x86_att.nim
@@ -821,6 +821,18 @@ func setc*(a: var Assembler_x86, dst: Register) =
   a.code &= "setc " & Reg8Low[dst] & '\n'
   # No flags affected
 
+func prefetcht0*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Retrieve memory in all cache levels for reading
+  let loc = a.getStrOffset(mem[0])
+  a.code &= "prefetcht0 " & loc & '\n'
+  # No flags affected
+
+func prefetchw*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Retrieve memory in all cache levels for writing
+  let loc = a.getStrOffset(mem[0])
+  a.code &= "prefetchw " & loc & '\n'
+  # No flags affected
+
 func add*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- dst + src
   doAssert dst.isOutput()
diff --git a/constantine/platforms/isa_x86/macro_assembler_x86_intel.nim b/constantine/platforms/isa_x86/macro_assembler_x86_intel.nim
index e9bb3c443..ede8a0b6e 100644
--- a/constantine/platforms/isa_x86/macro_assembler_x86_intel.nim
+++ b/constantine/platforms/isa_x86/macro_assembler_x86_intel.nim
@@ -788,6 +788,31 @@ func setc*(a: var Assembler_x86, dst: Register) =
   a.code &= "setc " & Reg8Low[dst] & '\n'
   # No flags affected
 
+func getPrefetchLoc(mem: Operand or OperandArray): string =
+  let mem = mem[0]
+  if mem.desc.rm in {Mem, MemOffsettable}:
+    return "BYTE ptr %" & mem.desc.asmId
+  elif mem.desc.rm == PointerInReg or
+       mem.desc.rm in SpecificRegisters or
+       (mem.desc.rm == ElemsInReg and mem.kind == kFromArray):
+    return "BYTE ptr [%" & mem.desc.asmId & "]"
+  elif mem.desc.rm == ClobberedReg:
+    return "BYTE ptr [" & mem.desc.asmId & "]"
+  else:
+    error("Unsupported memory operand type for prefetch: " & mem.repr)
+
+func prefetcht0*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Retrieve memory in all cache levels for reading
+  let loc = getPrefetchLoc(mem)
+  a.code &= "prefetcht0 " & loc & '\n'
+  # No flags affected
+
+func prefetchw*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Retrieve memory in all cache levels for writing
+  let loc = getPrefetchLoc(mem)
+  a.code &= "prefetchw " & loc & '\n'
+  # No flags affected
+
 func add*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- dst + src
   doAssert dst.isOutput()