From d1e8aa22d34bfa55a02620f4001fc23b3d9041e8 Mon Sep 17 00:00:00 2001 From: Robbin Ehn Date: Wed, 22 Nov 2023 15:31:23 +0100 Subject: [PATCH] Share code --- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 1044 +++++------------ 1 file changed, 273 insertions(+), 771 deletions(-) diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 6827448fe8c4c..7d0b5e42e2179 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -3659,19 +3659,20 @@ class StubGenerator: public StubCodeGenerator { return entry; } }; -#endif // COMPILER2 - - // Arguments: - // - // Inputs: - // c_rarg0 - byte[] source+offset - // c_rarg1 - int[] SHA.state - // c_rarg2 - int offset - // c_rarg3 - int limit - // - address generate_sha256_implCompress(bool multi_block, const char *name) { - static const uint32_t round_consts[64] = { +#undef __ +#define __ this-> + class Sha2Generator : public MacroAssembler { + public: + Sha2Generator(MacroAssembler* masm) : MacroAssembler(masm->code()) {} + address generate_sha256_implCompress(bool multi_block) { + return generate_sha2_implCompress(multi_block); + } + address generate_sha512_implCompress(bool multi_block) { + return generate_sha2_implCompress(multi_block); + } + private: + const uint32_t round_consts_256[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -3689,424 +3690,7 @@ class StubGenerator: public StubCodeGenerator { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", name); - address start = __ pc(); - - Register buf = c_rarg0; - Register state = c_rarg1; - Register ofs = c_rarg2; - Register limit = c_rarg3; - Register consts = t0; - - Label multi_block_loop; - - __ enter(); - - // Register use in this function: - // - // VECTORS - // v10 - v13 (512-bits / 4*128 bits / 4*4*32 bits), hold the message - // schedule words (Wt). They start with the message block - // content (W0 to W15), then further words in the message - // schedule generated via vsha2ms from previous Wt. - // Initially: - // v10 = W[ 3:0] = { W3, W2, W1, W0} - // v11 = W[ 7:4] = { W7, W6, W5, W4} - // v12 = W[ 11:8] = {W11, W10, W9, W8} - // v13 = W[15:12] = {W15, W14, W13, W12} - // - // v16 - v17 hold the working state variables (a, b, ..., h) - // v16 = {a[t],b[t],e[t],f[t]} - // v17 = {c[t],d[t],g[t],h[t]} - // Initially: - // v16 = {H5i-1, H4i-1, H1i-1 , H0i-1} - // v17 = {H7i-i, H6i-1, H3i-1 , H2i-1} - // - // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. - // - // v14 = temporary, Wt+Kt - // v15 = temporary, Kt - // - // v18/v19 = temporaries, in the epilogue, to re-arrange - // and byte-swap v16/v17 - // - // v26/v27 = hold the initial values of the hash, byte-swapped. - // - // v30/v31 = used to generate masks, vrgather indices. - // - // During most of the function the vector state is configured so that each - // vector is interpreted as containing four 32 bits (e32) elements (128 bits). - - // Set vectors as 4 * 32 bits - // - // e32: vector of 32b/4B elements - // m1: LMUL=1 - // ta: tail agnostic (don't care about those lanes) - // ma: mask agnostic (don't care about those lanes) - // x0 is not written, we known the number of vector elements, 8. 
- __ vsetivli(x0, 4, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta); - - // Load H[0..8] to produce - // v16 = {a,b,e,f} - // v17 = {c,d,g,h} - __ vle32_v(v16, state); // v16 = {d,c,b,a} - __ addi(state, state, 16); - __ vle32_v(v17, state); // v17 = {h,g,f,e} - - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {a,b,c,d} - __ vrgather_vv(v27, v17, v30); // v27 = {e,f,g,h} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({d,c}) into elements [3..2] of v17. - __ vslideup_vi(v17, v26, 2); // v17 = {c,d,_,_} - // Merge elements [1..0] of v27 ({g,h}) into elements [1..0] of v17 - __ vmerge_vvm(v17, v17, v27); // v17 = {c,d,g,h} - // Copy elements [1..0] of v27 ({f,e}) into elements [1..0] of v16. - __ vslidedown_vi(v16, v27, 2); // v16 = {_,_,e,f} - // Merge elements [3..2] of v26 ({a,b}) into elements [3..2] of v16 - __ vmerge_vvm(v16, v26, v16); // v16 = {a,b,e,f} - - __ bind(multi_block_loop); - - // Capture the initial H values in v26 and v27 to allow for computing - // the resulting H', since H' = H+{a',b',c',...,h'}. - __ vmv_v_v(v26, v16); - __ vmv_v_v(v27, v17); - - // Load the 512-bits of the message block in v10-v13 and perform - // an endian swap on each 4 bytes element. - // - // If Zvkb is not implemented, one can use vrgather with the right index - // sequence. It requires loading in separate registers since the destination - // of vrgather cannot overlap the source. - // // We generate the lane (byte) index sequence - // // v24 = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] - // // gives us "N ^ 3" as a nice formula to generate - // // this sequence. 'vid' gives us the N. - // // - // // We switch the vector type to SEW=8 temporarily. - // vsetivli x0, 16, e8, m1, ta, ma - // vid.v v24 - // vxor.vi v24, v24, 0x3 - // // Byteswap the bytes in each word of the text. - // vrgather.vv v10, v20, v24 - // vrgather.vv v11, v21, v24 - // vrgather.vv v12, v22, v24 - // vrgather.vv v13, v23, v24 - // // Switch back to SEW=32 - // vsetivli x0, 4, e32, m1, ta, ma - __ vle32_v(v10, buf); - __ vrev8_v(v10, v10); - __ addi(buf, buf, 16); - __ vle32_v(v11, buf); - __ vrev8_v(v11, v11); - __ addi(buf, buf, 16); - __ vle32_v(v12, buf); - __ vrev8_v(v12, v12); - __ addi(buf, buf, 16); - __ vle32_v(v13, buf); - __ vrev8_v(v13, v13); - __ addi(buf, buf, 16); - - // Set v0 up for the vmerge that replaces the first word (idx==0) - __ vid_v(v0); - __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) - - __ la(consts, ExternalAddress((address)round_consts)); - - // Overview of the logic in each "quad round". - // - // The code below repeats 16 times the logic implementing four rounds - // of the SHA-256 core loop as documented by NIST. 16 "quad rounds" - // to implementing the 64 single rounds. - // - // // Load four word (u32) constants (K[t+3], K[t+2], K[t+1], K[t+0]) - // // Output: - // // v15 = {K[t+3], K[t+2], K[t+1], K[t+0]} - // vl1re32.v v15, ofs - // - // // Increment word contant address by stride (16 bytes, 4*4B, 128b) - // addi ofs, ofs, 16 - // - // // Add constants to message schedule words: - // // Input - // // v15 = {K[t+3], K[t+2], K[t+1], K[t+0]} - // // v10 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; - // // Output - // // v14 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} - // vadd.vv v14, v15, v10 - // - // // 2 rounds of working variables updates. 
- // // v17[t+4] <- v17[t], v16[t], v14[t] - // // Input: - // // v17 = {c[t],d[t],g[t],h[t]} " = v17[t] " - // // v16 = {a[t],b[t],e[t],f[t]} - // // v14 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} - // // Output: - // // v17 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = v16[t+2] " - // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = v17[t+4] " - // vsha2cl.vv v17, v16, v14 - // - // // 2 rounds of working variables updates. - // // v16[t+4] <- v16[t], v16[t+2], v14[t] - // // Input - // // v16 = {a[t],b[t],e[t],f[t]} " = v16[t] " - // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = v17[t+2] " - // // v17 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = v16[t+2] " - // // v14 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} - // // Output: - // // v16 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = v16[t+4] " - // vsha2ch.vv v16, v17, v14 - // - // // Combine 2QW into 1QW - // // - // // To generate the next 4 words, "new_v10"/"v14" from v10-v13, vsha2ms needs - // // v10[0..3], v11[0], v12[1..3], v13[0, 2..3] - // // and it can only take 3 vectors as inputs. Hence we need to combine - // // v11[0] and v12[1..3] in a single vector. - // // - // // vmerge Vt4, Vt1, Vt2, V0 - // // Input - // // V0 = mask // first word from v12, 1..3 words from v11 - // // V12 = {Wt-8, Wt-7, Wt-6, Wt-5} - // // V11 = {Wt-12, Wt-11, Wt-10, Wt-9} - // // Output - // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5} - // vmerge.vvm v14, v12, v11, v0 - // - // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) - // // Input - // // V10 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] - // // V13 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] - // // V14 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] - // // Output (next four message schedule words) - // // v10 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] - // vsha2ms.vv v10, v14, v13 - // - // BEFORE - // v10 - v13 hold the message schedule words (initially the block words) - // v10 = W[ 3: 0] "oldest" - // v11 = W[ 7: 4] - // v12 = W[11: 8] - // v13 = W[15:12] "newest" - // - // vt6 - vt7 hold the working state variables - // v16 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} - // v17 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} - // - // AFTER - // v10 - v13 hold the message schedule words (initially the block words) - // v11 = W[ 7: 4] "oldest" - // v12 = W[11: 8] - // v13 = W[15:12] - // v10 = W[19:16] "newest" - // - // v16 and v17 hold the working state variables - // v16 = {a[t+4],b[t+4],e[t+4],f[t+4]} - // v17 = {c[t+4],d[t+4],g[t+4],h[t+4]} - // - // The group of vectors v10,v11,v12,v13 is "rotated" by one in each quad-round, - // hence the uses of those vectors rotate in each round, and we get back to the - // initial configuration every 4 quad-rounds. We could avoid those changes at - // the cost of moving those vectors at the end of each quad-rounds. 
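// Scalar reference for the message-schedule step that each vsha2ms.vv in the
// quad-rounds below performs, following the FIPS 180-4 SHA-256 recurrence.
// This is an illustrative sketch only: sha256_ms_quad and its operand layout
// are made-up names, not HotSpot or stub code.
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint32_t sig0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
static inline uint32_t sig1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

// Given the previous 16 schedule words W[t-16..t-1] in w[0..15] (w[0] oldest),
// compute the next four words W[t..t+3] -- the "Generate W[19:16]" etc. steps.
static void sha256_ms_quad(const uint32_t w[16], uint32_t out[4]) {
  uint32_t tmp[20];
  for (int i = 0; i < 16; i++) tmp[i] = w[i];
  for (int i = 16; i < 20; i++) {
    // Note that tmp[18] and tmp[19] already depend on the freshly
    // generated tmp[16]/tmp[17], which is why the vector form needs
    // all of W[15:12], W[11:9,4] and W[3:0] as inputs.
    tmp[i] = sig1(tmp[i - 2]) + tmp[i - 7] + sig0(tmp[i - 15]) + tmp[i - 16];
  }
  for (int i = 0; i < 4; i++) out[i] = tmp[16 + i];
}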
- - //-------------------------------------------------------------------------------- - // Quad-round 0 (+0, v10->v11->v12->v13) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); // Generate W[19:16] - //-------------------------------------------------------------------------------- - // Quad-round 1 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); // Generate W[23:20] - //-------------------------------------------------------------------------------- - // Quad-round 2 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); // Generate W[27:24] - //-------------------------------------------------------------------------------- - // Quad-round 3 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); // Generate W[31:28] - - //-------------------------------------------------------------------------------- - // Quad-round 4 (+0, v10->v11->v12->v13) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); // Generate W[35:32] - //-------------------------------------------------------------------------------- - // Quad-round 5 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); // Generate W[39:36] - //-------------------------------------------------------------------------------- - // Quad-round 6 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); // Generate W[43:40] - //-------------------------------------------------------------------------------- - // Quad-round 7 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); // Generate W[47:44] - - //-------------------------------------------------------------------------------- - // Quad-round 8 (+0, v10->v11->v12->v13) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); // Generate W[51:48] - //-------------------------------------------------------------------------------- - // Quad-round 9 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ 
vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); // Generate W[55:52] - //-------------------------------------------------------------------------------- - // Quad-round 10 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); // Generate W[59:56] - //-------------------------------------------------------------------------------- - // Quad-round 11 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); // Generate W[63:60] - - //-------------------------------------------------------------------------------- - // Quad-round 12 (+0, v10->v11->v12->v13) - // Note that we stop generating new message schedule words (Wt, v10-13) - // as we already generated all the words we end up consuming (i.e., W[63:60]). - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - //-------------------------------------------------------------------------------- - // Quad-round 13 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - //-------------------------------------------------------------------------------- - // Quad-round 14 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - //-------------------------------------------------------------------------------- - // Quad-round 15 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - // No consts increment needed - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - - //-------------------------------------------------------------------------------- - // Compute the updated hash value H' - // H' = H + {h',g',...,b',a'} - // = {h,g,...,b,a} + {h',g',...,b',a'} - // = {h+h',g+g',...,b+b',a+a'} - - __ vadd_vv(v16, v26, v16); - __ vadd_vv(v17, v27, v17); - - if (multi_block) { - __ add(ofs, ofs, 64); - __ ble(ofs, limit, multi_block_loop); - __ mv(c_rarg0, ofs); // return ofs - } - - // Store H[0..8] = {a,b,c,d,e,f,g,h} from - // v16 = {f,e,b,a} - // v17 = {h,g,d,c} - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {f,e,b,a} - __ vrgather_vv(v27, v17, v30); // v27 = {h,g,d,c} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({f,e}) into elements [1..0] of v17. - __ vslidedown_vi(v17, v26, 2); // v17 = {_,_,f,e} - // Merge elements [3..2] of v27 ({g,h}) into elements [3..2] of v17 - __ vmerge_vvm(v17, v27, v17); // v17 = {h,g,f,e} - // Copy elements [1..0] of v27 ({c,d}) into elements [3..2] of v16. 
- __ vslideup_vi(v16, v27, 2); // v16 = {d,c,_,_} - // Merge elements [1..0] of v26 ({a,b}) into elements [1..0] of v16 - __ vmerge_vvm(v16, v16, v26); // v16 = {d,c,b,a} - - // Save the hash - __ vse32_v(v17, state); - __ addi(state, state, -16); - __ vse32_v(v16, state); - - __ leave(); - __ ret(); - - return start; - } - - // Arguments: - // - // Inputs: - // c_rarg0 - byte[] source+offset - // c_rarg1 - int[] SHA.state - // c_rarg2 - int offset - // c_rarg3 - int limit - // - address generate_sha512_implCompress(bool multi_block, const char *name) { - static const uint64_t round_consts[80] = { + const uint64_t round_consts_512[80] = { 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, @@ -4135,132 +3719,38 @@ class StubGenerator: public StubCodeGenerator { 0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al, 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l }; - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", name); - address start = __ pc(); - - Register buf = c_rarg0; - Register state = c_rarg1; - Register ofs = c_rarg2; - Register limit = c_rarg3; - Register consts = t0; - - Label multi_block_loop; - __ enter(); + template + void vl1reXX_v(VectorRegister vr, Register sr) { + if (T == Assembler::e32) __ vl1re32_v(vr, sr); + else __ vl1re64_v(vr, sr); + } - // Register use in this function: - // - // VECTORS - // v10 - v13 (1024-bits / 4*256 bits / 4*4*64 bits), hold the message - // schedule words (Wt). They start with the message block - // content (W0 to W15), then further words in the message - // schedule generated via vsha2ms from previous Wt. - // Initially: - // v10 = W[ 3:0] = { W3, W2, W1, W0} - // v11 = W[ 7:4] = { W7, W6, W5, W4} - // v12 = W[ 11:8] = {W11, W10, W9, W8} - // v13 = W[15:12] = {W15, W14, W13, W12} - // - // v16 - v17 hold the working state variables (a, b, ..., h) - // v16 = {f[t],e[t],b[t],a[t]} - // v17 = {h[t],g[t],d[t],c[t]} - // Initially: - // v16 = {H5i-1, H4i-1, H1i-1 , H0i-1} - // v17 = {H7i-i, H6i-1, H3i-1 , H2i-1} - // - // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. - // - // v14 = temporary, Wt+Kt - // v15 = temporary, Kt - // - // v18/v19 = temporaries, in the epilogue, to re-arrange - // and byte-swap v16/v17 - // - // v26/v27 = hold the initial values of the hash, byte-swapped. - // - // v30/v31 = used to generate masks, vrgather indices. - // - // During most of the function the vector state is configured so that each - // vector is interpreted as containing four 64 bits (e64) elements (256 bits). + template + void vleXX_v(VectorRegister vr, Register sr) { + if (T == Assembler::e32) __ vle32_v(vr, sr); + else __ vle64_v(vr, sr); + } - // Set vectors as 4 * 64 - // - // e64: vector of 64b/8B elements - // m1: LMUL=1 - // ta: tail agnostic (don't care about those lanes) - // ma: mask agnostic (don't care about those lanes) - // x0 is not written, we known the number of vector elements, 2. 
- __ vsetivli(x0, 4, Assembler::e64, Assembler::m1, Assembler::ma, Assembler::ta); - - // Load H[0..8] to produce - // v16 = {a,b,e,f} - // v17 = {c,d,g,h} - __ vle64_v(v16, state); // v16 = {d,c,b,a} - __ addi(state, state, 32); - __ vle64_v(v17, state); // v17 = {h,g,f,e} - - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {a,b,c,d} - __ vrgather_vv(v27, v17, v30); // v27 = {e,f,g,h} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({d,c}) into elements [3..2] of v17. - __ vslideup_vi(v17, v26, 2); // v17 = {c,d,_,_} - // Merge elements [1..0] of v27 ({g,h}) into elements [1..0] of v17 - __ vmerge_vvm(v17, v17, v27); // v17 = {c,d,g,h} - // Copy elements [1..0] of v27 ({f,e}) into elements [1..0] of v16. - __ vslidedown_vi(v16, v27, 2); // v16 = {_,_,e,f} - // Merge elements [3..2] of v26 ({a,b}) into elements [3..2] of v16 - __ vmerge_vvm(v16, v26, v16); // v16 = {a,b,e,f} - - __ bind(multi_block_loop); - - // Capture the initial H values in v26 and v27 to allow for computing - // the resulting H', since H' = H+{a',b',c',...,h'}. - __ vmv_v_v(v26, v16); - __ vmv_v_v(v27, v17); - - // Load the 1024-bits of the message block in v10-v13 and perform - // an endian swap on each 8 bytes element. - // - // If Zvkb is not implemented, similar to SHA-256, one can use vrgather - // with an index sequence to byte-swap. - // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] - // gives us "N ^ 3" as a nice formula to generate - // this sequence. 'vid' gives us the N. - __ vle64_v(v10, buf); - __ vrev8_v(v10, v10); - __ add(buf, buf, 32); - __ vle64_v(v11, buf); - __ vrev8_v(v11, v11); - __ add(buf, buf, 32); - __ vle64_v(v12, buf); - __ vrev8_v(v12, v12); - __ add(buf, buf, 32); - __ vle64_v(v13, buf); - __ vrev8_v(v13, v13); - __ add(buf, buf, 32); - - // Set v0 up for the vmerge that replaces the first word (idx==0) - __ vid_v(v0); - __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) - - __ la(consts, ExternalAddress((address)round_consts)); + template + void vseXX_v(VectorRegister vr, Register sr) { + if (T == Assembler::e32) __ vse32_v(vr, sr); + else __ vse64_v(vr, sr); + } // Overview of the logic in each "quad round". // - // The code below repeats 20 times the logic implementing four rounds - // of the SHA-512 core loop as documented by NIST. 20 "quad rounds" - // to implementing the 80 single rounds. + // The code below repeats 16/20 times the logic implementing four rounds + // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" + // to implementing the 64/80 single rounds. // - // // Load four word (u64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) + // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) // // Output: // // v15 = {K[t+3], K[t+2], K[t+1], K[t+0]} - // vl1re32.v v15, (a2) + // vl1reXX.v v15, ofs // - // // Increment word contant address by stride (32 bytes, 4*8B, 256b) - // addi consts, consts, 32 + // // Increment word contant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) + // addi ofs, ofs, 16/32 // // // Add constants to message schedule words: // // Input @@ -4343,232 +3833,228 @@ class StubGenerator: public StubCodeGenerator { // hence the uses of those vectors rotate in each round, and we get back to the // initial configuration every 4 quad-rounds. We could avoid those changes at // the cost of moving those vectors at the end of each quad-rounds. 
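// Scalar reference for the working-variable updates done by the
// vsha2cl.vv / vsha2ch.vv pair: each instruction advances the state by two
// FIPS 180-4 rounds, so one quad-round advances it by four. A minimal sketch
// for SHA-256 only; sha256_rounds4 and the wk[] operand (W[t..t+3]+K[t..t+3])
// are illustrative names, not part of the generated stub.
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

// st[8] = {a,b,c,d,e,f,g,h}; wk[4] = {W[t]+K[t], ..., W[t+3]+K[t+3]}
// (the vadd.vv of v15 and the current Wt group). Advances the working
// state by four rounds, as one quad-round does.
static void sha256_rounds4(uint32_t st[8], const uint32_t wk[4]) {
  for (int i = 0; i < 4; i++) {
    uint32_t t1 = st[7] + Sigma1(st[4]) + Ch(st[4], st[5], st[6]) + wk[i];
    uint32_t t2 = Sigma0(st[0]) + Maj(st[0], st[1], st[2]);
    st[7] = st[6]; st[6] = st[5]; st[5] = st[4]; st[4] = st[3] + t1;
    st[3] = st[2]; st[2] = st[1]; st[1] = st[0]; st[0] = t1 + t2;
  }
}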
+ template + void sha2_quad_round(VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, + Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister vtemp3, VectorRegister vtemp4, + bool gen_words = true, bool step_const = true) { + __ vl1reXX_v(vtemp, scalarconst); + if (step_const) { + __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32); + } + __ vadd_vv(vtemp2, vtemp, rot1); + __ vsha2cl_vv(vtemp4, vtemp3, vtemp2); + __ vsha2ch_vv(vtemp3, vtemp4, vtemp2); + if ((vset_sew == Assembler::e64 && step_const) || gen_words) { + __ vmerge_vvm(vtemp2, rot3, rot2); + } + if (gen_words) { + __ vsha2ms_vv(rot1, vtemp2, rot4); + } + } - //-------------------------------------------------------------------------------- - // Quad-round 0 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 1 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 2 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 3 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- - // Quad-round 4 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 5 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 6 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 7 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- 
- // Quad-round 8 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 9 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 10 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 11 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- - // Quad-round 12 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 13 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 14 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 15 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- - // Quad-round 16 (+0, v10->v11->v12->v13) - // Note that we stop generating new message schedule words (Wt, v10-13) - // as we already generated all the words we end up consuming (i.e., W[79:76]). 
- __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - //-------------------------------------------------------------------------------- - // Quad-round 17 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - //-------------------------------------------------------------------------------- - // Quad-round 18 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - //-------------------------------------------------------------------------------- - // Quad-round 19 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - // No consts increment needed. - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - - //-------------------------------------------------------------------------------- - // Compute the updated hash value H' - // H' = H + {h',g',...,b',a'} - // = {h,g,...,b,a} + {h',g',...,b',a'} - // = {h+h',g+g',...,b+b',a+a'} - - // H' = H+{a',b',c',...,h'} - __ vadd_vv(v16, v26, v16); - __ vadd_vv(v17, v27, v17); + // Arguments: + // + // Inputs: + // c_rarg0 - byte[] source+offset + // c_rarg1 - int[] SHA.state + // c_rarg2 - int offset + // c_rarg3 - int limit + // + template + address generate_sha2_implCompress(bool multi_block) { + constexpr int const_add = vset_sew == Assembler::e32 ? 16 : 32; - if (multi_block) { - __ add(ofs, ofs, 128); - __ ble(ofs, limit, multi_block_loop); - __ mv(c_rarg0, ofs); // return ofs - } + __ align(CodeEntryAlignment); + address start = __ pc(); - // Store H[0..8] = {a,b,c,d,e,f,g,h} from - // v16 = {f,e,b,a} - // v17 = {h,g,d,c} - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {f,e,b,a} - __ vrgather_vv(v27, v17, v30); // v27 = {h,g,d,c} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({f,e}) into elements [1..0] of v17. - __ vslidedown_vi(v17, v26, 2); // v17 = {_,_,f,e} - // Merge elements [3..2] of v27 ({g,h}) into elements [3..2] of v17 - __ vmerge_vvm(v17, v27, v17); // v17 = {h,g,f,e} - // Copy elements [1..0] of v27 ({c,d}) into elements [3..2] of v16. - __ vslideup_vi(v16, v27, 2); // v16 = {d,c,_,_} - // Merge elements [1..0] of v26 ({a,b}) into elements [1..0] of v16 - __ vmerge_vvm(v16, v16, v26); // v16 = {d,c,b,a} - - // Save the hash - __ vse64_v(v17, state); - __ addi(state, state, -32); - __ vse64_v(v16, state); + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + Register consts = t2; - __ leave(); - __ ret(); + Label multi_block_loop; - return start; - } + __ enter(); + // Register use in this function: + // + // VECTORS + // v10 - v13 (512/1024-bits / 4*128/256 bits / 4*4*32/65 bits), hold the message + // schedule words (Wt). They start with the message block + // content (W0 to W15), then further words in the message + // schedule generated via vsha2ms from previous Wt. 
+ // Initially: + // v10 = W[ 3:0] = { W3, W2, W1, W0} + // v11 = W[ 7:4] = { W7, W6, W5, W4} + // v12 = W[ 11:8] = {W11, W10, W9, W8} + // v13 = W[15:12] = {W15, W14, W13, W12} + // + // v16 - v17 hold the working state variables (a, b, ..., h) + // v16 = {f[t],e[t],b[t],a[t]} + // v17 = {h[t],g[t],d[t],c[t]} + // Initially: + // v16 = {H5i-1, H4i-1, H1i-1 , H0i-1} + // v17 = {H7i-i, H6i-1, H3i-1 , H2i-1} + // + // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. + // + // v14 = temporary, Wt+Kt + // v15 = temporary, Kt + // + // v18/v19 = temporaries, in the epilogue, to re-arrange + // and byte-swap v16/v17 + // + // v26/v27 = hold the initial values of the hash, byte-swapped. + // + // v30/v31 = used to generate masks, vrgather indices. + // + // During most of the function the vector state is configured so that each + // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits). + + // Set vectors as 4 * 32/64 bits + // + // e32/e64: vector of 32b/64b/4B/8B elements + // m1: LMUL=1 + // ta: tail agnostic (don't care about those lanes) + // ma: mask agnostic (don't care about those lanes) + // x0 is not written, we known the number of vector elements. + __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); + + // Load H[0..8] to produce + // v16 = {a,b,e,f} + // v17 = {c,d,g,h} + __ vleXX_v(v16, state); // v16 = {d,c,b,a} + __ addi(state, state, const_add); + __ vleXX_v(v17, state); // v17 = {h,g,f,e} + + __ vid_v(v30); // v30 = {3,2,1,0} + __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} + __ vrgather_vv(v26, v16, v30); // v26 = {a,b,c,d} + __ vrgather_vv(v27, v17, v30); // v27 = {e,f,g,h} + __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} + // Copy elements [3..2] of v26 ({d,c}) into elements [3..2] of v17. + __ vslideup_vi(v17, v26, 2); // v17 = {c,d,_,_} + // Merge elements [1..0] of v27 ({g,h}) into elements [1..0] of v17 + __ vmerge_vvm(v17, v17, v27); // v17 = {c,d,g,h} + // Copy elements [1..0] of v27 ({f,e}) into elements [1..0] of v16. + __ vslidedown_vi(v16, v27, 2); // v16 = {_,_,e,f} + // Merge elements [3..2] of v26 ({a,b}) into elements [3..2] of v16 + __ vmerge_vvm(v16, v26, v16); // v16 = {a,b,e,f} + + __ bind(multi_block_loop); + + // Capture the initial H values in v26 and v27 to allow for computing + // the resulting H', since H' = H+{a',b',c',...,h'}. + __ vmv_v_v(v26, v16); + __ vmv_v_v(v27, v17); + + // Load the 512/1024-bits of the message block in v10-v13 and perform + // an endian swap on each 4/8 bytes element. + // + // If Zvkb is not implemented one can use vrgather + // with an index sequence to byte-swap. + // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] + // gives us "N ^ 3" as a nice formula to generate + // this sequence. 'vid' gives us the N. + __ vleXX_v(v10, buf); + __ vrev8_v(v10, v10); + __ addi(buf, buf, const_add); + __ vleXX_v(v11, buf); + __ vrev8_v(v11, v11); + __ addi(buf, buf, const_add); + __ vleXX_v(v12, buf); + __ vrev8_v(v12, v12); + __ addi(buf, buf, const_add); + __ vleXX_v(v13, buf); + __ vrev8_v(v13, v13); + __ addi(buf, buf, const_add); + + // Set v0 up for the vmerge that replaces the first word (idx==0) + __ vid_v(v0); + __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) + + __ la(consts, ExternalAddress(vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512)); + + VectorRegister rotation_regs[] = {v10, v11, v12, v13}; + int rot_pos = 0; + // Quad-round #0 (+0, v10->v11->v12->v13) ... 
#11 (+3, v13->v10->v11->v12) + constexpr int qr_end = vset_sew == Assembler::e32 ? 12 : 16; + for (int i = 0; i < qr_end; i++) { + BLOCK_COMMENT("QUAD ROUND"); + sha2_quad_round + (rotation_regs[(rot_pos + 0) & 0x3], + rotation_regs[(rot_pos + 1) & 0x3], + rotation_regs[(rot_pos + 2) & 0x3], + rotation_regs[(rot_pos + 3) & 0x3], + consts, + v15, v14, v16, v17); + ++rot_pos; + } + // Quad-round #12 (+0, v10->v11->v12->v13) ... #15 (+3, v13->v10->v11->v12) + // Note that we stop generating new message schedule words (Wt, v10-13) + // as we already generated all the words we end up consuming (i.e., W[63:60]). + constexpr int qr_c_end = qr_end + 4; + for (int i = qr_end; i < qr_c_end; i++) { + BLOCK_COMMENT("QUAD ROUND CONSUME"); + sha2_quad_round + (rotation_regs[(rot_pos + 0) & 0x3], + rotation_regs[(rot_pos + 1) & 0x3], + rotation_regs[(rot_pos + 2) & 0x3], + rotation_regs[(rot_pos + 3) & 0x3], + consts, + v15, v14, v16, v17, false, i < (qr_c_end-1)); + ++rot_pos; + } + BLOCK_COMMENT("QUAD END"); + + //-------------------------------------------------------------------------------- + // Compute the updated hash value H' + // H' = H + {h',g',...,b',a'} + // = {h,g,...,b,a} + {h',g',...,b',a'} + // = {h+h',g+g',...,b+b',a+a'} + + // H' = H+{a',b',c',...,h'} + __ vadd_vv(v16, v26, v16); + __ vadd_vv(v17, v27, v17); + + if (multi_block) { + __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); + __ ble(ofs, limit, multi_block_loop); + __ mv(c_rarg0, ofs); // return ofs + } + + // Store H[0..8] = {a,b,c,d,e,f,g,h} from + // v16 = {f,e,b,a} + // v17 = {h,g,d,c} + __ vid_v(v30); // v30 = {3,2,1,0} + __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} + __ vrgather_vv(v26, v16, v30); // v26 = {f,e,b,a} + __ vrgather_vv(v27, v17, v30); // v27 = {h,g,d,c} + __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} + // Copy elements [3..2] of v26 ({f,e}) into elements [1..0] of v17. + __ vslidedown_vi(v17, v26, 2); // v17 = {_,_,f,e} + // Merge elements [3..2] of v27 ({g,h}) into elements [3..2] of v17 + __ vmerge_vvm(v17, v27, v17); // v17 = {h,g,f,e} + // Copy elements [1..0] of v27 ({c,d}) into elements [3..2] of v16. + __ vslideup_vi(v16, v27, 2); // v16 = {d,c,_,_} + // Merge elements [1..0] of v26 ({a,b}) into elements [1..0] of v16 + __ vmerge_vvm(v16, v16, v26); // v16 = {d,c,b,a} + + // Save the hash + __ vseXX_v(v17, state); + __ addi(state, state, -const_add); + __ vseXX_v(v16, state); + + __ leave(); + __ ret(); + + return start; + } + }; +#undef __ +#define __ masm-> + +#endif // COMPILER2 // Continuation point for throwing of implicit exceptions that are // not handled in the current activation. 
Fabricates an exception @@ -5551,12 +5037,28 @@ class StubGenerator: public StubCodeGenerator { #endif // COMPILER2 if (UseSHA256Intrinsics) { - StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); - StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha256_implCompress"); + StubRoutines::_sha256_implCompress = g.generate_sha256_implCompress(false); + } + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha256_implCompressMB"); + StubRoutines::_sha256_implCompressMB = g.generate_sha256_implCompress(true); + } } if (UseSHA512Intrinsics) { - StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); - StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha512_implCompress"); + StubRoutines::_sha512_implCompress = g.generate_sha512_implCompress(false); + } + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha512_implCompressMB"); + StubRoutines::_sha512_implCompressMB = g.generate_sha512_implCompress(true); + } } generate_compare_long_strings();
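// Standalone sketch of the compile-time element-width dispatch that lets the
// shared SHA-2 body be written once: a helper templated on the element width
// picks the e32 or e64 instruction form, and the public
// generate_sha256_implCompress / generate_sha512_implCompress entry points
// become thin wrappers. ToyAssembler, Sew and sha2_body are illustrative
// stand-ins, not the HotSpot Assembler/MacroAssembler API.
#include <cstdio>

enum class Sew { e32, e64 };   // stand-in for Assembler::SEW

struct ToyAssembler {
  void vle32_v(int vd) { std::printf("vle32.v v%d\n", vd); }
  void vle64_v(int vd) { std::printf("vle64.v v%d\n", vd); }

  // One templated helper, resolved at compile time, so exactly one of the
  // two instruction forms is emitted per instantiation.
  template <Sew T>
  void vleXX_v(int vd) {
    if (T == Sew::e32) vle32_v(vd); else vle64_v(vd);
  }

  // Shared body, parameterized by element width.
  template <Sew T>
  void sha2_body() {
    vleXX_v<T>(16);            // loads 4 x 32-bit or 4 x 64-bit state words
  }
};

int main() {
  ToyAssembler masm;
  masm.sha2_body<Sew::e32>();  // the instantiation a SHA-256 wrapper would pick
  masm.sha2_body<Sew::e64>();  // the instantiation a SHA-512 wrapper would pick
  return 0;
}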