From d1e8aa22d34bfa55a02620f4001fc23b3d9041e8 Mon Sep 17 00:00:00 2001 From: Robbin Ehn Date: Wed, 22 Nov 2023 15:31:23 +0100 Subject: [PATCH] Share code --- src/hotspot/cpu/riscv/stubGenerator_riscv.cpp | 1044 +++++------------ 1 file changed, 273 insertions(+), 771 deletions(-) diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp index 6827448fe8c4c..7d0b5e42e2179 100644 --- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp @@ -3659,19 +3659,20 @@ class StubGenerator: public StubCodeGenerator { return entry; } }; -#endif // COMPILER2 - - // Arguments: - // - // Inputs: - // c_rarg0 - byte[] source+offset - // c_rarg1 - int[] SHA.state - // c_rarg2 - int offset - // c_rarg3 - int limit - // - address generate_sha256_implCompress(bool multi_block, const char *name) { - static const uint32_t round_consts[64] = { +#undef __ +#define __ this-> + class Sha2Generator : public MacroAssembler { + public: + Sha2Generator(MacroAssembler* masm) : MacroAssembler(masm->code()) {} + address generate_sha256_implCompress(bool multi_block) { + return generate_sha2_implCompress(multi_block); + } + address generate_sha512_implCompress(bool multi_block) { + return generate_sha2_implCompress(multi_block); + } + private: + const uint32_t round_consts_256[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -3689,424 +3690,7 @@ class StubGenerator: public StubCodeGenerator { 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", name); - address start = __ pc(); - - Register buf = c_rarg0; - Register state = c_rarg1; - Register ofs = c_rarg2; - Register limit = c_rarg3; - Register consts = t0; - - Label multi_block_loop; - - __ enter(); - - // Register use in this function: - // - // VECTORS - // v10 - v13 (512-bits / 4*128 bits / 4*4*32 bits), hold the message - // schedule words (Wt). They start with the message block - // content (W0 to W15), then further words in the message - // schedule generated via vsha2ms from previous Wt. - // Initially: - // v10 = W[ 3:0] = { W3, W2, W1, W0} - // v11 = W[ 7:4] = { W7, W6, W5, W4} - // v12 = W[ 11:8] = {W11, W10, W9, W8} - // v13 = W[15:12] = {W15, W14, W13, W12} - // - // v16 - v17 hold the working state variables (a, b, ..., h) - // v16 = {a[t],b[t],e[t],f[t]} - // v17 = {c[t],d[t],g[t],h[t]} - // Initially: - // v16 = {H5i-1, H4i-1, H1i-1 , H0i-1} - // v17 = {H7i-i, H6i-1, H3i-1 , H2i-1} - // - // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. - // - // v14 = temporary, Wt+Kt - // v15 = temporary, Kt - // - // v18/v19 = temporaries, in the epilogue, to re-arrange - // and byte-swap v16/v17 - // - // v26/v27 = hold the initial values of the hash, byte-swapped. - // - // v30/v31 = used to generate masks, vrgather indices. - // - // During most of the function the vector state is configured so that each - // vector is interpreted as containing four 32 bits (e32) elements (128 bits). - - // Set vectors as 4 * 32 bits - // - // e32: vector of 32b/4B elements - // m1: LMUL=1 - // ta: tail agnostic (don't care about those lanes) - // ma: mask agnostic (don't care about those lanes) - // x0 is not written, we known the number of vector elements, 8. 
- __ vsetivli(x0, 4, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta); - - // Load H[0..8] to produce - // v16 = {a,b,e,f} - // v17 = {c,d,g,h} - __ vle32_v(v16, state); // v16 = {d,c,b,a} - __ addi(state, state, 16); - __ vle32_v(v17, state); // v17 = {h,g,f,e} - - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {a,b,c,d} - __ vrgather_vv(v27, v17, v30); // v27 = {e,f,g,h} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({d,c}) into elements [3..2] of v17. - __ vslideup_vi(v17, v26, 2); // v17 = {c,d,_,_} - // Merge elements [1..0] of v27 ({g,h}) into elements [1..0] of v17 - __ vmerge_vvm(v17, v17, v27); // v17 = {c,d,g,h} - // Copy elements [1..0] of v27 ({f,e}) into elements [1..0] of v16. - __ vslidedown_vi(v16, v27, 2); // v16 = {_,_,e,f} - // Merge elements [3..2] of v26 ({a,b}) into elements [3..2] of v16 - __ vmerge_vvm(v16, v26, v16); // v16 = {a,b,e,f} - - __ bind(multi_block_loop); - - // Capture the initial H values in v26 and v27 to allow for computing - // the resulting H', since H' = H+{a',b',c',...,h'}. - __ vmv_v_v(v26, v16); - __ vmv_v_v(v27, v17); - - // Load the 512-bits of the message block in v10-v13 and perform - // an endian swap on each 4 bytes element. - // - // If Zvkb is not implemented, one can use vrgather with the right index - // sequence. It requires loading in separate registers since the destination - // of vrgather cannot overlap the source. - // // We generate the lane (byte) index sequence - // // v24 = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] - // // gives us "N ^ 3" as a nice formula to generate - // // this sequence. 'vid' gives us the N. - // // - // // We switch the vector type to SEW=8 temporarily. - // vsetivli x0, 16, e8, m1, ta, ma - // vid.v v24 - // vxor.vi v24, v24, 0x3 - // // Byteswap the bytes in each word of the text. - // vrgather.vv v10, v20, v24 - // vrgather.vv v11, v21, v24 - // vrgather.vv v12, v22, v24 - // vrgather.vv v13, v23, v24 - // // Switch back to SEW=32 - // vsetivli x0, 4, e32, m1, ta, ma - __ vle32_v(v10, buf); - __ vrev8_v(v10, v10); - __ addi(buf, buf, 16); - __ vle32_v(v11, buf); - __ vrev8_v(v11, v11); - __ addi(buf, buf, 16); - __ vle32_v(v12, buf); - __ vrev8_v(v12, v12); - __ addi(buf, buf, 16); - __ vle32_v(v13, buf); - __ vrev8_v(v13, v13); - __ addi(buf, buf, 16); - - // Set v0 up for the vmerge that replaces the first word (idx==0) - __ vid_v(v0); - __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) - - __ la(consts, ExternalAddress((address)round_consts)); - - // Overview of the logic in each "quad round". - // - // The code below repeats 16 times the logic implementing four rounds - // of the SHA-256 core loop as documented by NIST. 16 "quad rounds" - // to implementing the 64 single rounds. - // - // // Load four word (u32) constants (K[t+3], K[t+2], K[t+1], K[t+0]) - // // Output: - // // v15 = {K[t+3], K[t+2], K[t+1], K[t+0]} - // vl1re32.v v15, ofs - // - // // Increment word contant address by stride (16 bytes, 4*4B, 128b) - // addi ofs, ofs, 16 - // - // // Add constants to message schedule words: - // // Input - // // v15 = {K[t+3], K[t+2], K[t+1], K[t+0]} - // // v10 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; - // // Output - // // v14 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} - // vadd.vv v14, v15, v10 - // - // // 2 rounds of working variables updates. 
- // // v17[t+4] <- v17[t], v16[t], v14[t] - // // Input: - // // v17 = {c[t],d[t],g[t],h[t]} " = v17[t] " - // // v16 = {a[t],b[t],e[t],f[t]} - // // v14 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} - // // Output: - // // v17 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = v16[t+2] " - // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = v17[t+4] " - // vsha2cl.vv v17, v16, v14 - // - // // 2 rounds of working variables updates. - // // v16[t+4] <- v16[t], v16[t+2], v14[t] - // // Input - // // v16 = {a[t],b[t],e[t],f[t]} " = v16[t] " - // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = v17[t+2] " - // // v17 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = v16[t+2] " - // // v14 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} - // // Output: - // // v16 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = v16[t+4] " - // vsha2ch.vv v16, v17, v14 - // - // // Combine 2QW into 1QW - // // - // // To generate the next 4 words, "new_v10"/"v14" from v10-v13, vsha2ms needs - // // v10[0..3], v11[0], v12[1..3], v13[0, 2..3] - // // and it can only take 3 vectors as inputs. Hence we need to combine - // // v11[0] and v12[1..3] in a single vector. - // // - // // vmerge Vt4, Vt1, Vt2, V0 - // // Input - // // V0 = mask // first word from v12, 1..3 words from v11 - // // V12 = {Wt-8, Wt-7, Wt-6, Wt-5} - // // V11 = {Wt-12, Wt-11, Wt-10, Wt-9} - // // Output - // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5} - // vmerge.vvm v14, v12, v11, v0 - // - // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) - // // Input - // // V10 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] - // // V13 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] - // // V14 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] - // // Output (next four message schedule words) - // // v10 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] - // vsha2ms.vv v10, v14, v13 - // - // BEFORE - // v10 - v13 hold the message schedule words (initially the block words) - // v10 = W[ 3: 0] "oldest" - // v11 = W[ 7: 4] - // v12 = W[11: 8] - // v13 = W[15:12] "newest" - // - // vt6 - vt7 hold the working state variables - // v16 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} - // v17 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} - // - // AFTER - // v10 - v13 hold the message schedule words (initially the block words) - // v11 = W[ 7: 4] "oldest" - // v12 = W[11: 8] - // v13 = W[15:12] - // v10 = W[19:16] "newest" - // - // v16 and v17 hold the working state variables - // v16 = {a[t+4],b[t+4],e[t+4],f[t+4]} - // v17 = {c[t+4],d[t+4],g[t+4],h[t+4]} - // - // The group of vectors v10,v11,v12,v13 is "rotated" by one in each quad-round, - // hence the uses of those vectors rotate in each round, and we get back to the - // initial configuration every 4 quad-rounds. We could avoid those changes at - // the cost of moving those vectors at the end of each quad-rounds. 
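// Scalar reference for the message-schedule step that each vsha2ms.vv in the
// quad-rounds below performs, following the FIPS 180-4 SHA-256 recurrence.
// This is an illustrative sketch only: sha256_ms_quad and its operand layout
// are made-up names, not HotSpot or stub code.
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint32_t sig0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
static inline uint32_t sig1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

// Given the previous 16 schedule words W[t-16..t-1] in w[0..15] (w[0] oldest),
// compute the next four words W[t..t+3] -- the "Generate W[19:16]" etc. steps.
static void sha256_ms_quad(const uint32_t w[16], uint32_t out[4]) {
  uint32_t tmp[20];
  for (int i = 0; i < 16; i++) tmp[i] = w[i];
  for (int i = 16; i < 20; i++) {
    // Note that tmp[18] and tmp[19] already depend on the freshly
    // generated tmp[16]/tmp[17], which is why the vector form needs
    // all of W[15:12], W[11:9,4] and W[3:0] as inputs.
    tmp[i] = sig1(tmp[i - 2]) + tmp[i - 7] + sig0(tmp[i - 15]) + tmp[i - 16];
  }
  for (int i = 0; i < 4; i++) out[i] = tmp[16 + i];
}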
- - //-------------------------------------------------------------------------------- - // Quad-round 0 (+0, v10->v11->v12->v13) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); // Generate W[19:16] - //-------------------------------------------------------------------------------- - // Quad-round 1 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); // Generate W[23:20] - //-------------------------------------------------------------------------------- - // Quad-round 2 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); // Generate W[27:24] - //-------------------------------------------------------------------------------- - // Quad-round 3 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); // Generate W[31:28] - - //-------------------------------------------------------------------------------- - // Quad-round 4 (+0, v10->v11->v12->v13) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); // Generate W[35:32] - //-------------------------------------------------------------------------------- - // Quad-round 5 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); // Generate W[39:36] - //-------------------------------------------------------------------------------- - // Quad-round 6 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); // Generate W[43:40] - //-------------------------------------------------------------------------------- - // Quad-round 7 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); // Generate W[47:44] - - //-------------------------------------------------------------------------------- - // Quad-round 8 (+0, v10->v11->v12->v13) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); // Generate W[51:48] - //-------------------------------------------------------------------------------- - // Quad-round 9 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ 
vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); // Generate W[55:52] - //-------------------------------------------------------------------------------- - // Quad-round 10 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); // Generate W[59:56] - //-------------------------------------------------------------------------------- - // Quad-round 11 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); // Generate W[63:60] - - //-------------------------------------------------------------------------------- - // Quad-round 12 (+0, v10->v11->v12->v13) - // Note that we stop generating new message schedule words (Wt, v10-13) - // as we already generated all the words we end up consuming (i.e., W[63:60]). - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - //-------------------------------------------------------------------------------- - // Quad-round 13 (+1, v11->v12->v13->v10) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - //-------------------------------------------------------------------------------- - // Quad-round 14 (+2, v12->v13->v10->v11) - __ vl1re32_v(v15, consts); - __ addi(consts, consts, 16); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - //-------------------------------------------------------------------------------- - // Quad-round 15 (+3, v13->v10->v11->v12) - __ vl1re32_v(v15, consts); - // No consts increment needed - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - - //-------------------------------------------------------------------------------- - // Compute the updated hash value H' - // H' = H + {h',g',...,b',a'} - // = {h,g,...,b,a} + {h',g',...,b',a'} - // = {h+h',g+g',...,b+b',a+a'} - - __ vadd_vv(v16, v26, v16); - __ vadd_vv(v17, v27, v17); - - if (multi_block) { - __ add(ofs, ofs, 64); - __ ble(ofs, limit, multi_block_loop); - __ mv(c_rarg0, ofs); // return ofs - } - - // Store H[0..8] = {a,b,c,d,e,f,g,h} from - // v16 = {f,e,b,a} - // v17 = {h,g,d,c} - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {f,e,b,a} - __ vrgather_vv(v27, v17, v30); // v27 = {h,g,d,c} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({f,e}) into elements [1..0] of v17. - __ vslidedown_vi(v17, v26, 2); // v17 = {_,_,f,e} - // Merge elements [3..2] of v27 ({g,h}) into elements [3..2] of v17 - __ vmerge_vvm(v17, v27, v17); // v17 = {h,g,f,e} - // Copy elements [1..0] of v27 ({c,d}) into elements [3..2] of v16. 
- __ vslideup_vi(v16, v27, 2); // v16 = {d,c,_,_} - // Merge elements [1..0] of v26 ({a,b}) into elements [1..0] of v16 - __ vmerge_vvm(v16, v16, v26); // v16 = {d,c,b,a} - - // Save the hash - __ vse32_v(v17, state); - __ addi(state, state, -16); - __ vse32_v(v16, state); - - __ leave(); - __ ret(); - - return start; - } - - // Arguments: - // - // Inputs: - // c_rarg0 - byte[] source+offset - // c_rarg1 - int[] SHA.state - // c_rarg2 - int offset - // c_rarg3 - int limit - // - address generate_sha512_implCompress(bool multi_block, const char *name) { - static const uint64_t round_consts[80] = { + const uint64_t round_consts_512[80] = { 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, @@ -4135,132 +3719,38 @@ class StubGenerator: public StubCodeGenerator { 0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al, 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l }; - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", name); - address start = __ pc(); - - Register buf = c_rarg0; - Register state = c_rarg1; - Register ofs = c_rarg2; - Register limit = c_rarg3; - Register consts = t0; - - Label multi_block_loop; - __ enter(); + template + void vl1reXX_v(VectorRegister vr, Register sr) { + if (T == Assembler::e32) __ vl1re32_v(vr, sr); + else __ vl1re64_v(vr, sr); + } - // Register use in this function: - // - // VECTORS - // v10 - v13 (1024-bits / 4*256 bits / 4*4*64 bits), hold the message - // schedule words (Wt). They start with the message block - // content (W0 to W15), then further words in the message - // schedule generated via vsha2ms from previous Wt. - // Initially: - // v10 = W[ 3:0] = { W3, W2, W1, W0} - // v11 = W[ 7:4] = { W7, W6, W5, W4} - // v12 = W[ 11:8] = {W11, W10, W9, W8} - // v13 = W[15:12] = {W15, W14, W13, W12} - // - // v16 - v17 hold the working state variables (a, b, ..., h) - // v16 = {f[t],e[t],b[t],a[t]} - // v17 = {h[t],g[t],d[t],c[t]} - // Initially: - // v16 = {H5i-1, H4i-1, H1i-1 , H0i-1} - // v17 = {H7i-i, H6i-1, H3i-1 , H2i-1} - // - // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. - // - // v14 = temporary, Wt+Kt - // v15 = temporary, Kt - // - // v18/v19 = temporaries, in the epilogue, to re-arrange - // and byte-swap v16/v17 - // - // v26/v27 = hold the initial values of the hash, byte-swapped. - // - // v30/v31 = used to generate masks, vrgather indices. - // - // During most of the function the vector state is configured so that each - // vector is interpreted as containing four 64 bits (e64) elements (256 bits). + template + void vleXX_v(VectorRegister vr, Register sr) { + if (T == Assembler::e32) __ vle32_v(vr, sr); + else __ vle64_v(vr, sr); + } - // Set vectors as 4 * 64 - // - // e64: vector of 64b/8B elements - // m1: LMUL=1 - // ta: tail agnostic (don't care about those lanes) - // ma: mask agnostic (don't care about those lanes) - // x0 is not written, we known the number of vector elements, 2. 
- __ vsetivli(x0, 4, Assembler::e64, Assembler::m1, Assembler::ma, Assembler::ta); - - // Load H[0..8] to produce - // v16 = {a,b,e,f} - // v17 = {c,d,g,h} - __ vle64_v(v16, state); // v16 = {d,c,b,a} - __ addi(state, state, 32); - __ vle64_v(v17, state); // v17 = {h,g,f,e} - - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {a,b,c,d} - __ vrgather_vv(v27, v17, v30); // v27 = {e,f,g,h} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({d,c}) into elements [3..2] of v17. - __ vslideup_vi(v17, v26, 2); // v17 = {c,d,_,_} - // Merge elements [1..0] of v27 ({g,h}) into elements [1..0] of v17 - __ vmerge_vvm(v17, v17, v27); // v17 = {c,d,g,h} - // Copy elements [1..0] of v27 ({f,e}) into elements [1..0] of v16. - __ vslidedown_vi(v16, v27, 2); // v16 = {_,_,e,f} - // Merge elements [3..2] of v26 ({a,b}) into elements [3..2] of v16 - __ vmerge_vvm(v16, v26, v16); // v16 = {a,b,e,f} - - __ bind(multi_block_loop); - - // Capture the initial H values in v26 and v27 to allow for computing - // the resulting H', since H' = H+{a',b',c',...,h'}. - __ vmv_v_v(v26, v16); - __ vmv_v_v(v27, v17); - - // Load the 1024-bits of the message block in v10-v13 and perform - // an endian swap on each 8 bytes element. - // - // If Zvkb is not implemented, similar to SHA-256, one can use vrgather - // with an index sequence to byte-swap. - // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] - // gives us "N ^ 3" as a nice formula to generate - // this sequence. 'vid' gives us the N. - __ vle64_v(v10, buf); - __ vrev8_v(v10, v10); - __ add(buf, buf, 32); - __ vle64_v(v11, buf); - __ vrev8_v(v11, v11); - __ add(buf, buf, 32); - __ vle64_v(v12, buf); - __ vrev8_v(v12, v12); - __ add(buf, buf, 32); - __ vle64_v(v13, buf); - __ vrev8_v(v13, v13); - __ add(buf, buf, 32); - - // Set v0 up for the vmerge that replaces the first word (idx==0) - __ vid_v(v0); - __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) - - __ la(consts, ExternalAddress((address)round_consts)); + template + void vseXX_v(VectorRegister vr, Register sr) { + if (T == Assembler::e32) __ vse32_v(vr, sr); + else __ vse64_v(vr, sr); + } // Overview of the logic in each "quad round". // - // The code below repeats 20 times the logic implementing four rounds - // of the SHA-512 core loop as documented by NIST. 20 "quad rounds" - // to implementing the 80 single rounds. + // The code below repeats 16/20 times the logic implementing four rounds + // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" + // to implementing the 64/80 single rounds. // - // // Load four word (u64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) + // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) // // Output: // // v15 = {K[t+3], K[t+2], K[t+1], K[t+0]} - // vl1re32.v v15, (a2) + // vl1reXX.v v15, ofs // - // // Increment word contant address by stride (32 bytes, 4*8B, 256b) - // addi consts, consts, 32 + // // Increment word contant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) + // addi ofs, ofs, 16/32 // // // Add constants to message schedule words: // // Input @@ -4343,232 +3833,228 @@ class StubGenerator: public StubCodeGenerator { // hence the uses of those vectors rotate in each round, and we get back to the // initial configuration every 4 quad-rounds. We could avoid those changes at // the cost of moving those vectors at the end of each quad-rounds. 
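// Scalar reference for the working-variable updates done by the
// vsha2cl.vv / vsha2ch.vv pair: each instruction advances the state by two
// FIPS 180-4 rounds, so one quad-round advances it by four. A minimal sketch
// for SHA-256 only; sha256_rounds4 and the wk[] operand (W[t..t+3]+K[t..t+3])
// are illustrative names, not part of the generated stub.
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

// st[8] = {a,b,c,d,e,f,g,h}; wk[4] = {W[t]+K[t], ..., W[t+3]+K[t+3]}
// (the vadd.vv of v15 and the current Wt group). Advances the working
// state by four rounds, as one quad-round does.
static void sha256_rounds4(uint32_t st[8], const uint32_t wk[4]) {
  for (int i = 0; i < 4; i++) {
    uint32_t t1 = st[7] + Sigma1(st[4]) + Ch(st[4], st[5], st[6]) + wk[i];
    uint32_t t2 = Sigma0(st[0]) + Maj(st[0], st[1], st[2]);
    st[7] = st[6]; st[6] = st[5]; st[5] = st[4]; st[4] = st[3] + t1;
    st[3] = st[2]; st[2] = st[1]; st[1] = st[0]; st[0] = t1 + t2;
  }
}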
+ template + void sha2_quad_round(VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, + Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister vtemp3, VectorRegister vtemp4, + bool gen_words = true, bool step_const = true) { + __ vl1reXX_v(vtemp, scalarconst); + if (step_const) { + __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32); + } + __ vadd_vv(vtemp2, vtemp, rot1); + __ vsha2cl_vv(vtemp4, vtemp3, vtemp2); + __ vsha2ch_vv(vtemp3, vtemp4, vtemp2); + if ((vset_sew == Assembler::e64 && step_const) || gen_words) { + __ vmerge_vvm(vtemp2, rot3, rot2); + } + if (gen_words) { + __ vsha2ms_vv(rot1, vtemp2, rot4); + } + } - //-------------------------------------------------------------------------------- - // Quad-round 0 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 1 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 2 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 3 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- - // Quad-round 4 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 5 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 6 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 7 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- 
- // Quad-round 8 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 9 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 10 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 11 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- - // Quad-round 12 (+0, v10->v11->v12->v13) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - __ vsha2ms_vv(v10, v14, v13); - //-------------------------------------------------------------------------------- - // Quad-round 13 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - __ vsha2ms_vv(v11, v14, v10); - //-------------------------------------------------------------------------------- - // Quad-round 14 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - __ vsha2ms_vv(v12, v14, v11); - //-------------------------------------------------------------------------------- - // Quad-round 15 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v11, v10); - __ vsha2ms_vv(v13, v14, v12); - - //-------------------------------------------------------------------------------- - // Quad-round 16 (+0, v10->v11->v12->v13) - // Note that we stop generating new message schedule words (Wt, v10-13) - // as we already generated all the words we end up consuming (i.e., W[79:76]). 
- __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v10); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v12, v11); - //-------------------------------------------------------------------------------- - // Quad-round 17 (+1, v11->v12->v13->v10) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v11); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v13, v12); - //-------------------------------------------------------------------------------- - // Quad-round 18 (+2, v12->v13->v10->v11) - __ vl1re64_v(v15, consts); - __ addi(consts, consts, 32); - __ vadd_vv(v14, v15, v12); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - __ vmerge_vvm(v14, v10, v13); - //-------------------------------------------------------------------------------- - // Quad-round 19 (+3, v13->v10->v11->v12) - __ vl1re64_v(v15, consts); - // No consts increment needed. - __ vadd_vv(v14, v15, v13); - __ vsha2cl_vv(v17, v16, v14); - __ vsha2ch_vv(v16, v17, v14); - - //-------------------------------------------------------------------------------- - // Compute the updated hash value H' - // H' = H + {h',g',...,b',a'} - // = {h,g,...,b,a} + {h',g',...,b',a'} - // = {h+h',g+g',...,b+b',a+a'} - - // H' = H+{a',b',c',...,h'} - __ vadd_vv(v16, v26, v16); - __ vadd_vv(v17, v27, v17); + // Arguments: + // + // Inputs: + // c_rarg0 - byte[] source+offset + // c_rarg1 - int[] SHA.state + // c_rarg2 - int offset + // c_rarg3 - int limit + // + template + address generate_sha2_implCompress(bool multi_block) { + constexpr int const_add = vset_sew == Assembler::e32 ? 16 : 32; - if (multi_block) { - __ add(ofs, ofs, 128); - __ ble(ofs, limit, multi_block_loop); - __ mv(c_rarg0, ofs); // return ofs - } + __ align(CodeEntryAlignment); + address start = __ pc(); - // Store H[0..8] = {a,b,c,d,e,f,g,h} from - // v16 = {f,e,b,a} - // v17 = {h,g,d,c} - __ vid_v(v30); // v30 = {3,2,1,0} - __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} - __ vrgather_vv(v26, v16, v30); // v26 = {f,e,b,a} - __ vrgather_vv(v27, v17, v30); // v27 = {h,g,d,c} - __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} - // Copy elements [3..2] of v26 ({f,e}) into elements [1..0] of v17. - __ vslidedown_vi(v17, v26, 2); // v17 = {_,_,f,e} - // Merge elements [3..2] of v27 ({g,h}) into elements [3..2] of v17 - __ vmerge_vvm(v17, v27, v17); // v17 = {h,g,f,e} - // Copy elements [1..0] of v27 ({c,d}) into elements [3..2] of v16. - __ vslideup_vi(v16, v27, 2); // v16 = {d,c,_,_} - // Merge elements [1..0] of v26 ({a,b}) into elements [1..0] of v16 - __ vmerge_vvm(v16, v16, v26); // v16 = {d,c,b,a} - - // Save the hash - __ vse64_v(v17, state); - __ addi(state, state, -32); - __ vse64_v(v16, state); + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + Register consts = t2; - __ leave(); - __ ret(); + Label multi_block_loop; - return start; - } + __ enter(); + // Register use in this function: + // + // VECTORS + // v10 - v13 (512/1024-bits / 4*128/256 bits / 4*4*32/65 bits), hold the message + // schedule words (Wt). They start with the message block + // content (W0 to W15), then further words in the message + // schedule generated via vsha2ms from previous Wt. 
+ // Initially: + // v10 = W[ 3:0] = { W3, W2, W1, W0} + // v11 = W[ 7:4] = { W7, W6, W5, W4} + // v12 = W[ 11:8] = {W11, W10, W9, W8} + // v13 = W[15:12] = {W15, W14, W13, W12} + // + // v16 - v17 hold the working state variables (a, b, ..., h) + // v16 = {f[t],e[t],b[t],a[t]} + // v17 = {h[t],g[t],d[t],c[t]} + // Initially: + // v16 = {H5i-1, H4i-1, H1i-1 , H0i-1} + // v17 = {H7i-i, H6i-1, H3i-1 , H2i-1} + // + // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. + // + // v14 = temporary, Wt+Kt + // v15 = temporary, Kt + // + // v18/v19 = temporaries, in the epilogue, to re-arrange + // and byte-swap v16/v17 + // + // v26/v27 = hold the initial values of the hash, byte-swapped. + // + // v30/v31 = used to generate masks, vrgather indices. + // + // During most of the function the vector state is configured so that each + // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits). + + // Set vectors as 4 * 32/64 bits + // + // e32/e64: vector of 32b/64b/4B/8B elements + // m1: LMUL=1 + // ta: tail agnostic (don't care about those lanes) + // ma: mask agnostic (don't care about those lanes) + // x0 is not written, we known the number of vector elements. + __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); + + // Load H[0..8] to produce + // v16 = {a,b,e,f} + // v17 = {c,d,g,h} + __ vleXX_v(v16, state); // v16 = {d,c,b,a} + __ addi(state, state, const_add); + __ vleXX_v(v17, state); // v17 = {h,g,f,e} + + __ vid_v(v30); // v30 = {3,2,1,0} + __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} + __ vrgather_vv(v26, v16, v30); // v26 = {a,b,c,d} + __ vrgather_vv(v27, v17, v30); // v27 = {e,f,g,h} + __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} + // Copy elements [3..2] of v26 ({d,c}) into elements [3..2] of v17. + __ vslideup_vi(v17, v26, 2); // v17 = {c,d,_,_} + // Merge elements [1..0] of v27 ({g,h}) into elements [1..0] of v17 + __ vmerge_vvm(v17, v17, v27); // v17 = {c,d,g,h} + // Copy elements [1..0] of v27 ({f,e}) into elements [1..0] of v16. + __ vslidedown_vi(v16, v27, 2); // v16 = {_,_,e,f} + // Merge elements [3..2] of v26 ({a,b}) into elements [3..2] of v16 + __ vmerge_vvm(v16, v26, v16); // v16 = {a,b,e,f} + + __ bind(multi_block_loop); + + // Capture the initial H values in v26 and v27 to allow for computing + // the resulting H', since H' = H+{a',b',c',...,h'}. + __ vmv_v_v(v26, v16); + __ vmv_v_v(v27, v17); + + // Load the 512/1024-bits of the message block in v10-v13 and perform + // an endian swap on each 4/8 bytes element. + // + // If Zvkb is not implemented one can use vrgather + // with an index sequence to byte-swap. + // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] + // gives us "N ^ 3" as a nice formula to generate + // this sequence. 'vid' gives us the N. + __ vleXX_v(v10, buf); + __ vrev8_v(v10, v10); + __ addi(buf, buf, const_add); + __ vleXX_v(v11, buf); + __ vrev8_v(v11, v11); + __ addi(buf, buf, const_add); + __ vleXX_v(v12, buf); + __ vrev8_v(v12, v12); + __ addi(buf, buf, const_add); + __ vleXX_v(v13, buf); + __ vrev8_v(v13, v13); + __ addi(buf, buf, const_add); + + // Set v0 up for the vmerge that replaces the first word (idx==0) + __ vid_v(v0); + __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) + + __ la(consts, ExternalAddress(vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512)); + + VectorRegister rotation_regs[] = {v10, v11, v12, v13}; + int rot_pos = 0; + // Quad-round #0 (+0, v10->v11->v12->v13) ... 
#11 (+3, v13->v10->v11->v12) + constexpr int qr_end = vset_sew == Assembler::e32 ? 12 : 16; + for (int i = 0; i < qr_end; i++) { + BLOCK_COMMENT("QUAD ROUND"); + sha2_quad_round + (rotation_regs[(rot_pos + 0) & 0x3], + rotation_regs[(rot_pos + 1) & 0x3], + rotation_regs[(rot_pos + 2) & 0x3], + rotation_regs[(rot_pos + 3) & 0x3], + consts, + v15, v14, v16, v17); + ++rot_pos; + } + // Quad-round #12 (+0, v10->v11->v12->v13) ... #15 (+3, v13->v10->v11->v12) + // Note that we stop generating new message schedule words (Wt, v10-13) + // as we already generated all the words we end up consuming (i.e., W[63:60]). + constexpr int qr_c_end = qr_end + 4; + for (int i = qr_end; i < qr_c_end; i++) { + BLOCK_COMMENT("QUAD ROUND CONSUME"); + sha2_quad_round + (rotation_regs[(rot_pos + 0) & 0x3], + rotation_regs[(rot_pos + 1) & 0x3], + rotation_regs[(rot_pos + 2) & 0x3], + rotation_regs[(rot_pos + 3) & 0x3], + consts, + v15, v14, v16, v17, false, i < (qr_c_end-1)); + ++rot_pos; + } + BLOCK_COMMENT("QUAD END"); + + //-------------------------------------------------------------------------------- + // Compute the updated hash value H' + // H' = H + {h',g',...,b',a'} + // = {h,g,...,b,a} + {h',g',...,b',a'} + // = {h+h',g+g',...,b+b',a+a'} + + // H' = H+{a',b',c',...,h'} + __ vadd_vv(v16, v26, v16); + __ vadd_vv(v17, v27, v17); + + if (multi_block) { + __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); + __ ble(ofs, limit, multi_block_loop); + __ mv(c_rarg0, ofs); // return ofs + } + + // Store H[0..8] = {a,b,c,d,e,f,g,h} from + // v16 = {f,e,b,a} + // v17 = {h,g,d,c} + __ vid_v(v30); // v30 = {3,2,1,0} + __ vxor_vi(v30, v30, 0x3); // v30 = {0,1,2,3} + __ vrgather_vv(v26, v16, v30); // v26 = {f,e,b,a} + __ vrgather_vv(v27, v17, v30); // v27 = {h,g,d,c} + __ vmsgeu_vi(v0, v30, 2); // v0 = {f,f,t,t} + // Copy elements [3..2] of v26 ({f,e}) into elements [1..0] of v17. + __ vslidedown_vi(v17, v26, 2); // v17 = {_,_,f,e} + // Merge elements [3..2] of v27 ({g,h}) into elements [3..2] of v17 + __ vmerge_vvm(v17, v27, v17); // v17 = {h,g,f,e} + // Copy elements [1..0] of v27 ({c,d}) into elements [3..2] of v16. + __ vslideup_vi(v16, v27, 2); // v16 = {d,c,_,_} + // Merge elements [1..0] of v26 ({a,b}) into elements [1..0] of v16 + __ vmerge_vvm(v16, v16, v26); // v16 = {d,c,b,a} + + // Save the hash + __ vseXX_v(v17, state); + __ addi(state, state, -const_add); + __ vseXX_v(v16, state); + + __ leave(); + __ ret(); + + return start; + } + }; +#undef __ +#define __ masm-> + +#endif // COMPILER2 // Continuation point for throwing of implicit exceptions that are // not handled in the current activation. 
Fabricates an exception @@ -5551,12 +5037,28 @@ class StubGenerator: public StubCodeGenerator { #endif // COMPILER2 if (UseSHA256Intrinsics) { - StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); - StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha256_implCompress"); + StubRoutines::_sha256_implCompress = g.generate_sha256_implCompress(false); + } + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha256_implCompressMB"); + StubRoutines::_sha256_implCompressMB = g.generate_sha256_implCompress(true); + } } if (UseSHA512Intrinsics) { - StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); - StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha512_implCompress"); + StubRoutines::_sha512_implCompress = g.generate_sha512_implCompress(false); + } + { + Sha2Generator g(_masm); + StubCodeMark mark(this, "StubRoutines", "sha512_implCompressMB"); + StubRoutines::_sha512_implCompressMB = g.generate_sha512_implCompress(true); + } } generate_compare_long_strings();
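// Standalone sketch of the compile-time element-width dispatch that lets the
// shared SHA-2 body be written once: a helper templated on the element width
// picks the e32 or e64 instruction form, and the public
// generate_sha256_implCompress / generate_sha512_implCompress entry points
// become thin wrappers. ToyAssembler, Sew and sha2_body are illustrative
// stand-ins, not the HotSpot Assembler/MacroAssembler API.
#include <cstdio>

enum class Sew { e32, e64 };   // stand-in for Assembler::SEW

struct ToyAssembler {
  void vle32_v(int vd) { std::printf("vle32.v v%d\n", vd); }
  void vle64_v(int vd) { std::printf("vle64.v v%d\n", vd); }

  // One templated helper, resolved at compile time, so exactly one of the
  // two instruction forms is emitted per instantiation.
  template <Sew T>
  void vleXX_v(int vd) {
    if (T == Sew::e32) vle32_v(vd); else vle64_v(vd);
  }

  // Shared body, parameterized by element width.
  template <Sew T>
  void sha2_body() {
    vleXX_v<T>(16);            // loads 4 x 32-bit or 4 x 64-bit state words
  }
};

int main() {
  ToyAssembler masm;
  masm.sha2_body<Sew::e32>();  // the instantiation a SHA-256 wrapper would pick
  masm.sha2_body<Sew::e64>();  // the instantiation a SHA-512 wrapper would pick
  return 0;
}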