diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc index 32b7a0d1..6e2496eb 100644 --- a/src/aarch64/assembler-aarch64.cc +++ b/src/aarch64/assembler-aarch64.cc @@ -3852,6 +3852,15 @@ void Assembler::udot(const VRegister& vd, Emit(VFormat(vd) | NEON_UDOT | Rm(vm) | Rn(vn) | Rd(vd)); } +void Assembler::usdot(const VRegister& vd, + const VRegister& vn, + const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kI8MM)); + VIXL_ASSERT(AreSameFormat(vn, vm)); + VIXL_ASSERT((vd.Is2S() && vn.Is8B()) || (vd.Is4S() && vn.Is16B())); + + Emit(VFormat(vd) | 0x0e809c00 | Rm(vm) | Rn(vn) | Rd(vd)); +} void Assembler::faddp(const VRegister& vd, const VRegister& vn) { VIXL_ASSERT(CPUHas(CPUFeatures::kFP, CPUFeatures::kNEON)); @@ -4166,6 +4175,32 @@ void Assembler::udot(const VRegister& vd, ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) | Rn(vn) | Rd(vd)); } +void Assembler::sudot(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int vm_index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kI8MM)); + VIXL_ASSERT((vd.Is2S() && vn.Is8B() && vm.Is1S4B()) || + (vd.Is4S() && vn.Is16B() && vm.Is1S4B())); + int q = vd.Is4S() ? (1U << NEONQ_offset) : 0; + int index_num_bits = 2; + Emit(q | 0x0f00f000 | ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) | Rn(vn) | + Rd(vd)); +} + + +void Assembler::usdot(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int vm_index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kI8MM)); + VIXL_ASSERT((vd.Is2S() && vn.Is8B() && vm.Is1S4B()) || + (vd.Is4S() && vn.Is16B() && vm.Is1S4B())); + int q = vd.Is4S() ? (1U << NEONQ_offset) : 0; + int index_num_bits = 2; + Emit(q | 0x0f80f000 | ImmNEONHLM(vm_index, index_num_bits) | Rm(vm) | Rn(vn) | + Rd(vd)); +} // clang-format off #define NEON_BYELEMENT_LIST(V) \ diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h index 684ee915..af63ae77 100644 --- a/src/aarch64/assembler-aarch64.h +++ b/src/aarch64/assembler-aarch64.h @@ -3367,6 +3367,21 @@ class Assembler : public vixl::internal::AssemblerBase { // Unsigned dot product [Armv8.2]. void udot(const VRegister& vd, const VRegister& vn, const VRegister& vm); + // Dot Product with unsigned and signed integers (vector). + void usdot(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // Dot product with signed and unsigned integers (vector, by element). + void sudot(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int vm_index); + + // Dot product with unsigned and signed integers (vector, by element). + void usdot(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int vm_index); + // Signed saturating rounding doubling multiply subtract returning high half // [Armv8.1]. void sqrdmlsh(const VRegister& vd, const VRegister& vn, const VRegister& vm); diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc index a5f0fdd4..689038d7 100644 --- a/src/aarch64/cpu-features-auditor-aarch64.cc +++ b/src/aarch64/cpu-features-auditor-aarch64.cc @@ -1701,6 +1701,12 @@ void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) { CPUFeatures(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM)}, {"ld1roh_z_p_br_contiguous", CPUFeatures(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM)}, + {"usdot_asimdsame2_d", + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kI8MM)}, + {"sudot_asimdelem_d", + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kI8MM)}, + {"usdot_asimdelem_d", + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kI8MM)}, {"usdot_z_zzz_s", CPUFeatures(CPUFeatures::kSVE, CPUFeatures::kSVEI8MM)}, {"usdot_z_zzzi_s", diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h index 50cd2f4b..2a3c975e 100644 --- a/src/aarch64/decoder-visitor-map-aarch64.h +++ b/src/aarch64/decoder-visitor-map-aarch64.h @@ -2861,12 +2861,9 @@ {"subg_64_addsub_immtags", &VISITORCLASS::VisitUnimplemented}, \ {"subps_64s_dp_2src", &VISITORCLASS::VisitUnimplemented}, \ {"subp_64s_dp_2src", &VISITORCLASS::VisitUnimplemented}, \ - {"sudot_asimdelem_d", &VISITORCLASS::VisitUnimplemented}, \ {"tcancel_ex_exception", &VISITORCLASS::VisitUnimplemented}, \ {"tstart_br_systemresult", &VISITORCLASS::VisitUnimplemented}, \ {"ttest_br_systemresult", &VISITORCLASS::VisitUnimplemented}, \ - {"usdot_asimdelem_d", &VISITORCLASS::VisitUnimplemented}, \ - {"usdot_asimdsame2_d", &VISITORCLASS::VisitUnimplemented}, \ {"wfet_only_systeminstrswithreg", &VISITORCLASS::VisitUnimplemented}, \ {"wfit_only_systeminstrswithreg", &VISITORCLASS::VisitUnimplemented}, \ {"xar_vvv2_crypto3_imm6", &VISITORCLASS::VisitUnimplemented}, \ diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc index 90a67081..e8af6246 100644 --- a/src/aarch64/disasm-aarch64.cc +++ b/src/aarch64/disasm-aarch64.cc @@ -46,6 +46,8 @@ Disassembler::FormToVisitorFnMap Disassembler::form_to_visitor_ = { {"sqdmlsl_asimdelem_l", &Disassembler::DisassembleNEONMulByElementLong}, {"sdot_asimdelem_d", &Disassembler::DisassembleNEONDotProdByElement}, {"udot_asimdelem_d", &Disassembler::DisassembleNEONDotProdByElement}, + {"usdot_asimdelem_d", &Disassembler::DisassembleNEONDotProdByElement}, + {"sudot_asimdelem_d", &Disassembler::DisassembleNEONDotProdByElement}, {"fmlal2_asimdelem_lh", &Disassembler::DisassembleNEONFPMulByElementLong}, {"fmlal_asimdelem_lh", &Disassembler::DisassembleNEONFPMulByElementLong}, {"fmlsl2_asimdelem_lh", &Disassembler::DisassembleNEONFPMulByElementLong}, @@ -376,6 +378,7 @@ Disassembler::FormToVisitorFnMap Disassembler::form_to_visitor_ = { &Disassembler::VisitSVELoadAndBroadcastQOWord_ScalarPlusScalar}, {"usdot_z_zzzi_s", &Disassembler::VisitSVEMulIndex}, {"sudot_z_zzzi_s", &Disassembler::VisitSVEMulIndex}, + {"usdot_asimdsame2_d", &Disassembler::VisitNEON3SameExtra}, }; Disassembler::Disassembler() { @@ -3309,40 +3312,32 @@ void Disassembler::VisitNEON3SameFP16(const Instruction *instr) { void Disassembler::VisitNEON3SameExtra(const Instruction *instr) { static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}}; - const char *mnemonic = "unallocated"; - const char *form = "(NEON3SameExtra)"; + const char *mnemonic = mnemonic_.c_str(); + const char *form = "'Vd.%s, 'Vn.%s, 'Vm.%s"; + const char *suffix = NULL; NEONFormatDecoder nfd(instr); - if (instr->Mask(NEON3SameExtraFCMLAMask) == NEON_FCMLA) { - mnemonic = "fcmla"; - form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNM"; - } else if (instr->Mask(NEON3SameExtraFCADDMask) == NEON_FCADD) { - mnemonic = "fcadd"; - form = "'Vd.%s, 'Vn.%s, 'Vm.%s, 'IVFCNA"; - } else { - form = "'Vd.%s, 'Vn.%s, 'Vm.%s"; - switch (instr->Mask(NEON3SameExtraMask)) { - case NEON_SDOT: - mnemonic = "sdot"; - nfd.SetFormatMap(1, &map_usdot); - nfd.SetFormatMap(2, &map_usdot); - break; - case NEON_SQRDMLAH: - mnemonic = "sqrdmlah"; - break; - case NEON_UDOT: - mnemonic = "udot"; - nfd.SetFormatMap(1, &map_usdot); - nfd.SetFormatMap(2, &map_usdot); - break; - case NEON_SQRDMLSH: - mnemonic = "sqrdmlsh"; - break; - } + switch (form_hash_) { + case Hash("fcmla_asimdsame2_c"): + suffix = ", #'u1211*90"; + break; + case Hash("fcadd_asimdsame2_c"): + // Bit 10 is always set, so this gives 90 * 1 or 3. + suffix = ", #'u1212:1010*90"; + break; + case Hash("sdot_asimdsame2_d"): + case Hash("udot_asimdsame2_d"): + case Hash("usdot_asimdsame2_d"): + nfd.SetFormatMap(1, &map_usdot); + nfd.SetFormatMap(2, &map_usdot); + break; + default: + // sqrdml[as]h - nothing to do. + break; } - Format(instr, mnemonic, nfd.Substitute(form)); + Format(instr, mnemonic, nfd.Substitute(form), suffix); } @@ -3566,7 +3561,7 @@ void Disassembler::DisassembleNEONMulByElementLong(const Instruction *instr) { void Disassembler::DisassembleNEONDotProdByElement(const Instruction *instr) { const char *form = instr->ExtractBit(30) ? "'Vd.4s, 'Vn.16" : "'Vd.2s, 'Vn.8"; - const char *suffix = "b, 'Ve.4b['IVByElemIndex]"; + const char *suffix = "b, 'Vm.4b['u1111:2121]"; Format(instr, mnemonic_.c_str(), form, suffix); } @@ -10790,19 +10785,6 @@ int Disassembler::SubstituteImmediateField(const Instruction *instr, } case 'V': { // Immediate Vector. switch (format[2]) { - case 'F': { - switch (format[5]) { - // Convert 'rot' bit encodings into equivalent angle rotation - case 'A': - AppendToOutput("#%" PRId32, - instr->GetImmRotFcadd() == 1 ? 270 : 90); - break; - case 'M': - AppendToOutput("#%" PRId32, instr->GetImmRotFcmlaVec() * 90); - break; - } - return strlen("IVFCN") + 1; - } case 'E': { // IVExtract. AppendToOutput("#%" PRId32, instr->GetImmNEONExt()); return 9; diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc index a76cae97..1fd33957 100644 --- a/src/aarch64/logic-aarch64.cc +++ b/src/aarch64/logic-aarch64.cc @@ -859,23 +859,6 @@ LogicVRegister Simulator::sqrdmulh(VectorFormat vform, } -LogicVRegister Simulator::sdot(VectorFormat vform, - LogicVRegister dst, - const LogicVRegister& src1, - const LogicVRegister& src2, - int index) { - SimVRegister temp; - // NEON indexed `dot` allows the index value exceed the register size. - // Promote the format to Q-sized vector format before the duplication. - dup_elements_to_segments(IsSVEFormat(vform) ? vform - : VectorFormatFillQ(vform), - temp, - src2, - index); - return sdot(vform, dst, src1, temp); -} - - LogicVRegister Simulator::sqrdmlah(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, @@ -887,23 +870,6 @@ LogicVRegister Simulator::sqrdmlah(VectorFormat vform, } -LogicVRegister Simulator::udot(VectorFormat vform, - LogicVRegister dst, - const LogicVRegister& src1, - const LogicVRegister& src2, - int index) { - SimVRegister temp; - // NEON indexed `dot` allows the index value exceed the register size. - // Promote the format to Q-sized vector format before the duplication. - dup_elements_to_segments(IsSVEFormat(vform) ? vform - : VectorFormatFillQ(vform), - temp, - src2, - index); - return udot(vform, dst, src1, temp); -} - - LogicVRegister Simulator::sqrdmlsh(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h index ac94eac7..b1e9ec5c 100644 --- a/src/aarch64/macro-assembler-aarch64.h +++ b/src/aarch64/macro-assembler-aarch64.h @@ -2819,7 +2819,8 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(zip2, Zip2) \ V(smmla, Smmla) \ V(ummla, Ummla) \ - V(usmmla, Usmmla) + V(usmmla, Usmmla) \ + V(usdot, Usdot) #define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \ void MASM(const VRegister& vd, const VRegister& vn, const VRegister& vm) { \ @@ -2971,7 +2972,10 @@ class MacroAssembler : public Assembler, public MacroAssemblerInterface { V(umlal, Umlal) \ V(umlal2, Umlal2) \ V(umlsl, Umlsl) \ - V(umlsl2, Umlsl2) + V(umlsl2, Umlsl2) \ + V(sudot, Sudot) \ + V(usdot, Usdot) + #define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \ void MASM(const VRegister& vd, \ diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc index 17fe73a5..20e6b21b 100644 --- a/src/aarch64/simulator-aarch64.cc +++ b/src/aarch64/simulator-aarch64.cc @@ -92,8 +92,8 @@ Simulator::FormToVisitorFnMap Simulator::form_to_visitor_ = { {"fmls_asimdelem_r_sd", &Simulator::SimulateNEONFPMulByElement}, {"fmulx_asimdelem_r_sd", &Simulator::SimulateNEONFPMulByElement}, {"fmul_asimdelem_r_sd", &Simulator::SimulateNEONFPMulByElement}, - {"sdot_asimdelem_d", &Simulator::VisitNEONByIndexedElement}, - {"udot_asimdelem_d", &Simulator::VisitNEONByIndexedElement}, + {"sdot_asimdelem_d", &Simulator::SimulateNEONDotProdByElement}, + {"udot_asimdelem_d", &Simulator::SimulateNEONDotProdByElement}, {"adclb_z_zzz", &Simulator::SimulateSVEAddSubCarry}, {"adclt_z_zzz", &Simulator::SimulateSVEAddSubCarry}, {"addhnb_z_zz", &Simulator::SimulateSVEAddSubHigh}, @@ -401,6 +401,9 @@ Simulator::FormToVisitorFnMap Simulator::form_to_visitor_ = { {"usdot_z_zzz_s", &Simulator::VisitSVEIntMulAddUnpredicated}, {"sudot_z_zzzi_s", &Simulator::VisitSVEMulIndex}, {"usdot_z_zzzi_s", &Simulator::VisitSVEMulIndex}, + {"usdot_asimdsame2_d", &Simulator::VisitNEON3SameExtra}, + {"sudot_asimdelem_d", &Simulator::SimulateNEONDotProdByElement}, + {"usdot_asimdelem_d", &Simulator::SimulateNEONDotProdByElement}, }; Simulator::Simulator(Decoder* decoder, FILE* stream, SimStack::Allocated stack) @@ -7388,30 +7391,31 @@ void Simulator::VisitNEON3SameExtra(const Instruction* instr) { SimVRegister& rm = ReadVRegister(instr->GetRm()); int rot = 0; VectorFormat vf = nfd.GetVectorFormat(); - if (instr->Mask(NEON3SameExtraFCMLAMask) == NEON_FCMLA) { - rot = instr->GetImmRotFcmlaVec(); - fcmla(vf, rd, rn, rm, rd, rot); - } else if (instr->Mask(NEON3SameExtraFCADDMask) == NEON_FCADD) { - rot = instr->GetImmRotFcadd(); - fcadd(vf, rd, rn, rm, rot); - } else { - switch (instr->Mask(NEON3SameExtraMask)) { - case NEON_SDOT: - sdot(vf, rd, rn, rm); - break; - case NEON_SQRDMLAH: - sqrdmlah(vf, rd, rn, rm); - break; - case NEON_UDOT: - udot(vf, rd, rn, rm); - break; - case NEON_SQRDMLSH: - sqrdmlsh(vf, rd, rn, rm); - break; - default: - VIXL_UNIMPLEMENTED(); - break; - } + + switch (form_hash_) { + case Hash("fcmla_asimdsame2_c"): + rot = instr->GetImmRotFcmlaVec(); + fcmla(vf, rd, rn, rm, rd, rot); + break; + case Hash("fcadd_asimdsame2_c"): + rot = instr->GetImmRotFcadd(); + fcadd(vf, rd, rn, rm, rot); + break; + case Hash("sdot_asimdsame2_d"): + sdot(vf, rd, rn, rm); + break; + case Hash("udot_asimdsame2_d"): + udot(vf, rd, rn, rm); + break; + case Hash("usdot_asimdsame2_d"): + usdot(vf, rd, rn, rm); + break; + case Hash("sqrdmlah_asimdsame2_only"): + sqrdmlah(vf, rd, rn, rm); + break; + case Hash("sqrdmlsh_asimdsame2_only"): + sqrdmlsh(vf, rd, rn, rm); + break; } } @@ -7815,6 +7819,35 @@ void Simulator::SimulateNEONComplexMulByElement(const Instruction* instr) { } } +void Simulator::SimulateNEONDotProdByElement(const Instruction* instr) { + VectorFormat vform = instr->GetNEONQ() ? kFormat4S : kFormat2S; + + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + int index = (instr->GetNEONH() << 1) | instr->GetNEONL(); + + SimVRegister temp; + // NEON indexed `dot` allows the index value exceed the register size. + // Promote the format to Q-sized vector format before the duplication. + dup_elements_to_segments(VectorFormatFillQ(vform), temp, rm, index); + + switch (form_hash_) { + case Hash("sdot_asimdelem_d"): + sdot(vform, rd, rn, temp); + break; + case Hash("udot_asimdelem_d"): + udot(vform, rd, rn, temp); + break; + case Hash("sudot_asimdelem_d"): + usdot(vform, rd, temp, rn); + break; + case Hash("usdot_asimdelem_d"): + usdot(vform, rd, rn, temp); + break; + } +} + void Simulator::VisitNEONByIndexedElement(const Instruction* instr) { NEONFormatDecoder nfd(instr); VectorFormat vform = nfd.GetVectorFormat(); @@ -7848,15 +7881,9 @@ void Simulator::VisitNEONByIndexedElement(const Instruction* instr) { case Hash("sqrdmulh_asimdelem_r"): sqrdmulh(vform, rd, rn, rm, index); break; - case Hash("sdot_asimdelem_d"): - sdot(vform, rd, rn, rm, index); - break; case Hash("sqrdmlah_asimdelem_r"): sqrdmlah(vform, rd, rn, rm, index); break; - case Hash("udot_asimdelem_d"): - udot(vform, rd, rn, rm, index); - break; case Hash("sqrdmlsh_asimdelem_r"): sqrdmlsh(vform, rd, rn, rm, index); break; diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h index 4241d120..cee8724e 100644 --- a/src/aarch64/simulator-aarch64.h +++ b/src/aarch64/simulator-aarch64.h @@ -1246,6 +1246,7 @@ class Simulator : public DecoderVisitor { void SimulateNEONFPMulByElement(const Instruction* instr); void SimulateNEONFPMulByElementLong(const Instruction* instr); void SimulateNEONComplexMulByElement(const Instruction* instr); + void SimulateNEONDotProdByElement(const Instruction* instr); // Integer register accessors. @@ -3174,21 +3175,11 @@ class Simulator : public DecoderVisitor { const LogicVRegister& src1, const LogicVRegister& src2, int index); - LogicVRegister sdot(VectorFormat vform, - LogicVRegister dst, - const LogicVRegister& src1, - const LogicVRegister& src2, - int index); LogicVRegister sqrdmlah(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, const LogicVRegister& src2, int index); - LogicVRegister udot(VectorFormat vform, - LogicVRegister dst, - const LogicVRegister& src1, - const LogicVRegister& src2, - int index); LogicVRegister sqrdmlsh(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, diff --git a/test/aarch64/test-assembler-neon-aarch64.cc b/test/aarch64/test-assembler-neon-aarch64.cc index 8e76dc89..935a714a 100644 --- a/test/aarch64/test-assembler-neon-aarch64.cc +++ b/test/aarch64/test-assembler-neon-aarch64.cc @@ -10842,5 +10842,101 @@ TEST(neon_tbl) { } } +TEST(neon_usdot) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON, + CPUFeatures::kDotProduct, + CPUFeatures::kI8MM); + + START(); + __ Movi(v0.V2D(), 0xffffffffffffffff, 0xffffffffffffffff); + __ Movi(v1.V2D(), 0x7f7f7f7f7f7f7f7f, 0x7f7f7f7f7f7f7f7f); + __ Movi(v2.V2D(), 0x8080808080808080, 0x8080808080808080); + __ Movi(v3.V2D(), 0, 0); + __ Mov(q4, q3); + __ Mov(q5, q3); + __ Mov(q6, q3); + __ Mov(q7, q3); + __ Mov(q8, q3); + __ Mov(q9, q3); + __ Mov(q10, q3); + __ Mov(q11, q3); + + // Test Usdot against Udot/Sdot over the range of inputs where they should be + // equal. + __ Usdot(v3.V2S(), v0.V8B(), v1.V8B()); + __ Udot(v4.V2S(), v0.V8B(), v1.V8B()); + __ Cmeq(v3.V4S(), v3.V4S(), v4.V4S()); + __ Usdot(v5.V4S(), v0.V16B(), v1.V16B()); + __ Udot(v6.V4S(), v0.V16B(), v1.V16B()); + __ Cmeq(v5.V4S(), v5.V4S(), v6.V4S()); + + __ Usdot(v7.V2S(), v1.V8B(), v2.V8B()); + __ Sdot(v8.V2S(), v1.V8B(), v2.V8B()); + __ Cmeq(v7.V4S(), v7.V4S(), v8.V4S()); + __ Usdot(v9.V4S(), v1.V16B(), v2.V16B()); + __ Sdot(v10.V4S(), v1.V16B(), v2.V16B()); + __ Cmeq(v9.V4S(), v9.V4S(), v10.V4S()); + + // Construct values which, when interpreted correctly as signed/unsigned, + // should give a zero result for dot product. + __ Mov(w0, 0x8101ff40); // [-127, 1, -1, 64] as signed bytes. + __ Mov(w1, 0x02fe8002); // [2, 254, 128, 2] as unsigned bytes. + __ Dup(v0.V4S(), w0); + __ Dup(v1.V4S(), w1); + __ Usdot(v11.V4S(), v1.V16B(), v0.V16B()); + + END(); + + if (CAN_RUN()) { + RUN(); + + ASSERT_EQUAL_128(-1, -1, q3); + ASSERT_EQUAL_128(-1, -1, q5); + ASSERT_EQUAL_128(-1, -1, q7); + ASSERT_EQUAL_128(-1, -1, q9); + ASSERT_EQUAL_128(0, 0, q11); + } +} + +TEST(neon_usdot_element) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kI8MM); + + START(); + __ Movi(v0.V2D(), 0xfedcba9876543210, 0x0123456789abcdef); + __ Movi(v1.V2D(), 0x4242424242424242, 0x5555aaaaaaaa5555); + + // Test element Usdot against vector variant. + __ Dup(v2.V4S(), v1.V4S(), 0); + __ Dup(v3.V4S(), v1.V4S(), 1); + __ Dup(v4.V4S(), v1.V4S(), 3); + + __ Mov(q10, q1); + __ Usdot(v10.V2S(), v0.V8B(), v2.V8B()); + __ Mov(q11, q1); + __ Usdot(v11.V2S(), v0.V8B(), v1.S4B(), 0); + __ Cmeq(v11.V4S(), v11.V4S(), v10.V4S()); + + __ Mov(q12, q1); + __ Usdot(v12.V4S(), v0.V16B(), v3.V16B()); + __ Mov(q13, q1); + __ Usdot(v13.V4S(), v0.V16B(), v1.S4B(), 1); + __ Cmeq(v13.V4S(), v13.V4S(), v12.V4S()); + + __ Mov(q14, q1); + __ Usdot(v14.V4S(), v4.V16B(), v0.V16B()); + __ Mov(q15, q1); + __ Sudot(v15.V4S(), v0.V16B(), v1.S4B(), 3); + __ Cmeq(v15.V4S(), v15.V4S(), v14.V4S()); + END(); + + if (CAN_RUN()) { + RUN(); + + ASSERT_EQUAL_128(-1, -1, q11); + ASSERT_EQUAL_128(-1, -1, q13); + ASSERT_EQUAL_128(-1, -1, q15); + } +} + } // namespace aarch64 } // namespace vixl diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc index b39cff08..cf3ba79e 100644 --- a/test/aarch64/test-disasm-neon-aarch64.cc +++ b/test/aarch64/test-disasm-neon-aarch64.cc @@ -1745,6 +1745,11 @@ TEST(neon_3same) { COMPARE_MACRO(Udot(v1.V4S(), v2.V16B(), v3.V16B()), "udot v1.4s, v2.16b, v3.16b"); + COMPARE_MACRO(Usdot(v7.V2S(), v9.V8B(), v30.V8B()), + "usdot v7.2s, v9.8b, v30.8b"); + COMPARE_MACRO(Usdot(v7.V4S(), v9.V16B(), v30.V16B()), + "usdot v7.4s, v9.16b, v30.16b"); + COMPARE_MACRO(And(v6.V8B(), v7.V8B(), v8.V8B()), "and v6.8b, v7.8b, v8.8b"); COMPARE_MACRO(And(v6.V16B(), v7.V16B(), v8.V16B()), "and v6.16b, v7.16b, v8.16b"); @@ -2425,6 +2430,15 @@ TEST(neon_byelement) { COMPARE_MACRO(Fmlsl2(v28.V4S(), v28.V4H(), v7.H(), 0), "fmlsl2 v28.4s, v28.4h, v7.h[0]"); + COMPARE_MACRO(Sudot(v10.V2S(), v21.V8B(), v31.S4B(), 0), + "sudot v10.2s, v21.8b, v31.4b[0]"); + COMPARE_MACRO(Sudot(v12.V4S(), v23.V16B(), v16.S4B(), 3), + "sudot v12.4s, v23.16b, v16.4b[3]"); + COMPARE_MACRO(Usdot(v10.V2S(), v21.V8B(), v31.S4B(), 0), + "usdot v10.2s, v21.8b, v31.4b[0]"); + COMPARE_MACRO(Usdot(v12.V4S(), v23.V16B(), v16.S4B(), 3), + "usdot v12.4s, v23.16b, v16.4b[3]"); + CLEANUP(); } diff --git a/tools/code_coverage.log b/tools/code_coverage.log index 454aee3e..e1bc2332 100644 --- a/tools/code_coverage.log +++ b/tools/code_coverage.log @@ -1,2 +1,3 @@ 1624976463 83.00% 97.44% 95.16% 1628075147 83.04% 97.52% 95.33% +1633016028 83.00% 97.52% 95.32%