Skip to content

Commit

Permalink
[AArch64] Implement intrinsics for FMLAL/FMLALL (single) (#119568)
Browse files Browse the repository at this point in the history
Multi-vector 8-bit floating-point multiply-add long (single)
```c
// Only if __ARM_FEATURE_SME_F8F16 != 0
void svmla[_single]_za16[_mf8]_vg2x1_fpm(uint32_t slice, svmfloat8_t zn,
                                         svmfloat8_t zm, fpm_t fpm)
                                         __arm_streaming __arm_inout("za");

void svmla[_single]_za16[_mf8]_vg2x2_fpm(uint32_t slice, svmfloat8x2_t zn,
                                         svmfloat8_t zm, fpm_t fpm)
                                         __arm_streaming __arm_inout("za");

void svmla[_single]_za16[_mf8]_vg2x4_fpm(uint32_t slice, svmfloat8x4_t zn,
                                         svmfloat8_t zm, fpm_t fpm)
                                         __arm_streaming __arm_inout("za");
// Only if __ARM_FEATURE_SME_F8F32 != 0
void svmla[_single]_za32[_mf8]_vg4x1_fpm(uint32_t slice, svmfloat8_t zn,
                                         svmfloat8_t zm, fpm_t fpm)
                                         __arm_streaming __arm_inout("za");

void svmla[_single]_za32[_mf8]_vg4x2_fpm(uint32_t slice, svmfloat8x2_t zn,
                                         svmfloat8_t zm, fpm_t fpm)
                                         __arm_streaming __arm_inout("za");

void svmla[_single]_za32[_mf8]_vg4x4_fpm(uint32_t slice, svmfloat8x4_t zn,
                                         svmfloat8_t zm, fpm_t fpm)
                                         __arm_streaming __arm_inout("za");
 ```
 In accordance with ARM-software/acle#323.
 
Co-authored-by: Momchil Velikov [email protected]
  • Loading branch information
SpencerAbson authored Dec 17, 2024
1 parent 2a7ed2c commit 9c89b40
Show file tree
Hide file tree
Showing 7 changed files with 360 additions and 89 deletions.
38 changes: 26 additions & 12 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -860,24 +860,38 @@ let SMETargetGuard = "sme-f8f32" in {
def SVMOPA_FP8_ZA32 : Inst<"svmopa_za32[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za32",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>]>;
// FMLALL (indexed)
def SVMLA_FP8_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_LANE_ZA32_VG4x1 : Inst<"svmla_lane_za32[_mf8]_vg4x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x1",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_LANE_ZA32_VG4x2 : Inst<"svmla_lane_za32[_mf8]_vg4x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x2",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_LANE_ZA16_VG4x4 : Inst<"svmla_lane_za32[_mf8]_vg4x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlall_lane_za32_vg4x4",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
// FMLALL (single)
def SVMLA_FP8_SINGLE_ZA32_VG4x1 : Inst<"svmla[_single]_za32[_mf8]_vg4x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x1",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVMLA_FP8_SINGLE_ZA32_VG4x2 : Inst<"svmla[_single]_za32[_mf8]_vg4x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x2",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVMLA_FP8_SINGLE_ZA32_VG4x4 : Inst<"svmla[_single]_za32[_mf8]_vg4x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlall_single_za32_vg4x4",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
}

let SMETargetGuard = "sme-f8f16" in {
def SVMOPA_FP8_ZA16 : Inst<"svmopa_za16[_mf8]_m_fpm", "viPPdd>", "m", MergeNone, "aarch64_sme_fp8_fmopa_za16",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>]>;
// FMLAL (indexed)
def SVMLA_FP8_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_LANE_ZA16_VG2x1 : Inst<"svmla_lane_za16[_mf8]_vg2x1_fpm", "vmddi>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x1",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_LANE_ZA16_VG2x2 : Inst<"svmla_lane_za16[_mf8]_vg2x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x2",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
def SVMLA_FP8_LANE_ZA16_VG2x4 : Inst<"svmla_lane_za16[_mf8]_vg2x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fmlal_lane_za16_vg2x4",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_15>]>;
// FMLAL (single)
def SVMLA_FP8_SINGLE_ZA16_VG2x1 : Inst<"svmla[_single]_za16[_mf8]_vg2x1_fpm", "vmdd>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x1",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVMLA_FP8_SINGLE_ZA16_VG2x2 : Inst<"svmla[_single]_za16[_mf8]_vg2x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x2",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVMLA_FP8_SINGLE_ZA16_VG2x4 : Inst<"svmla[_single]_za16[_mf8]_vg2x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fmlal_single_za16_vg2x4",
[IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
}

} // let SVETargetGuard = InvalidMode
130 changes: 121 additions & 9 deletions clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_mla.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
#include <arm_sme.h>

#ifdef SME_OVERLOADED_FORMS
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
#else
#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
#endif

// FMLAL (indexed)
Expand All @@ -33,7 +33,7 @@
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, 0, fpm);
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x1_fpm,,)(slice, zn, zm, 0, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x2(
Expand All @@ -51,7 +51,7 @@ void test_svmla_lane_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm,
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, 15, fpm);
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x2_fpm,,)(slice, zn, zm, 15, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_vg2x4(
Expand All @@ -69,7 +69,7 @@ void test_svmla_lane_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, 7, fpm);
SME_ACLE_FUNC(svmla_lane_za16,_mf8,_vg2x4_fpm,,)(slice, zn, zm, 7, fpm);
}

// FMLALL (indexed)
Expand All @@ -89,7 +89,7 @@ void test_svmla_lane_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, 0, fpm);
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x1_fpm,,)(slice, zn, zm, 0, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x2(
Expand All @@ -107,7 +107,7 @@ void test_svmla_lane_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm,
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, 15, fpm);
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x2_fpm,,)(slice, zn, zm, 15, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_lane_za32_vg4x4(
Expand All @@ -125,5 +125,117 @@ void test_svmla_lane_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm
// CPP-CHECK-NEXT: ret void
//
void test_svmla_lane_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, 7, fpm);
}
SME_ACLE_FUNC(svmla_lane_za32,_mf8,_vg4x4_fpm,,)(slice, zn, zm, 7, fpm);
}

// FMLAL (single)

// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x1(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x1ju13__SVMfloat8_tS_m(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_za16_vg2x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x1_fpm)(slice, zn, zm, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x2(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x2j13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_za16_vg2x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x2_fpm)(slice, zn, zm, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_single_za16_vg2x4(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za16_vg2x4j13svmfloat8x4_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlal.single.za16.vg2x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_za16_vg2x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla,_single,_za16,_mf8,_vg2x4_fpm)(slice, zn, zm, fpm);
}

// FMLALL (single)

// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x1(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x1ju13__SVMfloat8_tS_m(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x1(i32 [[SLICE]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_za32_vg4x1(uint32_t slice, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x1_fpm)(slice, zn, zm, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x2(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x2j13svmfloat8x2_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_za32_vg4x2(uint32_t slice, svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x2_fpm)(slice, zn, zm, fpm);
}

// CHECK-LABEL: define dso_local void @test_svmla_single_za32_vg4x4(
// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: define dso_local void @_Z28test_svmla_single_za32_vg4x4j13svmfloat8x4_tu13__SVMfloat8_tm(
// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmlall.single.za32.vg4x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
// CPP-CHECK-NEXT: ret void
//
void test_svmla_single_za32_vg4x4(uint32_t slice, svmfloat8x4_t zn, svmfloat8_t zm, fpm_t fpm) __arm_streaming __arm_inout("za") {
SME_ACLE_FUNC(svmla,_single,_za32,_mf8,_vg4x4_fpm)(slice, zn, zm, fpm);
}
18 changes: 18 additions & 0 deletions clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_mla.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,22 @@ void test_svmla(uint32_t slice, svmfloat8_t zn, svmfloat8x2_t znx2, svmfloat8x4_

// expected-error@+1 {{'svmla_lane_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
svmla_lane_za32_mf8_vg4x4_fpm(slice, znx4, zn, 0, fpmr);

// expected-error@+1 {{'svmla_single_za16_mf8_vg2x1_fpm' needs target feature sme,sme-f8f16}}
svmla_single_za16_mf8_vg2x1_fpm(slice, zn, zn, fpmr);

// expected-error@+1 {{'svmla_single_za16_mf8_vg2x2_fpm' needs target feature sme,sme-f8f16}}
svmla_single_za16_mf8_vg2x2_fpm(slice, znx2, zn, fpmr);

// expected-error@+1 {{'svmla_single_za16_mf8_vg2x4_fpm' needs target feature sme,sme-f8f16}}
svmla_single_za16_mf8_vg2x4_fpm(slice, znx4, zn, fpmr);

// expected-error@+1 {{'svmla_single_za32_mf8_vg4x1_fpm' needs target feature sme,sme-f8f32}}
svmla_single_za32_mf8_vg4x1_fpm(slice, zn, zn, fpmr);

// expected-error@+1 {{'svmla_single_za32_mf8_vg4x2_fpm' needs target feature sme,sme-f8f32}}
svmla_single_za32_mf8_vg4x2_fpm(slice, znx2, zn, fpmr);

// expected-error@+1 {{'svmla_single_za32_mf8_vg4x4_fpm' needs target feature sme,sme-f8f32}}
svmla_single_za32_mf8_vg4x4_fpm(slice, znx4, zn, fpmr);
}
Loading

0 comments on commit 9c89b40

Please sign in to comment.