Skip to content

Commit

Permalink
Add matrix multiplication benchmark
Browse files Browse the repository at this point in the history
This is the first double-precision SIMD benchmark.
Newly covered SIMD instructions:
f64x2.add
f64x2.mul
v128.load64_splat
  • Loading branch information
Peter Pronai authored and ksh8281 committed Nov 14, 2023
1 parent 3e3c99e commit df1d9f4
Show file tree
Hide file tree
Showing 3 changed files with 234 additions and 2 deletions.
6 changes: 4 additions & 2 deletions test/wasmBenchmarker/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,21 @@
"huffman": 0,
"k_nucleotide": 1,
"mandelbrot": 775007,
"matrix_multiply": 3920.0,
"nbody": -0.1691057,
"nqueens": 0,
"prime": 48611,
"quick_sort": 0,
"red-black": 4000000,
"salesman": 840,
"simdMandelbrot": 775007,
"simdNbody": -0.1691057
"simdNbody": -0.1691057,
"simd_matrix_multiply": 3920.0,
}
# https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/simple.html#simple
gameTests = ["mandelbrot", "nbody", "gregory", "fannkuch", "k_nucleotide"]

simdTests = ["simdMandelbrot", "simdNbody"]
simdTests = ["simdMandelbrot", "simdNbody", "simd_matrix_multiply"]

def prepare_arg_pars():
parser = argparse.ArgumentParser()
Expand Down
97 changes: 97 additions & 0 deletions test/wasmBenchmarker/ctests/matrix_multiply.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (c) 2023-present Samsung Electronics Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Adapted from https://github.com/ngzhian/simd-benchmarks */

#include <stdio.h>
#include <stdint.h>

// Number of elements in a 4x4 square matrix
#define MATRIX_SIZE 16

/*
 * Compute four consecutive outputs of the product:
 *   out[c] = b[0] * a[c] + b[1] * a[4 + c] + b[2] * a[8 + c] + b[3] * a[12 + c]
 *
 * `a` is the caller's private snapshot of the first operand. The four
 * coefficients from `b` are read before the first store to `out`, and every
 * sum is evaluated left to right.
 */
static inline void multiply_row(const double a[], const double b[], double out[])
{
    const double c0 = b[0];
    const double c1 = b[1];
    const double c2 = b[2];
    const double c3 = b[3];

    out[0] = c0 * a[0] + c1 * a[4] + c2 * a[8] + c3 * a[12];
    out[1] = c0 * a[1] + c1 * a[5] + c2 * a[9] + c3 * a[13];
    out[2] = c0 * a[2] + c1 * a[6] + c2 * a[10] + c3 * a[14];
    out[3] = c0 * a[3] + c1 * a[7] + c2 * a[11] + c3 * a[15];
}

/*
 * Scalar 4x4 matrix product on arrays of 16 doubles, kept fully unrolled.
 *
 * For r, c in 0..3:
 *   out_m[4 * r + c] = sum over k of m2[4 * r + k] * m1[4 * k + c]
 *
 * All of m1 is copied into a local snapshot before anything is written, and
 * each group of four m2 values is read immediately before the four outputs
 * computed from it are stored.
 */
void multiply_scalar(const double m1[], const double m2[], double out_m[])
{
    /* Snapshot of m1, taken before the first store to out_m. */
    const double a[16] = {
        m1[0],  m1[1],  m1[2],  m1[3],
        m1[4],  m1[5],  m1[6],  m1[7],
        m1[8],  m1[9],  m1[10], m1[11],
        m1[12], m1[13], m1[14], m1[15],
    };

    /* One call per group of four outputs, in ascending order. */
    multiply_row(a, m2 + 0, out_m + 0);
    multiply_row(a, m2 + 4, out_m + 4);
    multiply_row(a, m2 + 8, out_m + 8);
    multiply_row(a, m2 + 12, out_m + 12);
}

/*
 * Build two identical operands holding 0, 1, ..., MATRIX_SIZE - 1, multiply
 * them with multiply_scalar(), and return the sum of all product elements.
 * With MATRIX_SIZE equal to 16 that sum is 3920.
 */
double runtime()
{
    double lhs[MATRIX_SIZE];
    double rhs[MATRIX_SIZE];
    double product[MATRIX_SIZE];
    double total = 0;
    int idx;

    /* Element idx of both operands is idx itself. */
    idx = 0;
    while (idx < MATRIX_SIZE) {
        const double value = (double)idx;
        lhs[idx] = value;
        rhs[idx] = value;
        idx++;
    }

    multiply_scalar(lhs, rhs, product);

    /* Accumulate in ascending index order. */
    for (idx = 0; idx < MATRIX_SIZE; idx++) {
        total += product[idx];
    }

    return total;
}
133 changes: 133 additions & 0 deletions test/wasmBenchmarker/ctests/simd_matrix_multiply.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/*
* Copyright (c) 2023-present Samsung Electronics Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Adapted from https://github.com/ngzhian/simd-benchmarks */

#include <stdio.h>
#include <stdint.h>
#include <wasm_simd128.h>

// Number of elements in a 4x4 square matrix
#define MATRIX_SIZE 16

/*
 * 4x4 matrix product on arrays of 16 doubles using WebAssembly 128-bit SIMD
 * intrinsics; every v128_t here is used as two f64 lanes.
 *
 * For each group of four values b[0..3] read from m2 at offset 4 * r
 * (r = 0..3) and each lane pair starting at c (c = 0 or 2), the function
 * stores to out_m[4 * r + c] and out_m[4 * r + c + 1]:
 *
 *   b[0] * m1[c, c+1]
 *     + (b[1] * m1[4+c, 5+c]
 *       + (b[2] * m1[8+c, 9+c]
 *         + b[3] * m1[12+c, 13+c]))
 *
 * The additions are nested right to left, as written in the calls below.
 *
 * All of m1 is loaded before the first store. Each pair of m2 vectors is
 * loaded immediately before the two stores that consume it.
 */
void multiply_simd(const double m1[], const double m2[], double out_m[])
{
/* m1 as eight two-lane vectors: a(2k) = m1[4k], m1[4k+1];
 * a(2k+1) = m1[4k+2], m1[4k+3]. */
v128_t a0 = wasm_v128_load(m1 + 0);
v128_t a1 = wasm_v128_load(m1 + 2);
v128_t a2 = wasm_v128_load(m1 + 4);
v128_t a3 = wasm_v128_load(m1 + 6);
v128_t a4 = wasm_v128_load(m1 + 8);
v128_t a5 = wasm_v128_load(m1 + 10);
v128_t a6 = wasm_v128_load(m1 + 12);
v128_t a7 = wasm_v128_load(m1 + 14);

/* m2[0..3]: coefficients for out_m[0..3]. */
v128_t b0 = wasm_v128_load(m2 + 0);
v128_t b1 = wasm_v128_load(m2 + 2);

/* extract_lane + splat copies one m2 element into both lanes so it scales
 * two adjacent m1 elements at once. The even-numbered a vectors feed the
 * first two outputs of the group, the odd-numbered ones the last two. */
wasm_v128_store(out_m + 0,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b0, 0)), a0),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b0, 1)), a2),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b1, 0)), a4),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b1, 1)), a6)))));
wasm_v128_store(out_m + 2,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b0, 0)), a1),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b0, 1)), a3),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b1, 0)), a5),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b1, 1)), a7)))));

/* m2[4..7]: coefficients for out_m[4..7]. */
v128_t b2 = wasm_v128_load(m2 + 4);
v128_t b3 = wasm_v128_load(m2 + 6);

wasm_v128_store(out_m + 4,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b2, 0)), a0),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b2, 1)), a2),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b3, 0)), a4),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b3, 1)), a6)))));
wasm_v128_store(out_m + 6,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b2, 0)), a1),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b2, 1)), a3),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b3, 0)), a5),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b3, 1)), a7)))));

/* m2[8..11]: coefficients for out_m[8..11]. */
v128_t b4 = wasm_v128_load(m2 + 8);
v128_t b5 = wasm_v128_load(m2 + 10);

wasm_v128_store(out_m + 8,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b4, 0)), a0),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b4, 1)), a2),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b5, 0)), a4),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b5, 1)), a6)))));
wasm_v128_store(out_m + 10,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b4, 0)), a1),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b4, 1)), a3),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b5, 0)), a5),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b5, 1)), a7)))));

/* m2[12..15]: coefficients for out_m[12..15]. */
v128_t b6 = wasm_v128_load(m2 + 12);
v128_t b7 = wasm_v128_load(m2 + 14);

wasm_v128_store(out_m + 12,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b6, 0)), a0),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b6, 1)), a2),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b7, 0)), a4),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b7, 1)), a6)))));
wasm_v128_store(out_m + 14,
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b6, 0)), a1),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b6, 1)), a3),
wasm_f64x2_add(
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b7, 0)), a5),
wasm_f64x2_mul(wasm_f64x2_splat(wasm_f64x2_extract_lane(b7, 1)), a7)))));
}

/*
 * Build two identical operands holding 0, 1, ..., MATRIX_SIZE - 1, multiply
 * them with multiply_simd(), and return the sum of all product elements.
 */
double runtime()
{
    double lhs[MATRIX_SIZE];
    double rhs[MATRIX_SIZE];
    double product[MATRIX_SIZE];
    double total = 0;
    int idx;

    /* Element idx of both operands is idx itself. */
    idx = 0;
    while (idx < MATRIX_SIZE) {
        const double value = (double)idx;
        lhs[idx] = value;
        rhs[idx] = value;
        idx++;
    }

    multiply_simd(lhs, rhs, product);

    /* Accumulate in ascending index order. */
    for (idx = 0; idx < MATRIX_SIZE; idx++) {
        total += product[idx];
    }

    return total;
}

0 comments on commit df1d9f4

Please sign in to comment.