-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
examples: add a cpu_features/ folder, with several examples, using SS…
…E and MMX assembly instructions (vlang#22645)
- Loading branch information
Showing
7 changed files
with
279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
Note: for a more in-depth study, see https://en.wikibooks.org/wiki/X86_Assembly | ||
|
||
# SSE and MMX Extensions | ||
|
||
This document provides an overview of the SSE and MMX extensions used in the project. | ||
|
||
## Table of Contents | ||
|
||
- [Introduction](#introduction) | ||
- [SSE Extensions](#sse-extensions) | ||
- [MMX Extensions](#mmx-extensions) | ||
- [Usage](#usage) | ||
|
||
## Introduction | ||
|
||
SSE (Streaming SIMD Extensions) and MMX (MultiMedia eXtensions) are instruction sets used to | ||
enhance the performance of multimedia and signal processing applications. | ||
|
||
## SSE Extensions | ||
|
||
SSE extensions provide a set of instructions that can handle multiple data with a single | ||
instruction, improving the performance of applications that require heavy mathematical | ||
computations. | ||
|
||
from: [wikibooks](https://en.wikibooks.org/wiki/X86_Assembly/SSE#SSE_Instruction_Set) | ||
There are literally hundreds of SSE instructions, some of which are capable of much more than | ||
simple SIMD arithmetic. For more in-depth references take a look at the resources chapter of this | ||
book. | ||
|
||
You may notice that many floating point SSE instructions end with something like PS or SD. These | ||
suffixes differentiate between different versions of the operation. The first letter describes | ||
whether the instruction should be Packed or Scalar. Packed operations are applied to every member | ||
of the register, while scalar operations are applied to only the first value. For example, in | ||
pseudo-code, a packed add would be executed as: | ||
|
||
``` | ||
v1[0] = v1[0] + v2[0] | ||
v1[1] = v1[1] + v2[1] | ||
v1[2] = v1[2] + v2[2] | ||
v1[3] = v1[3] + v2[3] | ||
``` | ||
|
||
While a scalar add would only be: | ||
|
||
``` | ||
v1[0] = v1[0] + v2[0] | ||
``` | ||
|
||
The second letter refers to the data size: either Single or Double. This simply tells the | ||
processor whether to use the register as four 32-bit floats or two 64-bit doubles, respectively. | ||
|
||
## MMX Extensions | ||
|
||
MMX extensions are designed to accelerate multimedia and communication applications by providing | ||
instructions that can process multiple data elements in parallel. | ||
|
||
## Usage | ||
|
||
To use these extensions in your project, ensure that your compiler supports them and that you have | ||
enabled the appropriate flags. | ||
On Linux, you can run the command `lscpu` and look at its `Flags` field to check which of these extensions your CPU supports. | ||
|
||
Note: the examples here will compile, but will not run, on CPU architectures other than amd64, such as ARM or RISC-V. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// MMX Instruction Set | ||
// Several suffixes are used to indicate what data size the instruction operates on: | ||
// Byte (8 bits) | ||
// Word (16 bits) | ||
// Double word (32 bits) | ||
// Quad word (64 bits) | ||
// The signedness of the operation is also signified by the suffix: US for unsigned and S for signed. | ||
// For example, PSUBUSB subtracts unsigned bytes, while PSUBSW subtracts signed words. | ||
// MMX defined over 40 new instructions, listed below. | ||
// EMMS, MOVD, MOVQ, PACKSSDW, PACKSSWB, PACKUSWB, PADDB, PADDD, PADDSB, PADDSW, PADDUSB, PADDUSW, | ||
// PADDW, PAND, PANDN, PCMPEQB, PCMPEQD, PCMPEQW, PCMPGTB, PCMPGTD, PCMPGTW, PMADDWD, PMULHW, PMULLW, | ||
// POR, PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW, PSUBB, PSUBD, PSUBSB, PSUBSW, PSUBUSB, | ||
// PSUBUSW, PSUBW, PUNPCKHBW, PUNPCKHDQ, PUNPCKHWD, PUNPCKLBW, PUNPCKLDQ, PUNPCKLWD, PXOR | ||
|
||
// add_vectors_mmx adds the 8 bytes at `a` to the 8 bytes at `b`, lane by lane, and stores
// the 8 sums at `result`. PADDB wraps around on overflow (it does not saturate).
// Each of the three pointers must reference at least 8 accessible bytes.
// The `memory` clobber tells the C compiler that the asm block writes through `result`,
// since that store is not expressed as an output operand.
@[if amd64 && !tinyc && !msvc]
fn add_vectors_mmx(a &u8, b &u8, result &u8) {
	unsafe {
		asm volatile amd64 {
			movq mm0, [a] // Load 8 bytes from a into MMX register mm0
			movq mm1, [b] // Load 8 bytes from b into MMX register mm1
			paddb mm0, mm1 // Add the two vectors byte by byte
			movq [result], mm0 // Store the 8 sums back to memory
			emms // Leave MMX state, so that later x87 floating point code keeps working
			; ; r (a)
			  r (b)
			  r (result)
			; mm0
			  mm1
			  memory
		}
	}
}
|
||
// main adds two 8-byte vectors with the MMX helper, prints the sums,
// and checks that every lane adds up to 9.
fn main() {
	lhs := [u8(1), 2, 3, 4, 5, 6, 7, 8]
	rhs := [u8(8), 7, 6, 5, 4, 3, 2, 1]
	sums := []u8{len: 8}
	add_vectors_mmx(&lhs[0], &rhs[0], &sums[0])
	println(sums)
	expected := [u8(9), 9, 9, 9, 9, 9, 9, 9]
	assert sums == expected
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
// SSE Instruction Set | ||
// SSE: Added with Pentium III | ||
// Floating-point Instructions: | ||
// ADDPS, ADDSS, CMPPS, CMPSS, COMISS, CVTPI2PS, CVTPS2PI, CVTSI2SS, CVTSS2SI, CVTTPS2PI, CVTTSS2SI, | ||
// DIVPS, DIVSS, LDMXCSR, MAXPS, MAXSS, MINPS, MINSS, MOVAPS, MOVHLPS, MOVHPS, MOVLHPS, MOVLPS, | ||
// MOVMSKPS, MOVNTPS, MOVSS, MOVUPS, MULPS, MULSS, RCPPS, RCPSS, RSQRTPS, RSQRTSS, SHUFPS, SQRTPS, | ||
// SQRTSS, STMXCSR, SUBPS, SUBSS, UCOMISS, UNPCKHPS, UNPCKLPS | ||
// | ||
// Integer Instructions: | ||
// ANDNPS, ANDPS, ORPS, PAVGB, PAVGW, PEXTRW, PINSRW, PMAXSW, PMAXUB, PMINSW, PMINUB, PMOVMSKB, PMULHUW, PSADBW, PSHUFW, XORPS | ||
// The ADDPS instruction adds two vectors of floats using SSE instructions. | ||
|
||
// add_vectors_sse adds the 4 floats at `a` to the 4 floats at `b`, lane by lane, and stores
// the 4 sums at `result`. MOVUPS is used for all memory accesses, so the pointers do not
// have to be 16-byte aligned. Each pointer must reference at least 4 accessible f32 values.
// The `memory` clobber tells the C compiler that the asm block writes through `result`,
// since that store is not expressed as an output operand.
@[if amd64 && !tinyc && !msvc]
fn add_vectors_sse(a &f32, b &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into register xmm0
			movups xmm1, [b] // Load 4 floats from array b into register xmm1
			addps xmm0, xmm1 // Packed single precision add: xmm0[i] += xmm1[i]
			movups [result], xmm0 // Store the 4 sums back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}
|
||
// main adds two 4-float vectors with the SSE helper, prints the sums,
// and checks that every lane adds up to 5.0 .
fn main() {
	lhs := [f32(1.0), 2.0, 3.0, 4.0]
	rhs := [f32(4.0), 3.0, 2.0, 1.0]
	sums := []f32{len: 4}
	add_vectors_sse(&lhs[0], &rhs[0], &sums[0])
	println(sums)
	expected := [f32(5.0), 5.0, 5.0, 5.0]
	assert sums == expected
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// SSE Instruction Set | ||
// SSE2: Added with Pentium 4 | ||
// Floating-point Instructions: | ||
// ADDPD, ADDSD, ANDNPD, ANDPD, CMPPD, CMPSD*, COMISD, CVTDQ2PD, CVTDQ2PS, CVTPD2DQ, CVTPD2PI, | ||
// CVTPD2PS, CVTPI2PD, CVTPS2DQ, CVTPS2PD, CVTSD2SI, CVTSD2SS, CVTSI2SD, CVTSS2SD, CVTTPD2DQ, | ||
// CVTTPD2PI, CVTTPS2DQ, CVTTSD2SI, DIVPD, DIVSD, MAXPD, MAXSD, MINPD, MINSD, MOVAPD, MOVHPD, | ||
// MOVLPD, MOVMSKPD, MOVSD*, MOVUPD, MULPD, MULSD, ORPD, SHUFPD, SQRTPD, SQRTSD, SUBPD, SUBSD, | ||
// UCOMISD, UNPCKHPD, UNPCKLPD, XORPD | ||
// * CMPSD and MOVSD have the same name as the string instruction mnemonics CMPSD (CMPS) and | ||
// MOVSD (MOVS); however, the former refer to scalar double-precision floating-points whereas | ||
// the latter refer to doubleword strings. | ||
// Integer Instructions: | ||
// MOVDQ2Q, MOVDQA, MOVDQU, MOVQ2DQ, PADDQ, PSUBQ, PMULUDQ, PSHUFHW, PSHUFLW, PSHUFD, PSLLDQ, PSRLDQ, PUNPCKHQDQ, PUNPCKLQDQ | ||
// The MULPD instruction multiplies two vectors of doubles using SSE2 instructions. | ||
|
||
// multiply_vectors_sse2 multiplies the 2 doubles at `a` with the 2 doubles at `b`, lane by
// lane, and stores the 2 products at `result`. MOVUPD is used for all memory accesses, so the
// pointers do not have to be 16-byte aligned. Each pointer must reference at least 2
// accessible f64 values.
// The `memory` clobber tells the C compiler that the asm block writes through `result`,
// since that store is not expressed as an output operand.
@[if amd64 && !tinyc && !msvc]
fn multiply_vectors_sse2(a &f64, b &f64, result &f64) {
	unsafe {
		asm volatile amd64 {
			movupd xmm0, [a] // Load 2 doubles from array a into register xmm0
			movupd xmm1, [b] // Load 2 doubles from array b into register xmm1
			mulpd xmm0, xmm1 // Packed double precision multiply: xmm0[i] *= xmm1[i]
			movupd [result], xmm0 // Store the 2 products back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}
|
||
// main multiplies two 2-double vectors with the SSE2 helper, prints the products,
// and checks them against the values computed by hand.
fn main() {
	lhs := [f64(1.5), 2.5]
	rhs := [f64(3.5), 4.5]
	products := []f64{len: 2}
	multiply_vectors_sse2(&lhs[0], &rhs[0], &products[0])
	println(products)
	// lane 0: 1.5 * 3.5 = 5.25
	// lane 1: 2.5 * 4.5 = 11.25
	expected := [f64(5.25), 11.25]
	assert products == expected
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// SSE Instruction Set | ||
// SSE3: Added with later Pentium 4 | ||
// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP | ||
// The HADDPS instruction performs horizontal addition of two vectors of floats using SSE3 | ||
// instructions. | ||
|
||
// horizontal_add_sse3 sums adjacent pairs of floats with HADDPS and stores 4 results:
//   result[0] = a[0] + a[1]
//   result[1] = a[2] + a[3]
//   result[2] = b[0] + b[1]
//   result[3] = b[2] + b[3]
// MOVUPS is used for all memory accesses, because MOVAPS faults on addresses that are not
// 16-byte aligned, and the callers pass plain array element pointers without such a
// guarantee. Each pointer must reference at least 4 accessible f32 values.
// The `memory` clobber tells the C compiler that the asm block writes through `result`,
// since that store is not expressed as an output operand.
@[if amd64 && !tinyc && !msvc]
fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into register xmm0
			movups xmm1, [b] // Load 4 floats from array b into register xmm1
			haddps xmm0, xmm1 // Horizontal add of the pairs in xmm0, then of the pairs in xmm1
			movups [result], xmm0 // Store the 4 pair sums back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}
|
||
// main runs the SSE3 horizontal add on two 4-float vectors, prints the result,
// and checks the four pair sums.
fn main() {
	first := [f32(1.0), 2.0, 3.0, 4.0]
	second := [f32(5.0), 6.0, 7.0, 8.0]
	pair_sums := []f32{len: 4}
	horizontal_add_sse3(&first[0], &second[0], &pair_sums[0])
	println(pair_sums)
	// Horizontal addition sums neighbouring lanes, first vector first:
	//   1.0 + 2.0 = 3.0
	//   3.0 + 4.0 = 7.0
	//   5.0 + 6.0 = 11.0
	//   7.0 + 8.0 = 15.0
	expected := [f32(3.0), 7.0, 11.0, 15.0]
	assert pair_sums == expected
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// SSE Instruction Set | ||
// SSE4.1: Added with later Core 2 | ||
// MPSADBW, PHMINPOSUW, PMULLD, PMULDQ, DPPS, DPPD, BLENDPS, BLENDPD, BLENDVPS, BLENDVPD, | ||
// PBLENDVB, PBLENDW, PMINSB, PMAXSB, PMINUW, PMAXUW, PMINUD, PMAXUD, PMINSD, PMAXSD, ROUNDPS, | ||
// ROUNDSS, ROUNDPD, ROUNDSD, INSERTPS, PINSRB, PINSRD, PINSRQ, EXTRACTPS, PEXTRB, PEXTRW, | ||
// PEXTRD, PEXTRQ, PMOVSXBW, PMOVZXBW, PMOVSXBD, PMOVZXBD, PMOVSXBQ, PMOVZXBQ, PMOVSXWD, | ||
// PMOVZXWD, PMOVSXWQ, PMOVZXWQ, PMOVSXDQ, PMOVZXDQ, PTEST, PCMPEQQ, PACKUSDW, MOVNTDQA | ||
|
||
// round_floats_sse4_1 rounds the 4 floats at `a` with ROUNDPS and stores the 4 rounded
// values at `result`. The immediate 0 selects round-to-nearest, with ties going to the
// even value (for example 2.5 becomes 2.0). MOVUPS is used for all memory accesses, so the
// pointers do not have to be 16-byte aligned. Both pointers must reference at least 4
// accessible f32 values.
// The `memory` clobber tells the C compiler that the asm block writes through `result`,
// since that store is not expressed as an output operand.
@[if amd64 && !tinyc && !msvc]
fn round_floats_sse4_1(a &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into xmm0
			roundps xmm0, xmm0, 0 // Round each lane to the nearest integer (ties to even)
			movups [result], xmm0 // Store the 4 rounded values in the result array
			; ; r (a)
			  r (result)
			; xmm0
			  memory
		}
	}
}
|
||
// main rounds a 4-float vector with the SSE4.1 helper, prints the result,
// and checks the rounded values.
fn main() {
	inputs := [f32(1.2), 2.5, 3.8, 4.4]
	rounded := []f32{len: 4}
	// The helper uses rounding mode 0, i.e. rounding to the nearest integer.
	round_floats_sse4_1(&inputs[0], &rounded[0])
	println(rounded)
	// The tie 2.5 goes to the even neighbour 2.0, hence [1.0, 2.0, 4.0, 4.0].
	expected := [f32(1.0), 2.0, 4.0, 4.0]
	assert rounded == expected
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// SSE Instruction Set | ||
// SSSE3: Added with Xeon 5100 and early Core 2 | ||
// PSIGNW, PSIGND, PSIGNB, PSHUFB, PMULHRSW, PMADDUBSW, PHSUBW, PHSUBSW, PHSUBD, PHADDW, PHADDSW, | ||
// PHADDD, PALIGNR, PABSW, PABSD, PABSB | ||
// The PSIGNW instruction negates or leaves elements unchanged based on another vector's signs. | ||
|
||
// psignw_example applies PSIGNW to 8 signed 16-bit lanes and stores the 8 results at
// `result`. For each lane i: a[i] is negated when b[i] is negative, kept when b[i] is
// positive, and replaced by 0 when b[i] is 0.
// MOVDQU is used for all memory accesses, because MOVDQA faults on addresses that are not
// 16-byte aligned, and the callers pass plain array element pointers without such a
// guarantee. Each pointer must reference at least 8 accessible i16 values.
// The `memory` clobber tells the C compiler that the asm block writes through `result`,
// since that store is not expressed as an output operand.
@[if amd64 && !tinyc && !msvc]
fn psignw_example(a &i16, b &i16, result &i16) {
	unsafe {
		asm volatile amd64 {
			movdqu xmm0, [a] // Load 8 signed 16-bit integers from array a into xmm0
			movdqu xmm1, [b] // Load 8 signed 16-bit integers from array b into xmm1
			psignw xmm0, xmm1 // Adjust the sign of each lane in xmm0 based on the lane in xmm1
			movdqu [result], xmm0 // Store the 8 results back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory
		}
	}
}
|
||
fn main() {
	values := [i16(1), -2, 3, -4, 5, -6, 7, -8]
	signs := [i16(1), -1, 1, -1, 1, -1, 1, -1]
	result0 := []i16{len: 8}
	psignw_example(&values[0], &signs[0], &result0[0])
	dump(result0)
	// Every negative lane of `values` is paired with a negative lane of `signs`, so PSIGNW
	// flips it to positive, while the positive lanes are left unchanged.
	expected := [i16(1), 2, 3, 4, 5, 6, 7, 8]
	assert result0 == expected
}