From b79ec56fb8082723e342bf2889111ca84b8cd6bf Mon Sep 17 00:00:00 2001
From: Lokathor
Date: Thu, 9 Jan 2025 22:52:39 -0700
Subject: [PATCH] aeabi

---
 CHANGELOG.md |   8 +
 Cargo.toml   |   4 +
 src/lib.rs   |   2 +
 src/mem.rs   | 704 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 717 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 788ca21..ad29106 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+#### 0.14.0
+
+* **Break:** `copy_u32x8_unchecked` is an `extern "C"` fn now.
+* New cargo feature `aeabi_mem_fns` causes the appropriate functions to be
+  generated. They're still written as `#[naked]` functions, so they require
+  Nightly. It turns out that Rust emits enough implicit `memcpy` calls that
+  this makes a measurable performance difference.
+
 #### 0.13.3
 
 * Added `TextEntry::to_u16`
diff --git a/Cargo.toml b/Cargo.toml
index e5822bd..6053af2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,10 @@ default = ["track_caller", "on_gba"]
 track_caller = []
 on_gba = []
 fixed = ["dep:fixed"]
+# Provide the ARM AEABI memory functions. Requires Nightly because they're
+# written as naked functions for efficiency. They're `no_mangle` and they're
+# placed in IWRAM.
+aeabi_mem_fns = []
 
 [dependencies]
 voladdress = "1.3.0"
diff --git a/src/lib.rs b/src/lib.rs
index 10485a8..e57641f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,6 @@
 #![no_std]
+#![cfg_attr(feature = "aeabi_mem_fns", feature(naked_functions))]
+#![allow(unused_mut)]
 #![allow(unused_imports)]
 #![allow(clippy::let_and_return)]
 #![allow(clippy::result_unit_err)]
diff --git a/src/mem.rs b/src/mem.rs
index 9c5dbb7..1be293c 100644
--- a/src/mem.rs
+++ b/src/mem.rs
@@ -54,7 +54,7 @@ pub unsafe extern "C" fn copy_u8_unchecked(
 /// * The regions must not overlap.
 #[cfg_attr(feature = "on_gba", instruction_set(arm::a32))]
 #[cfg_attr(feature = "on_gba", link_section = ".iwram.copy_u32x8_unchecked")]
-pub unsafe fn copy_u32x8_unchecked(
+pub unsafe extern "C" fn copy_u32x8_unchecked(
   dest: *mut [u32; 8], src: *const [u32; 8], count: usize,
 ) {
   on_gba_or_unimplemented!(unsafe {
@@ -136,3 +136,705 @@ pub unsafe extern "C" fn set_u32x80_unchecked(
   )
   });
 }
+
+#[cfg(feature = "aeabi_mem_fns")]
+pub use aeabi_mem_fns::*;
+#[cfg(feature = "aeabi_mem_fns")]
+mod aeabi_mem_fns {
+  //! Module for direct memory operations.
+  //!
+  //! Generally you don't need to call these yourself. Instead, the compiler
+  //! will insert calls to the functions defined here as necessary.
+
+  use core::ffi::c_void;
+
+  /// Byte copy between exclusive regions.
+  ///
+  /// * This will *always* copy one byte at a time, making it suitable for use
+  ///   with SRAM memory.
+  ///
+  /// ## Safety
+  /// * If `byte_count` is zero then the pointers are not used and they can be
+  ///   any value.
+  /// * If `byte_count` is non-zero then:
+  ///   * Both pointers must be valid for the number of bytes given.
+  ///   * The two regions must either be *entirely* disjoint or *entirely*
+  ///     overlapping. Partial overlap is not allowed.
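+  ///
+  /// ## Example
+  /// A minimal sketch of a direct call, using stand-in buffers (the compiler
+  /// normally emits calls to this function on its own):
+  /// ```ignore
+  /// let src = [1_u8, 2, 3];
+  /// let mut dst = [0_u8; 3];
+  /// // SAFETY: the regions are entirely disjoint and valid for 3 bytes.
+  /// unsafe { __aeabi_memcpy1(dst.as_mut_ptr(), src.as_ptr(), 3) };
+  /// ```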
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memcpy1"]
+  pub unsafe extern "C" fn __aeabi_memcpy1(
+    dest: *mut u8, src: *const u8, byte_count: usize,
+  ) {
+    core::arch::asm! {
+      "1:",
+      "subs {count}, {count}, #1",
+      "ldrbge {temp}, [{src}], #1",
+      "strbge {temp}, [{dest}], #1",
+      "bgt 1b",
+      temp = out(reg) _,
+      count = inout(reg) byte_count => _,
+      src = inout(reg) src => _,
+      dest = inout(reg) dest => _,
+      options(nostack)
+    }
+  }
+
+  /// Halfword copy between exclusive regions.
+  ///
+  /// * If the `byte_count` is odd then a single byte copy will happen at the
+  ///   end.
+  ///
+  /// ## Safety
+  /// * If `byte_count` is zero then the pointers are not used and they can be
+  ///   any value.
+  /// * If `byte_count` is non-zero then:
+  ///   * Both pointers must be valid for the span used and aligned to 2.
+  ///   * The two regions must either be *entirely* disjoint or *entirely*
+  ///     overlapping. Partial overlap is not allowed.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memcpy2"]
+  pub unsafe extern "C" fn __aeabi_memcpy2(
+    mut dest: *mut u16, mut src: *const u16, mut byte_count: usize,
+  ) {
+    core::arch::asm! {
+      "1:",
+      "subs {count}, {count}, #2",
+      "ldrhge {temp}, [{src}], #2",
+      "strhge {temp}, [{dest}], #2",
+      "bgt 1b",
+      temp = out(reg) _,
+      count = inout(reg) byte_count,
+      src = inout(reg) src,
+      dest = inout(reg) dest,
+      options(nostack)
+    }
+    if byte_count != 0 {
+      let dest = dest.cast::<u8>();
+      let src = src.cast::<u8>();
+      dest.write_volatile(src.read_volatile());
+    }
+  }
+
+  /// Word copy between exclusive regions.
+  ///
+  /// * If `byte_count` is not a multiple of 4 then a halfword and/or byte copy
+  ///   will happen at the end.
+  ///
+  /// ## Safety
+  /// * If `byte_count` is zero then the pointers are not used and they can be
+  ///   any value.
+  /// * If `byte_count` is non-zero then:
+  ///   * Both pointers must be valid for the span used and aligned to 4.
+  ///   * The two regions must either be *entirely* disjoint or *entirely*
+  ///     overlapping. Partial overlap is not allowed.
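+  ///
+  /// ## Example
+  /// A minimal sketch of a direct call on stand-in word-aligned buffers:
+  /// ```ignore
+  /// let src = [0xAABB_CCDD_u32; 8];
+  /// let mut dst = [0_u32; 8];
+  /// // SAFETY: disjoint regions, both aligned to 4 and valid for 32 bytes.
+  /// unsafe { __aeabi_memcpy4(dst.as_mut_ptr(), src.as_ptr(), 32) };
+  /// ```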
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memcpy4"]
+  pub unsafe extern "C" fn __aeabi_memcpy4(
+    dest: *mut u32, src: *const u32, byte_count: usize,
+  ) {
+    core::arch::asm! {
+      bracer::when!(("r2" >=u "#32") [2] {
+        "push {{r4-r9}}",
+        "1:",
+        "subs r2, r2, #32",
+        "ldmge r1!, {{r3-r9, r12}}",
+        "stmge r0!, {{r3-r9, r12}}",
+        "bgt 1b",
+        "pop {{r4-r9}}",
+        "bxeq lr",
+      }),
+
+      // copy 4 words, two at a time
+      "tst r2, #0b10000",
+      "ldmne r1!, {{r3, r12}}",
+      "stmne r0!, {{r3, r12}}",
+      "ldmne r1!, {{r3, r12}}",
+      "stmne r0!, {{r3, r12}}",
+      "bics r2, r2, #0b10000",
+      "bxeq lr",
+
+      // copy 2 and/or 1 words
+      "lsls r3, r2, #29",
+      "ldmcs r1!, {{r3, r12}}",
+      "stmcs r0!, {{r3, r12}}",
+      "ldrmi r3, [r1], #4",
+      "strmi r3, [r0], #4",
+      "bics r2, r2, #0b1100",
+      "bxeq lr",
+
+      // copy halfword and/or byte
+      "lsls r3, r2, #31",
+      "ldrhcs r3, [r1], #2",
+      "strhcs r3, [r0], #2",
+      "ldrbmi r3, [r1], #1",
+      "strbmi r3, [r0], #1",
+      "bx lr",
+      options(noreturn),
+    }
+  }
+
+  /// Just call [`__aeabi_memcpy4`] instead.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memcpy8"]
+  pub unsafe extern "C" fn __aeabi_memcpy8(
+    dest: *mut u32, src: *const u32, byte_count: usize,
+  ) {
+    __aeabi_memcpy4(dest, src, byte_count);
+  }
+
+  /// Arbitrary-width copy between exclusive regions.
+  ///
+  /// ## Safety
+  /// * If `byte_count` is zero then the pointers are not used and they can be
+  ///   any value.
+  /// * If `byte_count` is non-zero then:
+  ///   * Both pointers must be valid for the span used (no required alignment).
+  ///   * The two regions must either be *entirely* disjoint or *entirely*
+  ///     overlapping. Partial overlap is not allowed.
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memcpy"]
+  pub unsafe extern "C" fn __aeabi_memcpy(
+    dest: *mut u8, src: *const u8, byte_count: usize,
+  ) {
+    core::arch::asm! {
+      "cmp r2, #7", // if count <= (fix+word): just byte copy
+      "ble {__aeabi_memcpy1}",
+
+      // check max coalign
+      "eor r3, r0, r1",
+      "lsls r3, r3, #31",
+      "bmi {__aeabi_memcpy1}",
+      "bcs 2f",
+
+      // max coalign4, possible fixup and jump
+      "lsls r3, r0, #31",
+      "submi r2, r2, #1",
+      "ldrbmi r3, [r1], #1",
+      "strbmi r3, [r0], #1",
+      "subcs r2, r2, #2",
+      "ldrhcs r3, [r1], #2",
+      "strhcs r3, [r0], #2",
+      "b {__aeabi_memcpy4}",
+
+      // max coalign2, possible fixup and jump
+      "2:",
+      "lsls r3, r0, #31",
+      "submi r2, r2, #1",
+      "ldrbmi r3, [r1], #1",
+      "strbmi r3, [r0], #1",
+      "b {__aeabi_memcpy2}",
+
+      __aeabi_memcpy4 = sym __aeabi_memcpy4,
+      __aeabi_memcpy2 = sym __aeabi_memcpy2,
+      __aeabi_memcpy1 = sym __aeabi_memcpy1,
+      options(noreturn)
+    }
+  }
+
+  /// Copy between exclusive regions, prefer [`__aeabi_memcpy`] if possible.
+  ///
+  /// This is the libc version of a memory copy. It's required to return the
+  /// `dest` pointer at the end of the call, which makes it need an extra
+  /// push/pop compared to a direct call to `__aeabi_memcpy`.
+  ///
+  /// * **Returns:** The `dest` pointer.
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.memcpy"]
+  pub unsafe extern "C" fn memcpy(
+    dest: *mut u8, src: *const u8, byte_count: usize,
+  ) -> *mut u8 {
+    // I've seen a standard call to `__aeabi_memcpy` give weird codegen,
+    // so we (currently) do the call manually.
+    core::arch::asm! {
+      "push {{r0, lr}}",
+      "bl {__aeabi_memcpy}",
+      "pop {{r0, lr}}",
+      "bx lr",
+      __aeabi_memcpy = sym __aeabi_memcpy,
+      options(noreturn)
+    }
+  }
+
+  // MOVE
+
+  // used by `__aeabi_memmove` in some cases
+  #[inline]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.reverse_copy_u8"]
+  unsafe extern "C" fn reverse_copy_u8(
+    dest: *mut u8, src: *const u8, byte_count: usize,
+  ) {
+    core::arch::asm! {
+      "1:",
+      "subs {count}, {count}, #1",
+      "ldrbge {temp}, [{src}, #-1]!",
+      "strbge {temp}, [{dest}, #-1]!",
+      "bgt 1b",
+      temp = out(reg) _,
+      count = inout(reg) byte_count => _,
+      src = inout(reg) src => _,
+      dest = inout(reg) dest => _,
+      options(nostack)
+    }
+  }
+
+  // used by `__aeabi_memmove` in some cases
+  #[inline]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.reverse_copy_u16"]
+  unsafe extern "C" fn reverse_copy_u16(
+    mut dest: *mut u16, mut src: *const u16, mut byte_count: usize,
+  ) {
+    core::arch::asm! {
+      "1:",
+      "subs {count}, {count}, #2",
+      "ldrhge {temp}, [{src}, #-2]!",
+      "strhge {temp}, [{dest}, #-2]!",
+      "bgt 1b",
+      temp = out(reg) _,
+      count = inout(reg) byte_count,
+      src = inout(reg) src,
+      dest = inout(reg) dest,
+      options(nostack)
+    }
+    if byte_count != 0 {
+      let dest = dest.cast::<u8>().sub(1);
+      let src = src.cast::<u8>().sub(1);
+      dest.write_volatile(src.read_volatile());
+    }
+  }
+
+  // used by `__aeabi_memmove` in some cases
+  #[naked]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.reverse_copy_u32"]
+  unsafe extern "C" fn reverse_copy_u32(
+    dest: *mut u32, src: *const u32, byte_count: usize,
+  ) {
+    core::arch::asm! {
+      bracer::when!(("r2" >=u "#32") [2] {
+        "push {{r4-r9}}",
+        "1:",
+        "subs r2, r2, #32",
+        "ldmdbcs r1!, {{r3-r9, r12}}",
+        "stmdbcs r0!, {{r3-r9, r12}}",
+        "bgt 1b",
+        "pop {{r4-r9}}",
+        "bxeq lr",
+      }),
+
+      // copy 4 words, two at a time
+      "tst r2, #0b10000",
+      "ldmdbne r1!, {{r3, r12}}",
+      "stmdbne r0!, {{r3, r12}}",
+      "ldmdbne r1!, {{r3, r12}}",
+      "stmdbne r0!, {{r3, r12}}",
+      "bics r2, r2, #0b10000",
+      "bxeq lr",
+
+      // copy 2 and/or 1 words
+      "lsls r3, r2, #29",
+      "ldmdbcs r1!, {{r3, r12}}",
+      "stmdbcs r0!, {{r3, r12}}",
+      "ldrmi r3, [r1, #-4]!",
+      "strmi r3, [r0, #-4]!",
+      "bxeq lr",
+
+      // copy halfword and/or byte
+      "lsls r2, r2, #31",
+      "ldrhcs r3, [r1, #-2]!",
+      "strhcs r3, [r0, #-2]!",
+      "ldrbmi r3, [r1, #-1]!",
+      "strbmi r3, [r0, #-1]!",
+      "bx lr",
+      options(noreturn),
+    }
+  }
+
+  /// Copy between non-exclusive regions, prefer [`__aeabi_memmove`] if
+  /// possible.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memmove4"]
+  pub unsafe extern "C" fn __aeabi_memmove4(
+    dest: *mut u32, src: *const u32, byte_count: usize,
+  ) {
+    __aeabi_memmove(dest.cast(), src.cast(), byte_count)
+  }
+
+  /// Copy between non-exclusive regions, prefer [`__aeabi_memmove`] if
+  /// possible.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memmove8"]
+  pub unsafe extern "C" fn __aeabi_memmove8(
+    dest: *mut u32, src: *const u32, byte_count: usize,
+  ) {
+    __aeabi_memmove(dest.cast(), src.cast(), byte_count)
+  }
+
+  /// Copy between non-exclusive regions.
+  ///
+  /// * The pointers do not have a minimum alignment. The function will
+  ///   automatically detect the best type of copy to perform.
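+  ///
+  /// ## Example
+  /// A minimal sketch of shifting bytes within one stand-in buffer (overlap
+  /// is allowed here, unlike the memcpy family):
+  /// ```ignore
+  /// let mut buf = [1_u8, 2, 3, 4, 5, 0, 0];
+  /// // SAFETY: both regions lie entirely within `buf`.
+  /// unsafe { __aeabi_memmove(buf.as_mut_ptr().add(2), buf.as_ptr(), 5) };
+  /// // buf is now [1, 2, 1, 2, 3, 4, 5]
+  /// ```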
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memmove"]
+  pub unsafe extern "C" fn __aeabi_memmove(
+    dest: *mut u8, src: *const u8, byte_count: usize,
+  ) {
+    core::arch::asm! {
+      // when d > s we need to copy back-to-front
+      bracer::when!(("r0" >=u "r1") [1] {
+        "add r0, r0, r2",
+        "add r1, r1, r2",
+        "eor r3, r0, r1",
+        "lsls r3, r3, #31",
+        "bmi {reverse_copy_u8}",
+        "bcs 2f",
+
+        // max coalign4, possible fixup and jump
+        "lsls r3, r0, #31",
+        "submi r2, r2, #1",
+        "ldrbmi r3, [r1, #-1]!",
+        "strbmi r3, [r0, #-1]!",
+        "subcs r2, r2, #2",
+        "ldrhcs r3, [r1, #-2]!",
+        "strhcs r3, [r0, #-2]!",
+        "b {reverse_copy_u32}",
+
+        // max coalign2, possible fixup and jump
+        "2:",
+        "tst r0, #1",
+        "subne r2, r2, #1",
+        "ldrbne r3, [r1, #-1]!",
+        "strbne r3, [r0, #-1]!",
+        "b {reverse_copy_u16}",
+      }),
+      // forward copy is a normal memcpy
+      "b {__aeabi_memcpy}",
+      __aeabi_memcpy = sym __aeabi_memcpy,
+      reverse_copy_u8 = sym reverse_copy_u8,
+      reverse_copy_u16 = sym reverse_copy_u16,
+      reverse_copy_u32 = sym reverse_copy_u32,
+      options(noreturn),
+    }
+  }
+
+  /// Copy between non-exclusive regions, prefer [`__aeabi_memmove`] if
+  /// possible.
+  ///
+  /// This is the libc version of a memory move. It's required to return the
+  /// `dest` pointer at the end of the call, which makes it need an extra
+  /// push/pop compared to a direct call to `__aeabi_memmove`.
+  ///
+  /// * **Returns:** The `dest` pointer.
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.memmove"]
+  pub unsafe extern "C" fn memmove(
+    dest: *mut u8, src: *const u8, byte_count: usize,
+  ) -> *mut u8 {
+    core::arch::asm! {
+      "push {{r0, lr}}",
+      "bl {__aeabi_memmove}",
+      "pop {{r0, lr}}",
+      "bx lr",
+      __aeabi_memmove = sym __aeabi_memmove,
+      options(noreturn)
+    }
+  }
+
+  // SET
+
+  /// Write a value to all bytes in the region, prefer [`__aeabi_memset`] if
+  /// possible.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memset4"]
+  pub unsafe extern "C" fn __aeabi_memset4(
+    dest: *mut u32, byte_count: usize, byte: i32,
+  ) {
+    __aeabi_memset(dest.cast(), byte_count, byte)
+  }
+
+  /// Write a value to all bytes in the region, prefer [`__aeabi_memset`] if
+  /// possible.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memset8"]
+  pub unsafe extern "C" fn __aeabi_memset8(
+    dest: *mut u32, byte_count: usize, byte: i32,
+  ) {
+    __aeabi_memset(dest.cast(), byte_count, byte)
+  }
+
+  /// Sets all bytes in the region to the `byte` given.
+  ///
+  /// For historical reasons the byte is passed in as an `i32`, but only the
+  /// lowest 8 bits are used.
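+  ///
+  /// ## Example
+  /// A minimal sketch on a stand-in buffer; note that the argument order
+  /// differs from libc `memset` (count before byte):
+  /// ```ignore
+  /// let mut buf = [0_u8; 16];
+  /// // SAFETY: `buf` is valid for 16 bytes.
+  /// unsafe { __aeabi_memset(buf.as_mut_ptr(), buf.len(), 0xFF) };
+  /// ```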
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memset"]
+  pub unsafe extern "C" fn __aeabi_memset(
+    dest: *mut u8, byte_count: usize, byte: i32,
+  ) {
+    core::arch::asm! {
+      bracer::when!(("r1" >=u "#8") [7] {
+        // duplicate the byte across all of r2 and r3
+        "and r2, r2, #0xFF",
+        "orr r2, r2, r2, lsl #8",
+        "orr r2, r2, r2, lsl #16",
+        "mov r3, r2",
+
+        // align the pointer for word ops
+        "tst r0, #0b1",
+        "subne r1, r1, #1",
+        "strbne r2, [r0], #1",
+        "tst r0, #0b10",
+        "subne r1, r1, #2",
+        "strhne r2, [r0], #2",
+
+        bracer::when!(("r1" >=u "#32") [8] {
+          "push {{r4-r9}}",
+          "mov r4, r2",
+          "mov r5, r2",
+          "mov r6, r2",
+          "mov r7, r2",
+          "mov r8, r2",
+          "mov r9, r2",
+          "1:",
+          "subs r1, r1, #32",
+          "stmge r0!, {{r2-r9}}",
+          "bgt 1b",
+          "pop {{r4-r9}}",
+          "bxeq lr",
+        }),
+
+        // set 4 words
+        "tst r1, #0b10000",
+        "stmne r0!, {{r2, r3}}",
+        "stmne r0!, {{r2, r3}}",
+
+        // set 2 and/or 1 words
+        "lsls r12, r1, #29",
+        "stmcs r0!, {{r2, r3}}",
+        "strmi r2, [r0], #4",
+
+        // set halfword and/or byte
+        "lsls r12, r1, #31",
+        "strhcs r2, [r0], #2",
+        "strbmi r2, [r0], #1",
+        "bx lr",
+      }),
+      // byte loop
+      "9:",
+      "subs r1, r1, #1",
+      "strbcs r2, [r0], #1",
+      "bgt 9b",
+      "bx lr",
+      options(noreturn)
+    }
+  }
+
+  /// Write a value to all bytes in the region, prefer [`__aeabi_memset`] if
+  /// possible.
+  ///
+  /// This is the libc version of a memory set. It's required to return the
+  /// `dest` pointer at the end of the call, which makes it need an extra
+  /// push/pop compared to a direct call to `__aeabi_memset`. Also, the
+  /// argument ordering is swapped, so shuffling registers costs a few cycles.
+  ///
+  /// * **Returns:** The `dest` pointer.
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.memset"]
+  pub unsafe extern "C" fn memset(
+    dest: *mut u8, byte: i32, byte_count: usize,
+  ) -> *mut u8 {
+    core::arch::asm! {
+      "push {{r0, lr}}",
+      "mov r3, r2",
+      "mov r2, r1",
+      "mov r1, r3",
+      "bl {__aeabi_memset}",
+      "pop {{r0, lr}}",
+      "bx lr",
+      __aeabi_memset = sym __aeabi_memset,
+      options(noreturn)
+    }
+  }
+
+  // CLEAR
+
+  /// Just call [`__aeabi_memset`] with 0 as the `byte` instead.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memclr4"]
+  pub unsafe extern "C" fn __aeabi_memclr4(dest: *mut u32, byte_count: usize) {
+    __aeabi_memset(dest.cast(), byte_count, 0)
+  }
+
+  /// Just call [`__aeabi_memset`] with 0 as the `byte` instead.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memclr8"]
+  pub unsafe extern "C" fn __aeabi_memclr8(dest: *mut u32, byte_count: usize) {
+    __aeabi_memset(dest.cast(), byte_count, 0)
+  }
+
+  /// Just call [`__aeabi_memset`] with 0 as the `byte` instead.
+  ///
+  /// This function is provided only for API completeness, because in some cases
+  /// the compiler might automatically generate a call to this function.
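+  ///
+  /// ## Example
+  /// A minimal sketch of zeroing a stand-in buffer:
+  /// ```ignore
+  /// let mut buf = [0xFF_u8; 8];
+  /// // SAFETY: `buf` is valid for 8 bytes.
+  /// unsafe { __aeabi_memclr(buf.as_mut_ptr(), buf.len()) };
+  /// assert!(buf.iter().all(|&b| b == 0));
+  /// ```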
+  #[inline]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.__aeabi_memclr"]
+  pub unsafe extern "C" fn __aeabi_memclr(dest: *mut u8, byte_count: usize) {
+    __aeabi_memset(dest, byte_count, 0)
+  }
+
+  /// Reads 4 bytes, starting at the address given.
+  ///
+  /// See [__aeabi_uread4]
+  ///
+  /// [__aeabi_uread4]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access
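+  ///
+  /// ## Example
+  /// A minimal sketch of a little-endian word load from an unaligned,
+  /// stand-in address:
+  /// ```ignore
+  /// let bytes = [0x11_u8, 0x22, 0x33, 0x44, 0x55];
+  /// // Loads bytes[1..5] with no alignment requirement.
+  /// let v = unsafe { __aeabi_uread4(bytes.as_ptr().add(1).cast()) };
+  /// assert_eq!(v, 0x5544_3322);
+  /// ```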
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.aeabi.uread4"]
+  unsafe extern "C" fn __aeabi_uread4(address: *const c_void) -> u32 {
+    core::arch::asm!(
+      "ldrb r2, [r0]",
+      "ldrb r3, [r0, #1]",
+      "orr r2, r2, r3, lsl #8",
+      "ldrb r3, [r0, #2]",
+      "orr r2, r2, r3, lsl #16",
+      "ldrb r3, [r0, #3]",
+      "orr r2, r2, r3, lsl #24",
+      "mov r0, r2",
+      "bx lr",
+      options(noreturn),
+    )
+  }
+
+  /// Writes 4 bytes, starting at the address given.
+  ///
+  /// See [__aeabi_uwrite4]
+  ///
+  /// [__aeabi_uwrite4]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.aeabi.uwrite4"]
+  unsafe extern "C" fn __aeabi_uwrite4(value: u32, address: *mut c_void) {
+    core::arch::asm!(
+      "strb r0, [r1]",
+      "lsr r2, r0, #8",
+      "strb r2, [r1, #1]",
+      "lsr r2, r2, #8",
+      "strb r2, [r1, #2]",
+      "lsr r2, r2, #8",
+      "strb r2, [r1, #3]",
+      "bx lr",
+      options(noreturn),
+    )
+  }
+
+  /// Reads 8 bytes, starting at the address given.
+  ///
+  /// See [__aeabi_uread8]
+  ///
+  /// [__aeabi_uread8]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.aeabi.uread8"]
+  unsafe extern "C" fn __aeabi_uread8(address: *const c_void) -> u64 {
+    core::arch::asm!(
+      "ldrb r1, [r0, #4]",
+      "ldrb r2, [r0, #5]",
+      "orr r1, r1, r2, lsl #8",
+      "ldrb r2, [r0, #6]",
+      "orr r1, r1, r2, lsl #16",
+      "ldrb r2, [r0, #7]",
+      "orr r1, r1, r2, lsl #24",
+      "b {__aeabi_uread4}",
+      __aeabi_uread4 = sym __aeabi_uread4,
+      options(noreturn),
+    )
+  }
+
+  /// Writes 8 bytes, starting at the address given.
+  ///
+  /// See [__aeabi_uwrite8]
+  ///
+  /// [__aeabi_uwrite8]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access
+  #[naked]
+  #[no_mangle]
+  #[instruction_set(arm::a32)]
+  #[link_section = ".iwram.aeabi.uwrite8"]
+  unsafe extern "C" fn __aeabi_uwrite8(value: u64, address: *mut c_void) {
+    core::arch::asm!(
+      "strb r0, [r2]",
+      "lsr r3, r0, #8",
+      "strb r3, [r2, #1]",
+      "lsr r3, r3, #8",
+      "strb r3, [r2, #2]",
+      "lsr r3, r3, #8",
+      "strb r3, [r2, #3]",
+      "strb r1, [r2, #4]",
+      "lsr r3, r1, #8",
+      "strb r3, [r2, #5]",
+      "lsr r3, r3, #8",
+      "strb r3, [r2, #6]",
+      "lsr r3, r3, #8",
+      "strb r3, [r2, #7]",
+      "bx lr",
+      options(noreturn),
+    )
+  }
+}