From 0c80033daf7252bae9906d379a0385b24078deb7 Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Mon, 16 Sep 2024 17:10:34 +0200
Subject: [PATCH 1/2] lowram implementation

---
 lowram/Makefile              |  116 ++++
 lowram/api.h                 |   98 +++
 lowram/config.h              |   27 +
 lowram/fips202.c             |    1 +
 lowram/fips202.h             |    1 +
 lowram/lowram.c              | 1248 ++++++++++++++++++++++++++++++++++
 lowram/lowram.h              |   90 +++
 lowram/ntt.c                 |    1 +
 lowram/ntt.h                 |    1 +
 lowram/packing.c             |    1 +
 lowram/packing.h             |    1 +
 lowram/params.h              |    1 +
 lowram/poly.c                |    1 +
 lowram/poly.h                |    1 +
 lowram/polyvec.c             |    1 +
 lowram/polyvec.h             |    1 +
 lowram/randombytes.c         |    1 +
 lowram/randombytes.h         |    1 +
 lowram/reduce.c              |    1 +
 lowram/reduce.h              |    1 +
 lowram/rounding.c            |    1 +
 lowram/rounding.h            |    1 +
 lowram/sign.c                |  517 ++++++++++++++
 lowram/sign.h                |    1 +
 lowram/smallntt.h            |   36 +
 lowram/smallntt_3329.c       |  180 +++++
 lowram/smallpoly.c           |  100 +++
 lowram/smallpoly.h           |   26 +
 lowram/symmetric-shake.c     |    1 +
 lowram/symmetric.h           |    1 +
 lowram/test/.gitignore       |    1 +
 lowram/test/cpucycles.c      |    1 +
 lowram/test/cpucycles.h      |    1 +
 lowram/test/speed_print.c    |    1 +
 lowram/test/speed_print.h    |    1 +
 lowram/test/test_dilithium.c |    1 +
 lowram/test/test_mul.c       |    1 +
 lowram/test/test_speed.c     |    1 +
 lowram/test/test_vectors.c   |    1 +
 39 files changed, 2467 insertions(+)
 create mode 100644 lowram/Makefile
 create mode 100644 lowram/api.h
 create mode 100644 lowram/config.h
 create mode 120000 lowram/fips202.c
 create mode 120000 lowram/fips202.h
 create mode 100644 lowram/lowram.c
 create mode 100644 lowram/lowram.h
 create mode 120000 lowram/ntt.c
 create mode 120000 lowram/ntt.h
 create mode 120000 lowram/packing.c
 create mode 120000 lowram/packing.h
 create mode 120000 lowram/params.h
 create mode 120000 lowram/poly.c
 create mode 120000 lowram/poly.h
 create mode 120000 lowram/polyvec.c
 create mode 120000 lowram/polyvec.h
 create mode 120000 lowram/randombytes.c
 create mode 120000 lowram/randombytes.h
 create mode 120000 lowram/reduce.c
 create mode 120000 lowram/reduce.h
 create mode 120000 lowram/rounding.c
 create mode 120000 lowram/rounding.h
 create mode 100644 lowram/sign.c
 create mode 120000 lowram/sign.h
 create mode 100644 lowram/smallntt.h
 create mode 100644 lowram/smallntt_3329.c
 create mode 100644 lowram/smallpoly.c
 create mode 100644 lowram/smallpoly.h
 create mode 120000 lowram/symmetric-shake.c
 create mode 120000 lowram/symmetric.h
 create mode 120000 lowram/test/.gitignore
 create mode 120000 lowram/test/cpucycles.c
 create mode 120000 lowram/test/cpucycles.h
 create mode 120000 lowram/test/speed_print.c
 create mode 120000 lowram/test/speed_print.h
 create mode 120000 lowram/test/test_dilithium.c
 create mode 120000 lowram/test/test_mul.c
 create mode 120000 lowram/test/test_speed.c
 create mode 120000 lowram/test/test_vectors.c

diff --git a/lowram/Makefile b/lowram/Makefile
new file mode 100644
index 0000000..1c6bceb
--- /dev/null
+++ b/lowram/Makefile
@@ -0,0 +1,116 @@
+CC ?= /usr/bin/cc
+CFLAGS += -Wall -Wextra -Wpedantic -Wmissing-prototypes -Wredundant-decls \
+  -Wshadow -Wvla -Wpointer-arith -O3 -fomit-frame-pointer
+NISTFLAGS += -Wno-unused-result -O3 -fomit-frame-pointer
+SOURCES = sign.c packing.c polyvec.c poly.c ntt.c reduce.c rounding.c lowram.c smallpoly.c smallntt_3329.c
+HEADERS = config.h params.h api.h sign.h packing.h polyvec.h poly.h ntt.h \
+  reduce.h rounding.h symmetric.h randombytes.h lowram.h smallpoly.h smallntt.h
+KECCAK_SOURCES = $(SOURCES) fips202.c symmetric-shake.c
+KECCAK_HEADERS = $(HEADERS) fips202.h
+
+.PHONY: all speed shared clean
+
+all: \
+  test/test_dilithium2 \
+  test/test_dilithium3 \
+  test/test_dilithium5 \
+  test/test_vectors2 \
+  test/test_vectors3 \
+  test/test_vectors5
+
+speed: \
+  test/test_mul \
+  test/test_speed2 \
+  test/test_speed3 \
+  test/test_speed5 \
+
+shared: \
+  libpqcrystals_dilithium2_lowram.so \
+  libpqcrystals_dilithium3_lowram.so \
+  libpqcrystals_dilithium5_lowram.so \
+  libpqcrystals_fips202_lowram.so \
+
+libpqcrystals_fips202_lowram.so: fips202.c fips202.h
+	$(CC) -shared -fPIC $(CFLAGS) -o $@ $<
+
+libpqcrystals_dilithium2_lowram.so: $(SOURCES) $(HEADERS) symmetric-shake.c
+	$(CC) -shared -fPIC $(CFLAGS) -DDILITHIUM_MODE=2 \
+	  -o $@ $(SOURCES) symmetric-shake.c
+
+libpqcrystals_dilithium3_lowram.so: $(SOURCES) $(HEADERS) symmetric-shake.c
+	$(CC) -shared -fPIC $(CFLAGS) -DDILITHIUM_MODE=3 \
+	  -o $@ $(SOURCES) symmetric-shake.c
+
+libpqcrystals_dilithium5_lowram.so: $(SOURCES) $(HEADERS) symmetric-shake.c
+	$(CC) -shared -fPIC $(CFLAGS) -DDILITHIUM_MODE=5 \
+	  -o $@ $(SOURCES) symmetric-shake.c
+
+test/test_dilithium2: test/test_dilithium.c randombytes.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=2 \
+	  -o $@ $< randombytes.c $(KECCAK_SOURCES)
+
+test/test_dilithium3: test/test_dilithium.c randombytes.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=3 \
+	  -o $@ $< randombytes.c $(KECCAK_SOURCES)
+
+test/test_dilithium5: test/test_dilithium.c randombytes.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=5 \
+	  -o $@ $< randombytes.c $(KECCAK_SOURCES)
+
+test/test_vectors2: test/test_vectors.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=2 \
+	  -o $@ $< $(KECCAK_SOURCES)
+
+test/test_vectors3: test/test_vectors.c $(KECCAK_SOURCES) $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=3 \
+	  -o $@ $< $(KECCAK_SOURCES)
+
+test/test_vectors5: test/test_vectors.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=5 \
+	  -o $@ $< $(KECCAK_SOURCES)
+
+test/test_speed2: test/test_speed.c test/speed_print.c test/speed_print.h \
+  test/cpucycles.c test/cpucycles.h randombytes.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=2 \
+	  -o $@ $< test/speed_print.c test/cpucycles.c randombytes.c \
+	  $(KECCAK_SOURCES)
+
+test/test_speed3: test/test_speed.c test/speed_print.c test/speed_print.h \
+  test/cpucycles.c test/cpucycles.h randombytes.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=3 \
+	  -o $@ $< test/speed_print.c test/cpucycles.c randombytes.c \
+	  $(KECCAK_SOURCES)
+
+test/test_speed5: test/test_speed.c test/speed_print.c test/speed_print.h \
+  test/cpucycles.c test/cpucycles.h randombytes.c $(KECCAK_SOURCES) \
+  $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -DDILITHIUM_MODE=5 \
+	  -o $@ $< test/speed_print.c test/cpucycles.c randombytes.c \
+	  $(KECCAK_SOURCES)
+
+test/test_mul: test/test_mul.c randombytes.c $(KECCAK_SOURCES) $(KECCAK_HEADERS)
+	$(CC) $(CFLAGS) -UDBENCH -o $@ $< randombytes.c $(KECCAK_SOURCES)
+
+clean:
+	rm -f *~ test/*~ *.gcno *.gcda *.lcov
+	rm -f libpqcrystals_dilithium2_lowram.so
+	rm -f libpqcrystals_dilithium3_lowram.so
+	rm -f libpqcrystals_dilithium5_lowram.so
+	rm -f libpqcrystals_fips202_lowram.so
+	rm -f test/test_dilithium2
+	rm -f test/test_dilithium3
+	rm -f test/test_dilithium5
+	rm -f test/test_vectors2
+	rm -f test/test_vectors3
+	rm -f test/test_vectors5
+	rm -f test/test_speed2
+	rm -f test/test_speed3
+	rm -f test/test_speed5
+	rm -f test/test_mul
diff --git a/lowram/api.h b/lowram/api.h
new file mode 100644
index 0000000..c255fd1
--- /dev/null
+++ b/lowram/api.h
@@ -0,0 +1,98 @@
+#ifndef API_H
+#define API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define pqcrystals_dilithium2_PUBLICKEYBYTES 1312
+#define pqcrystals_dilithium2_SECRETKEYBYTES 2560
+#define pqcrystals_dilithium2_BYTES 2420
+
+#define pqcrystals_dilithium2_lowram_PUBLICKEYBYTES pqcrystals_dilithium2_PUBLICKEYBYTES
+#define pqcrystals_dilithium2_lowram_SECRETKEYBYTES pqcrystals_dilithium2_SECRETKEYBYTES
+#define pqcrystals_dilithium2_lowram_BYTES pqcrystals_dilithium2_BYTES
+
+int pqcrystals_dilithium2_lowram_keypair(uint8_t *pk, uint8_t *sk);
+
+int pqcrystals_dilithium2_lowram_signature(uint8_t *sig, size_t *siglen,
+                                        const uint8_t *m, size_t mlen,
+                                        const uint8_t *ctx, size_t ctxlen,
+                                        const uint8_t *sk);
+
+int pqcrystals_dilithium2_lowram(uint8_t *sm, size_t *smlen,
+                              const uint8_t *m, size_t mlen,
+                              const uint8_t *ctx, size_t ctxlen,
+                              const uint8_t *sk);
+
+int pqcrystals_dilithium2_lowram_verify(const uint8_t *sig, size_t siglen,
+                                     const uint8_t *m, size_t mlen,
+                                     const uint8_t *ctx, size_t ctxlen,
+                                     const uint8_t *pk);
+
+int pqcrystals_dilithium2_lowram_open(uint8_t *m, size_t *mlen,
+                                   const uint8_t *sm, size_t smlen,
+                                   const uint8_t *ctx, size_t ctxlen,
+                                   const uint8_t *pk);
+
+#define pqcrystals_dilithium3_PUBLICKEYBYTES 1952
+#define pqcrystals_dilithium3_SECRETKEYBYTES 4032
+#define pqcrystals_dilithium3_BYTES 3309
+
+#define pqcrystals_dilithium3_lowram_PUBLICKEYBYTES pqcrystals_dilithium3_PUBLICKEYBYTES
+#define pqcrystals_dilithium3_lowram_SECRETKEYBYTES pqcrystals_dilithium3_SECRETKEYBYTES
+#define pqcrystals_dilithium3_lowram_BYTES pqcrystals_dilithium3_BYTES
+
+int pqcrystals_dilithium3_lowram_keypair(uint8_t *pk, uint8_t *sk);
+
+int pqcrystals_dilithium3_lowram_signature(uint8_t *sig, size_t *siglen,
+                                        const uint8_t *m, size_t mlen,
+                                        const uint8_t *ctx, size_t ctxlen,
+                                        const uint8_t *sk);
+
+int pqcrystals_dilithium3_lowram(uint8_t *sm, size_t *smlen,
+                              const uint8_t *m, size_t mlen,
+                              const uint8_t *ctx, size_t ctxlen,
+                              const uint8_t *sk);
+
+int pqcrystals_dilithium3_lowram_verify(const uint8_t *sig, size_t siglen,
+                                     const uint8_t *m, size_t mlen,
+                                     const uint8_t *ctx, size_t ctxlen,
+                                     const uint8_t *pk);
+
+int pqcrystals_dilithium3_lowram_open(uint8_t *m, size_t *mlen,
+                                   const uint8_t *sm, size_t smlen,
+                                   const uint8_t *ctx, size_t ctxlen,
+                                   const uint8_t *pk);
+
+#define pqcrystals_dilithium5_PUBLICKEYBYTES 2592
+#define pqcrystals_dilithium5_SECRETKEYBYTES 4896
+#define pqcrystals_dilithium5_BYTES 4627
+
+#define pqcrystals_dilithium5_lowram_PUBLICKEYBYTES pqcrystals_dilithium5_PUBLICKEYBYTES
+#define pqcrystals_dilithium5_lowram_SECRETKEYBYTES pqcrystals_dilithium5_SECRETKEYBYTES
+#define pqcrystals_dilithium5_lowram_BYTES pqcrystals_dilithium5_BYTES
+
+int pqcrystals_dilithium5_lowram_keypair(uint8_t *pk, uint8_t *sk);
+
+int pqcrystals_dilithium5_lowram_signature(uint8_t *sig, size_t *siglen,
+                                        const uint8_t *m, size_t mlen,
+                                        const uint8_t *ctx, size_t ctxlen,
+                                        const uint8_t *sk);
+
+int pqcrystals_dilithium5_lowram(uint8_t *sm, size_t *smlen,
+                              const uint8_t *m, size_t mlen,
+                              const uint8_t *ctx, size_t ctxlen,
+                              const uint8_t *sk);
+
+int pqcrystals_dilithium5_lowram_verify(const uint8_t *sig, size_t siglen,
+                                     const uint8_t *m, size_t mlen,
+                                     const uint8_t *ctx, size_t ctxlen,
+                                     const uint8_t *pk);
+
+int pqcrystals_dilithium5_lowram_open(uint8_t *m, size_t *mlen,
+                                   const uint8_t *sm, size_t smlen,
+                                   const uint8_t *ctx, size_t ctxlen,
+                                   const uint8_t *pk);
+
+
+#endif
diff --git a/lowram/config.h b/lowram/config.h
new file mode 100644
index 0000000..ac0a571
--- /dev/null
+++ b/lowram/config.h
@@ -0,0 +1,27 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+//#define DILITHIUM_MODE 2
+// #define DILITHIUM_RANDOMIZED_SIGNING
+//#define USE_RDPMC
+//#define DBENCH
+
+#ifndef DILITHIUM_MODE
+#define DILITHIUM_MODE 2
+#endif
+
+#if DILITHIUM_MODE == 2
+#define CRYPTO_ALGNAME "Dilithium2"
+#define DILITHIUM_NAMESPACETOP pqcrystals_dilithium2_lowram
+#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium2_lowram_##s
+#elif DILITHIUM_MODE == 3
+#define CRYPTO_ALGNAME "Dilithium3"
+#define DILITHIUM_NAMESPACETOP pqcrystals_dilithium3_lowram
+#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium3_lowram_##s
+#elif DILITHIUM_MODE == 5
+#define CRYPTO_ALGNAME "Dilithium5"
+#define DILITHIUM_NAMESPACETOP pqcrystals_dilithium5_lowram
+#define DILITHIUM_NAMESPACE(s) pqcrystals_dilithium5_lowram_##s
+#endif
+
+#endif
diff --git a/lowram/fips202.c b/lowram/fips202.c
new file mode 120000
index 0000000..da2fa42
--- /dev/null
+++ b/lowram/fips202.c
@@ -0,0 +1 @@
+../ref/fips202.c
\ No newline at end of file
diff --git a/lowram/fips202.h b/lowram/fips202.h
new file mode 120000
index 0000000..c759415
--- /dev/null
+++ b/lowram/fips202.h
@@ -0,0 +1 @@
+../ref/fips202.h
\ No newline at end of file
diff --git a/lowram/lowram.c b/lowram/lowram.c
new file mode 100644
index 0000000..2a3716d
--- /dev/null
+++ b/lowram/lowram.c
@@ -0,0 +1,1248 @@
+#include "lowram.h"
+#include "fips202.h"
+#include "symmetric.h"
+#include "reduce.h"
+#include "rounding.h"
+
+/*
+This file implements functions aiding with the reduction of the memory
+footprint of ML-DSA. 
+
+The ideas are taken from the paper:
+
+Joppe W. Bos, Joost Renes, and Amber Sprenkels. 2022. Dilithium for Memory
+Constrained Devices. In Progress in Cryptology - AFRICACRYPT 2022: 13th
+International Conference on Cryptology in Africa, AFRICACRYPT 2022, Fes,
+Morocco, July 18–20, 2022, Proceedings. Springer-Verlag, Berlin, Heidelberg,
+217–235. https://doi.org/10.1007/978-3-031-17433-9_10
+*/
+
+/*************************************************
+ * Name:        unpack_pk_t1
+ *
+ * Description: Unpack a single polynomial of t1 from the packed public key.
+ *
+ * Arguments:   - poly *t1: output polynomial
+ *              - size_t idx: index of the t1 polynomial to unpack
+ *              - const unsigned char pk[]: bit-packed public key
+ **************************************************/
+void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES])
+{
+  /* t1 starts SEEDBYTES into pk; entries are POLYT1_PACKEDBYTES apart */
+  polyt1_unpack(t1, pk + SEEDBYTES + idx * POLYT1_PACKEDBYTES);
+}
+
+/*************************************************
+* Name:        pack_sig_c
+*
+* Description: Pack only the challenge c into the signature.
+*              c occupies the first CTILDEBYTES bytes of sig.
+*
+* Arguments:   - uint8_t sig[]: output byte array for the bit-packed signature
+*              - const uint8_t c[]: challenge, CTILDEBYTES bytes long
+**************************************************/
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES],
+                const uint8_t c[CTILDEBYTES])
+{
+  unsigned int i;
+
+  for (i = 0; i < CTILDEBYTES; ++i)
+    sig[i] = c[i];
+}
+
+/*************************************************
+* Name:        pack_sig_z
+*
+* Description: Pack only the vector z into the signature.
+*
+* Arguments:   - uint8_t sig[]: output byte array for the bit-packed signature
+*              - const polyvecl *z: vector z with L polynomials
+**************************************************/
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES],
+                const polyvecl *z)
+{
+  /* z is stored right after the CTILDEBYTES challenge bytes */
+  uint8_t *zpacked = sig + CTILDEBYTES;
+  for (unsigned int k = 0; k < L; ++k)
+    polyz_pack(zpacked + k * POLYZ_PACKEDBYTES, &z->vec[k]);
+}
+
+/*************************************************
+* Name:        pack_sig_h
+*
+* Description: Pack the hints of one polynomial of h into the signature.
+*
+* Arguments:   - unsigned char sig[]: output byte array for the bit-packed signature
+*              - const poly *h_elem: hint polynomial to encode
+*              - const unsigned int idx: index of h_elem within the hint vector
+*              - unsigned int *hints_written: running hint count (in/out)
+**************************************************/
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES],
+                const poly *h_elem,
+                const unsigned int idx,
+                unsigned int *hints_written)
+{
+  /* The hint section follows the challenge bytes and the packed z */
+  unsigned char *hints = sig + CTILDEBYTES + L * POLYZ_PACKEDBYTES;
+  unsigned int count = *hints_written;
+
+  /* Append the position of every non-zero coefficient of h_elem */
+  for (unsigned int pos = 0; pos < N; pos++)
+  {
+    if (h_elem->coeffs[pos] == 0)
+      continue;
+    hints[count++] = (uint8_t)pos;
+  }
+  *hints_written = count;
+  hints[OMEGA + idx] = (uint8_t)count;
+}
+
+/*************************************************
+* Name:        pack_sig_h_zero
+*
+* Description: Zero the unused hint slots of the signature.
+*
+* Arguments:   - unsigned char sig[]: output byte array for the bit-packed signature
+*              - unsigned int *hints_written: running hint count (in/out)
+**************************************************/
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES],
+                     unsigned int *hints_written)
+{
+  unsigned char *hints = sig + CTILDEBYTES + L * POLYZ_PACKEDBYTES;
+  unsigned int pos;
+
+  /* Clear every hint slot from the current count up to OMEGA */
+  for (pos = *hints_written; pos < OMEGA; pos++)
+    hints[pos] = 0;
+  *hints_written = pos;
+}
+
+/*************************************************
+ * Name:        unpack_sig_c
+ *
+ * Description: Unpack only c from signature sig = (c, z, h).
+ *
+ * Arguments:   - uint8_t c[]: output buffer for the CTILDEBYTES challenge
+ *                bytes
+ *              - const unsigned char sig[]: byte array containing
+ *                bit-packed signature
+ *
+ * Returns 0; copying the challenge bytes cannot fail.
+ **************************************************/
+int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES])
+{
+  for (size_t i = 0; i < CTILDEBYTES; ++i)
+    c[i] = sig[i];
+  return 0;
+}
+
+/*************************************************
+ * Name:        unpack_sig_z
+ *
+ * Description: Unpack only z from signature sig = (c, z, h).
+ *
+ * Arguments:   - polyvecl *z: pointer to output vector z
+ *              - const unsigned char sig[]: byte array containing
+ *                bit-packed signature
+ *
+ * Returns 0; no validity check is performed on z here.
+ **************************************************/
+int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES])
+{
+  sig += CTILDEBYTES;
+  for (size_t i = 0; i < L; ++i)
+  {
+    polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        unpack_sig_h
+ *
+ * Description: Unpack one polynomial of h from signature sig = (c, z, h).
+ *              The whole hint encoding is validated on every call.
+ *
+ * Arguments:   - poly *h: output hint polynomial with index idx
+ *              - size_t idx: index of the hint polynomial to unpack
+ *              - const unsigned char sig[]: bit-packed signature
+ * Returns 1 in case of malformed signature; otherwise 0.
+ **************************************************/
+int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES])
+{
+  sig += CTILDEBYTES;
+  sig += L * POLYZ_PACKEDBYTES;
+
+  /* Decode h */
+  size_t k = 0;
+  for (size_t i = 0; i < K; ++i)
+  {
+    /* Clear the output once, when the requested polynomial is reached */
+    if (i == idx)
+    {
+      for (size_t j = 0; j < N; ++j)
+        h->coeffs[j] = 0;
+    }
+
+    if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA)
+    {
+      return 1;
+    }
+
+    for (size_t j = k; j < sig[OMEGA + i]; ++j)
+    {
+      /* Coefficients are ordered for strong unforgeability */
+      if (j > k && sig[j] <= sig[j - 1])
+      {
+        return 1;
+      }
+      if (i == idx)
+      {
+        h->coeffs[sig[j]] = 1;
+      }
+    }
+
+    k = sig[OMEGA + i];
+  }
+
+  /* Extra indices are zero for strong unforgeability */
+  for (size_t j = k; j < OMEGA; ++j)
+  {
+    if (sig[j])
+    {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        poly_challenge_compress
+ *
+ * Description: Compress the challenge polynomial into 68 bytes: the indices of
+ *              its non-zero coefficients from c[0] on, sign mask in c[60..67].
+ *
+ * Arguments:   - uint8_t c[]: byte array for holding the compressed challenge
+ *              - const poly *cp: challenge polynomial
+ **************************************************/
+void poly_challenge_compress(uint8_t c[68], const poly *cp)
+{
+  unsigned int i, pos;
+  uint64_t signs;
+  uint64_t mask;
+  /* Clear c, then record index and sign bit of each non-zero coefficient */
+  for (i = 0; i < 68; i++)
+    c[i] = 0;
+  signs = 0;
+  mask = 1;
+  pos = 0;
+  for (i = 0; i < N; ++i)
+  {
+    if (cp->coeffs[i] != 0)
+    {
+      c[pos++] = i;
+      if (cp->coeffs[i] == -1)
+      {
+        signs |= mask;
+      }
+      mask <<= 1;
+    }
+  }
+
+  for (i = 0; i < 8; ++i)
+  {
+    c[60 + i] = (unsigned char)(signs >> 8 * i);
+  }
+}
+
+/*************************************************
+ * Name:        poly_challenge_decompress
+ *
+ * Description: Rebuild the challenge polynomial from its compressed form.
+ *
+ * Arguments:   - poly *cp: output challenge polynomial
+ *              - const uint8_t c[]: compressed challenge (TAU coefficient
+ *                indices from c[0] on, sign mask in c[60..67])
+ **************************************************/
+void poly_challenge_decompress(poly *cp, const uint8_t c[68])
+{
+  unsigned int k;
+  uint64_t sign_bits = 0;
+
+  /* Start from the all-zero polynomial */
+  for (k = 0; k < N; k++)
+  {
+    cp->coeffs[k] = 0;
+  }
+
+  /* Reassemble the little-endian 64-bit sign mask stored in c[60..67] */
+  for (k = 0; k < 8; k++)
+  {
+    sign_bits |= ((uint64_t)c[60 + k]) << (8 * k);
+  }
+
+  /* The k-th stored index gets -1 if sign bit k is set, +1 otherwise */
+  for (k = 0; k < TAU; k++)
+  {
+    const unsigned int pos = c[k];
+
+    cp->coeffs[pos] = (sign_bits & 1) ? -1 : 1;
+    sign_bits >>= 1;
+  }
+}
+
+/*************************************************
+ * Name:        polyt0_unpack_idx
+ *
+ * Description: Unpack the single t0 coefficient at a given index.
+ *
+ * Arguments:   - const uint8_t *t0: packed t0 polynomial
+ *              - unsigned idx: index of the coefficient to extract
+ * Returns the unpacked coefficient.
+ **************************************************/
+static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx)
+{
+  int32_t coeff;
+
+  // Each group of 8 coefficients occupies 13 bytes (13 bits per coefficient);
+  // move to the group holding idx, then select by position within the group.
+  t0 += 13 * (idx >> 3);
+
+  switch (idx % 8)
+  {
+  case 0:
+    coeff = t0[0];
+    coeff |= (uint32_t)t0[1] << 8;
+    break;
+  case 1:
+    coeff = t0[1] >> 5;
+    coeff |= (uint32_t)t0[2] << 3;
+    coeff |= (uint32_t)t0[3] << 11;
+    break;
+  case 2:
+    coeff = t0[3] >> 2;
+    coeff |= (uint32_t)t0[4] << 6;
+    break;
+  case 3:
+    coeff = t0[4] >> 7;
+    coeff |= (uint32_t)t0[5] << 1;
+    coeff |= (uint32_t)t0[6] << 9;
+    break;
+  case 4:
+    coeff = t0[6] >> 4;
+    coeff |= (uint32_t)t0[7] << 4;
+    coeff |= (uint32_t)t0[8] << 12;
+    break;
+  case 5:
+    coeff = t0[8] >> 1;
+    coeff |= (uint32_t)t0[9] << 7;
+    break;
+  case 6:
+    coeff = t0[9] >> 6;
+    coeff |= (uint32_t)t0[10] << 2;
+    coeff |= (uint32_t)t0[11] << 10;
+    break;
+  default: // idx % 8 == 7
+    coeff = t0[11] >> 3;
+    coeff |= (uint32_t)t0[12] << 5;
+    break;
+  }
+
+  // Keep the 13 payload bits; the stored value v encodes the
+  // coefficient 2^(D-1) - v.
+  coeff &= 0x1FFF;
+  return (1 << (D - 1)) - coeff;
+}
+
+/*************************************************
+ * Name:        polyt1_unpack_idx
+ *
+ * Description: Unpack the single t1 coefficient at a given index.
+ *
+ * Arguments:   - const uint8_t *t1: packed t1 polynomial
+ *              - unsigned idx: index of the coefficient to extract
+ * Returns the unpacked 10-bit coefficient.
+ **************************************************/
+static inline int32_t polyt1_unpack_idx(const uint8_t *t1, unsigned idx)
+{
+  int32_t coeff;
+
+  // Each group of 4 coefficients occupies 5 bytes (10 bits per coefficient).
+  t1 += 5 * (idx >> 2);
+
+  switch (idx % 4)
+  {
+  case 0:
+    coeff = (t1[0] >> 0);
+    coeff |= ((uint32_t)t1[1] << 8);
+    break;
+  case 1:
+    coeff = (t1[1] >> 2);
+    coeff |= ((uint32_t)t1[2] << 6);
+    break;
+  case 2:
+    coeff = (t1[2] >> 4);
+    coeff |= ((uint32_t)t1[3] << 4);
+    break;
+  default: // idx % 4 == 3
+    coeff = (t1[3] >> 6);
+    coeff |= ((uint32_t)t1[4] << 2);
+    break;
+  }
+
+  return coeff & 0x3FF;
+}
+
+/*************************************************
+ * Name:        poly_schoolbook
+ *
+ * Description: Schoolbook multiplication of the compressed challenge with
+ *              packed t0; t0 coefficients are unpacked on the fly.
+ *
+ * Arguments:   - poly *c: output polynomial
+ *              - const uint8_t ccomp[]: first input, compressed challenge
+ *              - const uint8_t *t0: second input, packed t0
+ **************************************************/
+void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0)
+{
+  unsigned int k, pos, shift;
+  uint64_t sign_bits = 0;
+
+  /* Accumulate into a zeroed output */
+  for (k = 0; k < N; k++)
+  {
+    c->coeffs[k] = 0;
+  }
+
+  /* Little-endian 64-bit sign mask stored in ccomp[60..67] */
+  for (k = 0; k < 8; k++)
+  {
+    sign_bits |= ((uint64_t)ccomp[60 + k]) << (8 * k);
+  }
+
+  /* One pass per non-zero challenge coefficient at position shift */
+  for (pos = 0; pos < TAU; pos++)
+  {
+    const int32_t sign = (sign_bits & 1) ? -1 : 1;
+
+    shift = ccomp[pos];
+    for (k = 0; k < N; k++)
+    {
+      const int32_t term = sign * polyt0_unpack_idx(t0, k);
+
+      /* Products wrapping past index N - 1 are subtracted instead */
+      if (shift + k < N)
+      {
+        c->coeffs[shift + k] += term;
+      }
+      else
+      {
+        c->coeffs[shift + k - N] -= term;
+      }
+    }
+    sign_bits >>= 1;
+  }
+}
+
+/*************************************************
+ * Name:        poly_schoolbook_t1
+ *
+ * Description: Schoolbook multiplication of the compressed challenge with
+ *              packed t1; every t1 coefficient is shifted left by D bits.
+ *
+ * Arguments:   - poly *c: output polynomial
+ *              - const uint8_t ccomp[]: first input, compressed challenge
+ *              - const uint8_t *t1: second input, packed t1
+ **************************************************/
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1)
+{
+  unsigned int k, pos, shift;
+  uint64_t sign_bits = 0;
+
+  /* Accumulate into a zeroed output */
+  for (k = 0; k < N; k++)
+  {
+    c->coeffs[k] = 0;
+  }
+
+  /* Little-endian 64-bit sign mask stored in ccomp[60..67] */
+  for (k = 0; k < 8; k++)
+  {
+    sign_bits |= ((uint64_t)ccomp[60 + k]) << (8 * k);
+  }
+
+  /* One pass per non-zero challenge coefficient at position shift */
+  for (pos = 0; pos < TAU; pos++)
+  {
+    const int32_t sign = (sign_bits & 1) ? -1 : 1;
+
+    shift = ccomp[pos];
+    for (k = 0; k < N; k++)
+    {
+      const int32_t term = sign * (polyt1_unpack_idx(t1, k) << D);
+
+      /* Products wrapping past index N - 1 are subtracted instead */
+      if (shift + k < N)
+      {
+        c->coeffs[shift + k] += term;
+      }
+      else
+      {
+        c->coeffs[shift + k - N] -= term;
+      }
+    }
+    sign_bits >>= 1;
+  }
+}
+
+/*************************************************
+ * Name:        polyw_pack
+ *
+ * Description: Pack polynomial w using 3 bytes per coefficient.
+ *
+ * Arguments:   - uint8_t buf[]: buffer to hold compressed w (3 * N bytes written)
+ *              - poly *w: input polynomial; first handed to poly_reduce and
+ *                poly_caddq (non-const, presumably normalised in place)
+ **************************************************/
+void polyw_pack(uint8_t buf[K * 768], poly *w)
+{
+  poly_reduce(w);
+  poly_caddq(w);
+  unsigned int i;
+  for (i = 0; i < N; i++)
+  {
+    buf[i * 3 + 0] = w->coeffs[i];
+    buf[i * 3 + 1] = w->coeffs[i] >> 8;
+    buf[i * 3 + 2] = w->coeffs[i] >> 16;
+  }
+}
+
+/*************************************************
+ * Name:        polyw_unpack
+ *
+ * Description: Unpack polynomial w from 3 bytes per coefficient.
+ *
+ * Arguments:   - poly *w: output polynomial
+ *              - const uint8_t buf[]: buffer holding compressed w
+ *
+ **************************************************/
+void polyw_unpack(poly *w, const uint8_t buf[K * 768])
+{
+  for (size_t k = 0; k < N; k++)
+  {
+    const uint8_t *src = buf + 3 * k;
+
+    /* little-endian 24-bit value */
+    w->coeffs[k] = src[0] | ((int32_t)src[1] << 8) | ((int32_t)src[2] << 16);
+  }
+}
+
+/*************************************************
+ * Name:        polyw_add_idx
+ *
+ * Description: Add an integer to one coefficient of a compressed polynomial.
+ *
+ * Arguments:   - uint8_t buf[]: buffer holding compressed polynomial coefficients
+ *              - int32_t a: integer to add to the coefficient
+ *              - size_t i: index of the coefficient to modify
+ *
+ **************************************************/
+static void polyw_add_idx(uint8_t buf[K * 768], int32_t a, size_t i)
+{
+  uint8_t *slot = buf + 3 * i;
+  int32_t value;
+
+  /* Load the little-endian 24-bit coefficient */
+  value = slot[0] | ((int32_t)slot[1] << 8) | ((int32_t)slot[2] << 16);
+
+  /* Add, then pass the sum through freeze() before storing it back */
+  value = freeze(value + a);
+
+  slot[0] = value;
+  slot[1] = value >> 8;
+  slot[2] = value >> 16;
+}
+
+/*************************************************
+ * Name:        polyw_sub
+ *
+ * Description: Compute c = w - a coefficient-wise, where w is read from a
+ *              compressed (3 bytes per coefficient) buffer.
+ *
+ * Arguments:   - poly *c: output polynomial to store the result
+ *              - uint8_t buf[]: buffer holding the compressed coefficients
+ *                of w (minuend)
+ *              - poly *a: polynomial whose coefficients are subtracted
+ *
+ **************************************************/
+void polyw_sub(poly *c, uint8_t buf[3 * 256], poly *a)
+{
+  for (size_t k = 0; k < N; k++)
+  {
+    const uint8_t *src = buf + 3 * k;
+    int32_t packed;
+
+    packed = src[0];
+    packed |= (int32_t)src[1] << 8;
+    packed |= (int32_t)src[2] << 16;
+    c->coeffs[k] = packed - a->coeffs[k];
+  }
+}
+
+/*************************************************
+ * Name:        highbits
+ *
+ * Description: Compute the high bits of an integer. The rounding constants
+ *              are selected at compile time from GAMMA2.
+ *
+ * Arguments:   - int32_t a: input integer whose high bits are to be computed
+ * Returns the high bits of the input as the result.
+ **************************************************/
+static int32_t highbits(int32_t a)
+{
+  int32_t a1;
+
+  a1 = (a + 127) >> 7;
+#if GAMMA2 == (Q - 1) / 32
+  a1 = (a1 * 1025 + (1 << 21)) >> 22;
+  a1 &= 15;
+#elif GAMMA2 == (Q - 1) / 88
+  a1 = (a1 * 11275 + (1 << 23)) >> 24;
+  a1 ^= ((43 - a1) >> 31) & a1;
+#endif
+
+  return a1;
+}
+
+/*************************************************
+ * Name:        poly_highbits
+ *
+ * Description: Compute the high bits of each coefficient in a polynomial.
+ *
+ * Arguments:   - poly *a1: output polynomial receiving highbits() of each
+ *                coefficient
+ *              - const poly *a: input polynomial
+ *
+ **************************************************/
+void poly_highbits(poly *a1, const poly *a)
+{
+  for (size_t k = 0; k < N; ++k)
+  {
+    a1->coeffs[k] = highbits(a->coeffs[k]);
+  }
+}
+
+/*************************************************
+ * Name:        lowbits
+ *
+ * Description: Compute the low bits of an integer, i.e. what remains of a
+ *              after subtracting highbits(a) * 2 * GAMMA2, followed by a
+ *              conditional subtraction of Q.
+ *
+ * Arguments:   - int32_t a: input integer whose low bits are to be computed
+ *
+ * Returns the low bits of the input as the result.
+ **************************************************/
+static int32_t lowbits(int32_t a)
+{
+  int32_t a1;
+  int32_t a0;
+
+  /* Reuse highbits() instead of repeating its rounding arithmetic */
+  a1 = highbits(a);
+
+  /* Remainder after removing the high part */
+  a0 = a - a1 * 2 * GAMMA2;
+
+  /* Subtract Q when a0 exceeds (Q - 1) / 2; relies on >> of a negative
+   * difference yielding an all-ones mask (arithmetic shift) */
+  a0 -= (((Q - 1) / 2 - a0) >> 31) & Q;
+  return a0;
+}
+
+/*************************************************
+ * Name:        poly_lowbits
+ *
+ * Description: Compute the low bits of each coefficient in a polynomial.
+ *
+ * Arguments:   - poly *a0: output polynomial to store the low bits of the coefficients
+ *              - const poly *a: input polynomial whose coefficients' low bits
+                  are to be computed
+ *
+ **************************************************/
+void poly_lowbits(poly *a0, const poly *a)
+{
+  /* Coefficient-wise map: output i depends only on input i. */
+  for (unsigned int idx = 0; idx < N; ++idx) {
+    a0->coeffs[idx] = lowbits(a->coeffs[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        unpack_sk_s1
+ *
+ * Description: Decode a single polynomial of s1 from the packed secret key.
+ *
+ * Arguments:   - smallpoly *a: receives the decoded polynomial
+ *              - const uint8_t *sk: packed secret key
+ *              - size_t idx: which polynomial of s1 to decode
+ **************************************************/
+void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx)
+{
+  const uint8_t *s1 = sk + 2 * SEEDBYTES + TRBYTES; /* s1 follows rho, key, tr */
+  small_polyeta_unpack(a, s1 + idx * POLYETA_PACKEDBYTES);
+}
+
+/*************************************************
+ * Name:        unpack_sk_s2
+ *
+ * Description: Decode a single polynomial of s2 from the packed secret key.
+ *
+ * Arguments:   - smallpoly *a: receives the decoded polynomial
+ *              - const uint8_t *sk: packed secret key
+ *              - size_t idx: which polynomial of s2 to decode
+ **************************************************/
+void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx)
+{
+  const uint8_t *s2 = sk + 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES; /* skip rho, key, tr, s1 */
+  small_polyeta_unpack(a, s2 + idx * POLYETA_PACKEDBYTES);
+}
+
+/* Note: Buffer size can potentially be increased */
+#define POLY_UNIFORM_BUFFERSIZE 3
+/*************************************************
+ * Name:        poly_uniform_pointwise_montgomery_polywadd_lowram
+ *
+ * Description: Generate a uniform polynomial using a seed and nonce, 
+ *              perform pointwise multiplication with another polynomial, 
+ *              and add the result to a compressed polynomial buffer.
+ *
+ * Arguments:   - uint8_t wcomp[]: buffer to store the compressed polynomial
+ *                coefficients
+ *              - poly *b: input polynomial for pointwise multiplication
+ *              - const uint8_t seed[]: seed for SHAKE128
+ *              - uint16_t nonce: nonce for SHAKE128
+ *              - keccak_state *state: state for the SHAKE128
+ *
+ **************************************************/
+void poly_uniform_pointwise_montgomery_polywadd_lowram(uint8_t wcomp[3 * N], poly *b, const uint8_t seed[SEEDBYTES], uint16_t nonce, keccak_state *state)
+{
+  uint8_t buf[POLY_UNIFORM_BUFFERSIZE * 3];
+  size_t accepted = 0;
+
+  stream128_init(state, seed, nonce);
+  /* Rejection sampling: keep squeezing until N candidates below Q were accepted. */
+  do
+  {
+    shake128_squeeze(buf, sizeof buf, state);
+
+    /* Every candidate is a 23-bit little-endian value taken from three bytes. */
+    for (size_t off = 0; off < sizeof buf && accepted < N; off += 3)
+    {
+      int32_t cand = buf[off];
+      cand |= (uint32_t)buf[off + 1] << 8;
+      cand |= (uint32_t)buf[off + 2] << 16;
+      cand &= 0x7FFFFF;
+
+      if (cand >= Q)
+      {
+        continue; /* rejected sample */
+      }
+      /* Multiply the accepted value with b and fold the product into wcomp. */
+      polyw_add_idx(wcomp, montgomery_reduce((int64_t)cand * b->coeffs[accepted]), accepted);
+      accepted++;
+    }
+  } while (accepted < N);
+}
+
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE 1
+#if GAMMA1 == (1 << 17)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE * 4)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES (POLY_UNIFORM_GAMMA1_BUFFERSIZE * 9)
+#elif GAMMA1 == (1 << 19)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS (POLY_UNIFORM_GAMMA1_BUFFERSIZE * 2)
+#define POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES (POLY_UNIFORM_GAMMA1_BUFFERSIZE * 5)
+#endif
+
+/*************************************************
+ * Name:        polyz_unpack_inplace
+ *
+ * Description: Unpack a compressed polynomial z in place.
+ *
+ * Arguments:   - int32_t *r: pointer to the array where the unpacked polynomial
+ *                            coefficients will be stored which is also used as
+ *                            the input
+ *
+ **************************************************/
+static void polyz_unpack_inplace(int32_t *r)
+{
+  uint8_t *a = (uint8_t *)r; /* byte view of r: the packed input lives in the output buffer */
+
+  unsigned int i, j;
+#if GAMMA1 == (1 << 17)
+  for (j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j)
+  {
+    i = POLY_UNIFORM_GAMMA1_BUFFERSIZE - 1 - j; /* process the groups from last to first */
+    int32_t t0;
+    /* 9 packed bytes hold four 18-bit values; the highest output slot is written first */
+    r[4 * i + 3] = a[9 * i + 6] >> 6;
+    r[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2;
+    r[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10;
+    r[4 * i + 3] &= 0x3FFFF;
+
+    r[4 * i + 2] = a[9 * i + 4] >> 4;
+    r[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4;
+    r[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12;
+    r[4 * i + 2] &= 0x3FFFF;
+
+    r[4 * i + 1] = (uint32_t)a[9 * i + 4] << 14;
+    r[4 * i + 1] |= a[9 * i + 2] >> 2;
+    r[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6;
+    r[4 * i + 1] &= 0x3FFFF;
+
+    t0 = a[9 * i + 0]; /* temporary: presumably because r[4 * i] overlaps bytes still being read */
+    t0 |= (uint32_t)a[9 * i + 1] << 8;
+    t0 |= (uint32_t)a[9 * i + 2] << 16;
+    t0 &= 0x3FFFF;
+
+    r[4 * i + 0] = GAMMA1 - t0; /* each unpacked value v is stored as GAMMA1 - v */
+    r[4 * i + 1] = GAMMA1 - r[4 * i + 1];
+    r[4 * i + 2] = GAMMA1 - r[4 * i + 2];
+    r[4 * i + 3] = GAMMA1 - r[4 * i + 3];
+  }
+#elif GAMMA1 == (1 << 19)
+  for (j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE; ++j)
+  {
+    i = POLY_UNIFORM_GAMMA1_BUFFERSIZE - 1 - j; /* process the groups from last to first */
+    int32_t tmp0, tmp1;
+    /* 5 packed bytes hold two 20-bit values; both are read before r is written */
+    tmp0 = a[5 * i + 2] >> 4;
+    tmp0 |= (uint32_t)a[5 * i + 3] << 4;
+    tmp0 |= (uint32_t)a[5 * i + 4] << 12;
+    tmp0 &= 0xFFFFF;
+
+    tmp1 = a[5 * i + 0];
+    tmp1 |= (uint32_t)a[5 * i + 1] << 8;
+    tmp1 |= (uint32_t)a[5 * i + 2] << 16;
+    tmp1 &= 0xFFFFF;
+
+    r[2 * i + 0] = GAMMA1 - tmp1; /* each unpacked value v is stored as GAMMA1 - v */
+    r[2 * i + 1] = GAMMA1 - tmp0;
+  }
+#endif
+}
+
+/*************************************************
+ * Name:        poly_uniform_gamma1_lowram
+ *
+ * Description: Generate a uniform polynomial with coefficients in the range [-(GAMMA1 - 1), GAMMA1].
+ *
+ * Arguments:   - poly *a: output polynomial to store the generated coefficients
+ *              - const uint8_t seed[]: seed for SHAKE256
+ *              - uint16_t nonce: nonce for SHAKE256
+ *              - keccak_state *state: state for SHAKE256
+ *
+ **************************************************/
+void poly_uniform_gamma1_lowram(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, keccak_state *state)
+{
+  int32_t chunk[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  /* Squeeze, unpack and store one small chunk of coefficients at a time. */
+  for (size_t blk = 0; blk < N / POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; blk++)
+  {
+    const size_t base = blk * POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS;
+    shake256_squeeze((uint8_t *)chunk, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    polyz_unpack_inplace(chunk);
+
+    for (size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++)
+      a->coeffs[base + j] = chunk[j];
+  }
+}
+
+/*************************************************
+ * Name:        poly_uniform_gamma1_add_lowram
+ *
+ * Description: Generate a uniform polynomial with coefficients in the range [-(GAMMA1 - 1), GAMMA1],
+ *              and add it to another polynomial.
+ *
+ * Arguments:   - poly *a: output polynomial to store the result
+ *              - poly *b: input polynomial whose coefficients are to be added
+ *              - const uint8_t seed[]: seed for SHAKE256
+ *              - uint16_t nonce: nonce for SHAKE256
+ *              - keccak_state *state: state for SHAKE256
+ *
+ **************************************************/
+void poly_uniform_gamma1_add_lowram(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, keccak_state *state)
+{
+  int32_t chunk[POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS];
+
+  stream256_init(state, seed, nonce);
+  /* Squeeze and unpack one small chunk at a time, adding b while storing. */
+  for (size_t blk = 0; blk < N / POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; blk++)
+  {
+    const size_t base = blk * POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS;
+    shake256_squeeze((uint8_t *)chunk, POLY_UNIFORM_GAMMA1_BUFFERSIZE_BYTES, state);
+    polyz_unpack_inplace(chunk);
+
+    for (size_t j = 0; j < POLY_UNIFORM_GAMMA1_BUFFERSIZE_COEFFS; j++)
+      a->coeffs[base + j] = chunk[j] + b->coeffs[base + j];
+  }
+}
+
+/*************************************************
+* Name:        make_hint_lowram
+*
+* Description: Compute hint bit indicating whether adding z to r changes
+*              the high bits of r.
+*
+* Arguments:   - int32_t z: value that is added to r
+*              - int32_t r: input element
+*
+* Returns 1 if highbits(r) differs from highbits(r + z), otherwise 0.
+**************************************************/
+static inline int32_t make_hint_lowram(int32_t z, int32_t r)
+{
+  /* High bits before and after the addition of z. */
+  const int32_t before = highbits(r);
+  const int32_t after = highbits(r + z);
+
+  /*
+   * The comparison already evaluates to 0 or 1, which is the hint bit.
+   */
+  return before != after;
+}
+
+/*************************************************
+ * Name:        poly_make_hint_lowram
+ *
+ * Description: Generate hint polynomial.
+ *
+ * Arguments:   - poly *a: output polynomial to store the generated hints
+ *              - poly *t: input polynomial
+ *              - uint8_t w[]: buffer holding compressed polynomial coefficients
+ *
+ * Returns the number of hints generated.
+ **************************************************/
+size_t poly_make_hint_lowram(poly *a, poly *t, uint8_t w[768])
+{
+  size_t hints_n = 0;
+
+  for (size_t i = 0; i < N; i++)
+  {
+    const uint8_t *packed = &w[3 * i];
+
+    /* w holds w - cs2 as 24-bit little-endian coefficients */
+    int32_t coeff = packed[0];
+    coeff |= (int32_t)packed[1] << 8;
+    coeff |= (int32_t)packed[2] << 16;
+
+    /* form w - cs2 + c*t0, then test whether removing c*t0 again flips the high bits */
+    coeff += t->coeffs[i];
+    a->coeffs[i] = make_hint_lowram(-t->coeffs[i], coeff);
+    /* count the hints that were set */
+    if (a->coeffs[i] == 1)
+      hints_n++;
+  }
+  return hints_n;
+}
+
+/*************************************************
+ * Name:        unpack_sig_h_indices
+ *
+ * Description: Unpack the hint positions of one polynomial of h from
+ *              signature sig = (c, z, h).
+ *
+ * Arguments:   - uint8_t h_i[]: output array receiving the hint positions
+ *              - unsigned int *number_of_hints: output count of positions
+ *                written to h_i
+ *              - unsigned int idx: index of the polynomial of h to unpack
+ *              - const unsigned char sig[]: byte array containing
+ *                bit-packed signature
+ *
+ * Returns 1 in case of malformed signature; otherwise 0.
+ **************************************************/
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int *number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES])
+{
+  /* h is stored behind the challenge hash and z */
+  const unsigned char *h = sig + CTILDEBYTES + L * POLYZ_PACKEDBYTES;
+  /* h[OMEGA + i] is the running end offset of the positions of polynomial i */
+  const unsigned int first = (idx > 0) ? h[OMEGA + (idx - 1)] : 0;
+  const unsigned int last = h[OMEGA + idx];
+  unsigned int count = 0;
+
+  if (last < first || last > OMEGA)
+  {
+    return 1;
+  }
+
+  for (unsigned int j = first; j < last; ++j)
+  {
+    /* Coefficients are ordered for strong unforgeability */
+    if (j > first && h[j] <= h[j - 1])
+    {
+      return 1;
+    }
+    h_i[count++] = h[j];
+  }
+
+  *number_of_hints = count;
+
+  /* Extra indices are zero for strong unforgeability */
+  for (unsigned int j = h[OMEGA + (K - 1)]; j < OMEGA; ++j)
+  {
+    if (h[j])
+    {
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+/*************************************************
+ * Name:        poly_use_hint_lowram
+ *
+ * Description: Use a list of hint positions to correct the high bits of a
+ *              polynomial.
+ *
+ * Arguments:   - poly *b: pointer to output polynomial with corrected high bits
+ *              - const poly *a: pointer to input polynomial
+ *              - uint8_t h_i[]: coefficient positions at which the hint
+ *                bit is set
+ *              - unsigned int number_of_hints: number of valid entries in h_i
+ **************************************************/
+void poly_use_hint_lowram(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints)
+{
+  for (unsigned int i = 0; i < N; ++i)
+  {
+    /* Linear scan: is coefficient i one of the listed positions? */
+    unsigned int hint = 0;
+
+    for (size_t hidx = 0; hidx < number_of_hints; hidx++)
+    {
+      if (h_i[hidx] == i)
+      {
+        hint = 1;
+        break;
+      }
+    }
+
+    /*
+     * hint is exactly 0 or 1 at this point, so it can be handed to
+     * use_hint directly instead of branching on it.
+     */
+    b->coeffs[i] = use_hint(a->coeffs[i], hint);
+  }
+}
+
+/*************************************************
+ * Name:        pack_pk_rho
+ *
+ * Description: Bit-pack only rho in public key pk = (rho, t1).
+ *
+ * Arguments:   - unsigned char pk[]: output byte array
+ *              - const unsigned char rho[]: byte array containing rho
+ **************************************************/
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES])
+{
+  size_t pos;
+  /* rho is stored verbatim in the first SEEDBYTES bytes of pk */
+  for (pos = 0; pos != SEEDBYTES; pos++)
+    pk[pos] = rho[pos];
+}
+
+/*************************************************
+ * Name:        pack_pk_t1
+ *
+ * Description: Bit-pack only the t1 elem at idx in public key pk = (rho, t1).
+ *
+ * Arguments:   - unsigned char pk[]: output byte array
+ *              - const polyveck *t1: pointer to vector t1
+ *              - const unsigned int idx: index to the elem to pack
+ **************************************************/
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                const poly *t1,
+                const unsigned int idx)
+{
+  /* t1 entries are stored after rho, POLYT1_PACKEDBYTES bytes each */
+  polyt1_pack(pk + SEEDBYTES + idx * POLYT1_PACKEDBYTES, t1);
+}
+
+/*************************************************
+ * Name:        pack_sk_s1
+ *
+ * Description: Bit-pack a single element of s1 into secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const poly *s1_elem: pointer to the element of s1 to pack
+ *              - const unsigned int idx: index of that element within s1;
+ *                selects the slot in sk that is written
+ **************************************************/
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx)
+{
+  /* the s1 region starts right after rho, key and tr */
+  polyeta_pack(sk + 2 * SEEDBYTES + TRBYTES + idx * POLYETA_PACKEDBYTES, s1_elem);
+}
+
+/*************************************************
+ * Name:        pack_sk_s2
+ *
+ * Description: Bit-pack a single element of s2 into secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const poly *s2_elem: pointer to the element of s2 to pack
+ *              - const unsigned int idx: index of that element within s2;
+ *                selects the slot in sk that is written
+ **************************************************/
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx)
+{
+  /* the s2 region follows rho, key, tr and the L packed elements of s1 */
+  polyeta_pack(sk + 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + idx * POLYETA_PACKEDBYTES, s2_elem);
+}
+
+/*************************************************
+ * Name:        pack_sk_t0
+ *
+ * Description: Bit-pack a single element of t0 into secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const poly *t0_elem: pointer to the element of t0 to pack
+ *              - const unsigned int idx: index of that element within t0;
+ *                selects the slot in sk that is written
+ **************************************************/
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx)
+{
+  /* the t0 region follows rho, key, tr and the packed s1 (L) and s2 (K) elements */
+  polyt0_pack(sk + 2 * SEEDBYTES + TRBYTES + L * POLYETA_PACKEDBYTES + K * POLYETA_PACKEDBYTES + idx * POLYT0_PACKEDBYTES, t0_elem);
+}
+
+/*************************************************
+ * Name:        pack_sk_rho
+ *
+ * Description: Bit-pack only rho in secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const unsigned char rho[]: byte array containing rho
+ **************************************************/
+void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char rho[SEEDBYTES])
+{
+  size_t pos;
+  /* rho is stored verbatim in the first SEEDBYTES bytes of sk */
+  for (pos = 0; pos != SEEDBYTES; pos++)
+    sk[pos] = rho[pos];
+}
+
+/*************************************************
+ * Name:        pack_sk_key
+ *
+ * Description: Bit-pack only key in secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const unsigned char key[]: byte array containing key
+ **************************************************/
+void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char key[SEEDBYTES])
+{
+  /* key is the second field of sk, located directly behind rho */
+  unsigned char *dst = sk + SEEDBYTES;
+
+  for (size_t pos = 0; pos != SEEDBYTES; pos++)
+    dst[pos] = key[pos];
+}
+
+/*************************************************
+ * Name:        pack_sk_tr
+ *
+ * Description: Bit-pack only tr in secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const unsigned char tr[]: byte array containing tr
+ **************************************************/
+void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const unsigned char tr[TRBYTES])
+{
+  /* tr is the third field of sk, located behind rho and key */
+  unsigned char *dst = sk + 2 * SEEDBYTES;
+
+  for (size_t pos = 0; pos != TRBYTES; pos++)
+    dst[pos] = tr[pos];
+}
+
+/*************************************************
+ * Name:        poly_challenge_lowram
+ *
+ * Description: Implementation of H. Samples polynomial with TAU nonzero
+ *              coefficients in {-1,1} using the output stream of
+ *              SHAKE256(seed), squeezed through a small buffer.
+ *
+ * Arguments:   - poly *c: pointer to output polynomial
+ *              - const uint8_t seed[]: byte array containing seed of length CTILDEBYTES
+ **************************************************/
+#define CHALLENGE_lowram_BUF_SIZE 8
+void poly_challenge_lowram(poly *c, const uint8_t seed[CTILDEBYTES])
+{
+  uint8_t buf[CHALLENGE_lowram_BUF_SIZE];
+  keccak_state state;
+  uint64_t signs = 0;
+  unsigned int pos;
+
+  shake256_init(&state);
+  shake256_absorb(&state, seed, CTILDEBYTES);
+  shake256_finalize(&state);
+  /* The first eight output bytes form the little-endian pool of sign bits. */
+  shake256_squeeze(buf, CHALLENGE_lowram_BUF_SIZE, &state);
+  for (unsigned int k = 0; k < 8; ++k)
+    signs |= (uint64_t)buf[k] << 8 * k;
+  pos = 8; /* those bytes are used up, so the first draw below refills buf */
+
+  for (unsigned int k = 0; k < N; ++k)
+    c->coeffs[k] = 0;
+
+  for (unsigned int i = N - TAU; i < N; ++i)
+  {
+    unsigned int b;
+    /* Draw bytes until one is at most i, refilling buf whenever it is exhausted. */
+    do
+    {
+      if (pos >= CHALLENGE_lowram_BUF_SIZE)
+      {
+        shake256_squeeze(buf, CHALLENGE_lowram_BUF_SIZE, &state);
+        pos = 0;
+      }
+      b = buf[pos++];
+    } while (b > i);
+    /* Move the old value at b to slot i, then place a fresh +1 or -1 at b. */
+    c->coeffs[i] = c->coeffs[b];
+    c->coeffs[b] = 1 - 2 * (signs & 1);
+    signs >>= 1;
+  }
+}
+
+/*************************************************
+ * Name:        poly_pointwise_acc_montgomery
+ *
+ * Description: Pointwise multiplication of polynomials in NTT domain
+ *              representation and multiplication of resulting polynomial
+ *              by 2^{-32} with accumulation.
+ *
+ * Arguments:   - poly *c: pointer to output/accumulator polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ **************************************************/
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b)
+{
+  /* c accumulates: each reduced product is added to the existing coefficient. */
+  for (unsigned int idx = 0; idx < N; ++idx) {
+    c->coeffs[idx] += montgomery_reduce((int64_t)a->coeffs[idx] * b->coeffs[idx]);
+  }
+}
diff --git a/lowram/lowram.h b/lowram/lowram.h
new file mode 100644
index 0000000..d330d25
--- /dev/null
+++ b/lowram/lowram.h
@@ -0,0 +1,90 @@
+#ifndef LOWRAM_H
+#define LOWRAM_H
+/* Declarations of the helper routines used by the low-RAM implementation. */
+#include "poly.h"
+#include "polyvec.h" /* polyvecl, needed by unpack_sig_z / pack_sig_z below */
+#include "smallpoly.h"
+#include <stdint.h>
+#include <stddef.h>
+#include "fips202.h"
+
+#define unpack_pk_t1 DILITHIUM_NAMESPACE(unpack_pk_t1)
+void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]);
+#define unpack_sig_z DILITHIUM_NAMESPACE(unpack_sig_z)
+int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]);
+#define unpack_sig_h DILITHIUM_NAMESPACE(unpack_sig_h)
+int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]);
+#define unpack_sig_c DILITHIUM_NAMESPACE(unpack_sig_c)
+int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]);
+
+#define pack_sig_c DILITHIUM_NAMESPACE(pack_sig_c)
+void pack_sig_c(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES]);
+#define pack_sig_z DILITHIUM_NAMESPACE(pack_sig_z)
+void pack_sig_z(uint8_t sig[CRYPTO_BYTES], const polyvecl *z);
+#define pack_sig_h DILITHIUM_NAMESPACE(pack_sig_h)
+void pack_sig_h(unsigned char sig[CRYPTO_BYTES],
+                const poly *h_elem,
+                const unsigned int idx,
+                unsigned int *hints_written);
+#define pack_sig_h_zero DILITHIUM_NAMESPACE(pack_sig_h_zero)
+void pack_sig_h_zero(unsigned char sig[CRYPTO_BYTES],
+                unsigned int *hints_written);
+
+void poly_challenge_compress(uint8_t c[68], const poly *cp);
+void poly_challenge_decompress(poly *cp, const uint8_t c[68]);
+
+
+void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0);
+void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1);
+void polyw_pack(uint8_t buf[K*768], poly *w);
+void polyw_unpack(poly *w, const uint8_t buf[K*768]);
+
+void polyw_add(uint8_t buf[3*256], poly *p);
+void polyw_sub(poly* c, uint8_t buf[3*256], poly *a);
+
+void poly_highbits(poly *a1, const poly *a);
+void poly_lowbits(poly *a0, const poly *a);
+
+void unpack_sk_s1(smallpoly *a, const uint8_t *sk, size_t idx);
+void unpack_sk_s2(smallpoly *a, const uint8_t *sk, size_t idx);
+
+void poly_uniform_pointwise_montgomery_polywadd_lowram(uint8_t wcomp[3*N], poly *b, const uint8_t  seed[SEEDBYTES], uint16_t nonce, keccak_state *state);
+void poly_uniform_gamma1_lowram(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce, keccak_state *state);
+void poly_uniform_gamma1_add_lowram(poly *a, poly *b, const uint8_t seed[CRHBYTES], uint16_t nonce, keccak_state *state);
+void poly_challenge_lowram(poly *c, const uint8_t seed[CTILDEBYTES]);
+
+size_t poly_make_hint_lowram(poly *a, poly *t, uint8_t w[768]);
+int unpack_sig_h_indices(uint8_t h_i[OMEGA], unsigned int * number_of_hints, unsigned int idx, const unsigned char sig[CRYPTO_BYTES]);
+void poly_use_hint_lowram(poly *b, const poly *a, uint8_t h_i[OMEGA], unsigned int number_of_hints);
+
+void pack_pk_rho(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]);
+
+void pack_pk_t1(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const poly *t1,
+             const unsigned int idx);
+
+void pack_sk_s1(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s1_elem,
+                const unsigned int idx);
+
+void pack_sk_s2(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *s2_elem,
+                const unsigned int idx);
+
+void pack_sk_t0(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const poly *t0_elem,
+                const unsigned int idx);
+
+void pack_sk_rho(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char rho[SEEDBYTES]);
+
+void pack_sk_key(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                 const unsigned char key[SEEDBYTES]);
+
+void pack_sk_tr(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+                const unsigned char tr[TRBYTES]);
+
+#define poly_pointwise_acc_montgomery DILITHIUM_NAMESPACE(poly_pointwise_acc_montgomery)
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b);
+#endif
diff --git a/lowram/ntt.c b/lowram/ntt.c
new file mode 120000
index 0000000..b8583dd
--- /dev/null
+++ b/lowram/ntt.c
@@ -0,0 +1 @@
+../ref/ntt.c
\ No newline at end of file
diff --git a/lowram/ntt.h b/lowram/ntt.h
new file mode 120000
index 0000000..87cb9c3
--- /dev/null
+++ b/lowram/ntt.h
@@ -0,0 +1 @@
+../ref/ntt.h
\ No newline at end of file
diff --git a/lowram/packing.c b/lowram/packing.c
new file mode 120000
index 0000000..e77fac9
--- /dev/null
+++ b/lowram/packing.c
@@ -0,0 +1 @@
+../ref/packing.c
\ No newline at end of file
diff --git a/lowram/packing.h b/lowram/packing.h
new file mode 120000
index 0000000..d27a8e9
--- /dev/null
+++ b/lowram/packing.h
@@ -0,0 +1 @@
+../ref/packing.h
\ No newline at end of file
diff --git a/lowram/params.h b/lowram/params.h
new file mode 120000
index 0000000..53133cc
--- /dev/null
+++ b/lowram/params.h
@@ -0,0 +1 @@
+../ref/params.h
\ No newline at end of file
diff --git a/lowram/poly.c b/lowram/poly.c
new file mode 120000
index 0000000..1f747c9
--- /dev/null
+++ b/lowram/poly.c
@@ -0,0 +1 @@
+../ref/poly.c
\ No newline at end of file
diff --git a/lowram/poly.h b/lowram/poly.h
new file mode 120000
index 0000000..7a4cdf0
--- /dev/null
+++ b/lowram/poly.h
@@ -0,0 +1 @@
+../ref/poly.h
\ No newline at end of file
diff --git a/lowram/polyvec.c b/lowram/polyvec.c
new file mode 120000
index 0000000..dc0efe3
--- /dev/null
+++ b/lowram/polyvec.c
@@ -0,0 +1 @@
+../ref/polyvec.c
\ No newline at end of file
diff --git a/lowram/polyvec.h b/lowram/polyvec.h
new file mode 120000
index 0000000..59c77ec
--- /dev/null
+++ b/lowram/polyvec.h
@@ -0,0 +1 @@
+../ref/polyvec.h
\ No newline at end of file
diff --git a/lowram/randombytes.c b/lowram/randombytes.c
new file mode 120000
index 0000000..59a42a5
--- /dev/null
+++ b/lowram/randombytes.c
@@ -0,0 +1 @@
+../ref/randombytes.c
\ No newline at end of file
diff --git a/lowram/randombytes.h b/lowram/randombytes.h
new file mode 120000
index 0000000..055e443
--- /dev/null
+++ b/lowram/randombytes.h
@@ -0,0 +1 @@
+../ref/randombytes.h
\ No newline at end of file
diff --git a/lowram/reduce.c b/lowram/reduce.c
new file mode 120000
index 0000000..3300e09
--- /dev/null
+++ b/lowram/reduce.c
@@ -0,0 +1 @@
+../ref/reduce.c
\ No newline at end of file
diff --git a/lowram/reduce.h b/lowram/reduce.h
new file mode 120000
index 0000000..d429fe7
--- /dev/null
+++ b/lowram/reduce.h
@@ -0,0 +1 @@
+../ref/reduce.h
\ No newline at end of file
diff --git a/lowram/rounding.c b/lowram/rounding.c
new file mode 120000
index 0000000..970bc20
--- /dev/null
+++ b/lowram/rounding.c
@@ -0,0 +1 @@
+../ref/rounding.c
\ No newline at end of file
diff --git a/lowram/rounding.h b/lowram/rounding.h
new file mode 120000
index 0000000..d26f3b1
--- /dev/null
+++ b/lowram/rounding.h
@@ -0,0 +1 @@
+../ref/rounding.h
\ No newline at end of file
diff --git a/lowram/sign.c b/lowram/sign.c
new file mode 100644
index 0000000..0849272
--- /dev/null
+++ b/lowram/sign.c
@@ -0,0 +1,517 @@
+#include <stdint.h>
+#include "params.h"
+#include "sign.h"
+#include "packing.h"
+#include "polyvec.h"
+#include "poly.h"
+#include "randombytes.h"
+#include "symmetric.h"
+#include "fips202.h"
+#include "smallpoly.h"
+#include "lowram.h"
+
+#include "smallntt.h"
+
+/*************************************************
+* Name:        crypto_sign_keypair
+*
+* Description: Generates public and private key.
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+  unsigned int i, j;
+  uint8_t seedbuf[2*SEEDBYTES + CRHBYTES];
+  const uint8_t *rho, *rhoprime, *key;
+
+  poly tA, tB;
+  /* The union members share storage; s256, tC and tr are used one after another. */
+  union {
+    uint8_t tr[TRBYTES];
+    keccak_state s256;
+    poly tC;
+  } data;
+
+  keccak_state *s256 = &data.s256;
+  uint8_t *tr        = data.tr;
+  poly *tC           = &data.tC;
+
+  /* Get randomness for rho, rhoprime and key */
+  randombytes(seedbuf, SEEDBYTES);
+  seedbuf[SEEDBYTES+0] = K; /* K and L are absorbed together with the random seed */
+  seedbuf[SEEDBYTES+1] = L;
+  shake256_init(s256);
+  shake256_absorb(s256, seedbuf, SEEDBYTES + 2);
+  shake256_finalize(s256);
+  shake256_squeeze(seedbuf, 2*SEEDBYTES + CRHBYTES, s256); /* seedbuf now holds rho | rhoprime | key */
+
+  rho = seedbuf;
+  rhoprime = rho + SEEDBYTES;
+  key = rhoprime + CRHBYTES;
+
+  pack_sk_rho(sk, rho);
+  pack_sk_key(sk, key);
+  pack_pk_rho(pk, rho);
+
+  /* Matrix-vector multiplication */
+  for (i = 0; i < K; i++)
+  {
+    /* Expand part of s1 */
+    poly_uniform_eta(tC, rhoprime, 0); /* s1 is re-sampled in every row i rather than kept in memory */
+    if (i == 0)
+    {
+      pack_sk_s1(sk, tC, 0); /* each s1 element is written to sk only once, in the first row */
+    }
+    poly_ntt(tC);
+    /* expand part of the matrix */
+    poly_uniform(&tB, rho, (i << 8) + 0); /* nonce combines row i and column 0 */
+    /* partial matrix-vector multiplication */
+    poly_pointwise_montgomery(&tA, &tB, tC); /* the j = 0 product starts the accumulator tA */
+    for(j = 1; j < L; j++)
+    {
+      /* Expand part of s1 */
+      poly_uniform_eta(tC, rhoprime, j);
+      if (i == 0)
+      {
+        pack_sk_s1(sk, tC, j);
+      }
+      poly_ntt(tC);
+      poly_uniform(&tB, rho, (i << 8) + j); /* nonce combines row i and column j */
+      poly_pointwise_acc_montgomery(&tA, &tB, tC); /* remaining products are added onto tA */
+    }
+
+    poly_reduce(&tA);
+    poly_invntt_tomont(&tA);
+
+    /* Add error vector s2 */
+    /* Sample short vector s2 */
+    poly_uniform_eta(&tB, rhoprime, L + i); /* s2 nonces continue after the L nonces used for s1 */
+    pack_sk_s2(sk, &tB, i);
+    poly_add(&tA, &tA, &tB);
+
+    /* Compute t{0,1} */
+    poly_caddq(&tA);
+    poly_power2round(tC, &tB, &tA); /* tC is packed as t1 and tB as t0 right below */
+    pack_sk_t0(sk, &tB, i);
+    pack_pk_t1(pk, tC, i);
+
+  }
+
+  /* Compute H(rho, t1) and write secret key */
+  shake256(tr, TRBYTES, pk, CRYPTO_PUBLICKEYBYTES); /* hashes the complete packed public key */
+  pack_sk_tr(sk, tr);
+
+  return 0;
+}
+
+
+/*************************************************
+* Name:        crypto_sign_signature
+*
+* Description: Computes signature.
+*
+* Arguments:   - uint8_t *sig:   pointer to output signature (of length CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - uint8_t *m:     pointer to message to be signed
+*              - size_t mlen:    length of message
+*              - uint8_t *ctx:   pointer to context string
+*              - size_t ctxlen:  length of context string
+*              - uint8_t *sk:    pointer to bit-packed secret key
+*
+* Returns 0 (success) or -1 (context string too long)
+**************************************************/
+int crypto_sign_signature(uint8_t *sig,
+                          size_t *siglen,
+                          const uint8_t *m,
+                          size_t mlen,
+                          const uint8_t *ctx,
+                          size_t ctxlen,
+                          const uint8_t *sk)
+{
+  uint8_t buf[2 * CRHBYTES];
+  uint8_t *mu, *rhoprime, *rnd;
+  const uint8_t *rho, *tr, *key;
+  uint16_t nonce = 0;
+  uint8_t wcomp[K][768];
+  uint8_t ccomp[68];
+
+  if(ctxlen > 255)
+    return -1;
+
+  union {
+    keccak_state s128;
+    keccak_state s256;
+  } state;
+
+  union {
+    poly full;
+    struct {
+      smallpoly stmp0;
+      smallpoly stmp1;
+    } small;
+  } polybuffer;
+
+  poly      *tmp0  = &polybuffer.full;
+  smallpoly *stmp0 = &polybuffer.small.stmp0;
+  smallpoly *scp   = &polybuffer.small.stmp1;
+
+  rho = sk;
+  tr = sk + SEEDBYTES*2;
+  key = sk + SEEDBYTES;
+  
+  mu = buf;
+  rnd = mu + CRHBYTES;
+  rhoprime = mu + CRHBYTES;
+
+  /* Compute mu = CRH(tr, 0, ctxlen, ctx, msg) */
+  mu[0] = 0;
+  mu[1] = ctxlen;
+  shake256_init(&state.s256);
+  shake256_absorb(&state.s256, tr, TRBYTES);
+  shake256_absorb(&state.s256, mu, 2);
+  shake256_absorb(&state.s256, ctx, ctxlen);
+  shake256_absorb(&state.s256, m, mlen);
+  shake256_finalize(&state.s256);
+  shake256_squeeze(mu, CRHBYTES, &state.s256);
+
+#ifdef DILITHIUM_RANDOMIZED_SIGNING
+  randombytes(rnd, RNDBYTES);
+#else
+  unsigned int n;
+  /* Note: RNDBYTES < CRHBYTES, so buffer has proper size */
+  for(n=0;n<RNDBYTES;n++)
+    rnd[n] = 0;
+#endif
+
+  shake256_init(&state.s256);
+  shake256_absorb(&state.s256, key, SEEDBYTES);
+  shake256_absorb(&state.s256, rnd, RNDBYTES);
+  shake256_absorb(&state.s256, mu, CRHBYTES);
+  shake256_finalize(&state.s256);
+  /* rnd can be overwritten here */
+  shake256_squeeze(rhoprime, CRHBYTES, &state.s256);
+
+rej:  
+    for (size_t k_idx = 0; k_idx < K; k_idx++) {
+      for(size_t i=0;i<768;i++){
+        wcomp[k_idx][i] = 0;
+      }
+    }
+
+      for (size_t l_idx = 0; l_idx < L; l_idx++) {
+        /* Sample intermediate vector y */
+        poly_uniform_gamma1_lowram(tmp0, rhoprime, L*nonce + l_idx, &state.s256);
+        poly_ntt(tmp0);
+
+        /* Matrix-vector multiplication */
+        for (size_t k_idx = 0; k_idx < K; k_idx++) {
+          /* sampling of y and packing into wcomp inlined into the basemul */
+          poly_uniform_pointwise_montgomery_polywadd_lowram(wcomp[k_idx], tmp0, rho, (k_idx << 8) + l_idx, &state.s128);
+        }
+      }
+      nonce++;
+      for (size_t k_idx = 0; k_idx < K; k_idx++) {
+        polyw_unpack(tmp0, wcomp[k_idx]);
+        poly_invntt_tomont(tmp0);
+        poly_caddq(tmp0);
+
+        polyw_pack(wcomp[k_idx], tmp0);
+        poly_highbits(tmp0, tmp0);
+        polyw1_pack(&sig[k_idx*POLYW1_PACKEDBYTES], tmp0);
+      }
+
+  shake256_init(&state.s256);
+  shake256_absorb(&state.s256, mu, CRHBYTES);
+  shake256_absorb(&state.s256, sig, K*POLYW1_PACKEDBYTES);
+  shake256_finalize(&state.s256);
+  shake256_squeeze(sig, CTILDEBYTES, &state.s256);
+  poly_challenge(tmp0, sig);
+
+  poly_challenge_compress(ccomp, tmp0);
+  
+  /* Compute z, reject if it reveals secret */
+  for(size_t l_idx=0;l_idx < L; l_idx++){
+  if(l_idx != 0){
+    poly_challenge_decompress(tmp0, ccomp);
+  }
+    poly_small_ntt_copy(scp, tmp0);
+    unpack_sk_s1(stmp0, sk, l_idx);
+    small_ntt(stmp0->coeffs);
+    poly_small_basemul_invntt(tmp0, scp, stmp0);
+
+    poly_uniform_gamma1_add_lowram(tmp0, tmp0, rhoprime, L*(nonce-1) + l_idx, &state.s256);
+
+    poly_reduce(tmp0);
+
+    if(poly_chknorm(tmp0, GAMMA1 - BETA))
+      goto rej;
+
+    polyz_pack(sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES, tmp0);
+  }
+
+
+  /* Write signature */
+  unsigned int hint_n = 0;
+  unsigned int hints_written = 0;
+  /* Check that subtracting cs2 does not change high bits of w and low bits
+   * do not reveal secret information */
+  
+  for(unsigned int k_idx = 0; k_idx < K; ++k_idx) {
+    poly_challenge_decompress(tmp0, ccomp);
+    poly_small_ntt_copy(scp, tmp0);
+
+    unpack_sk_s2(stmp0, sk, k_idx);
+    small_ntt(stmp0->coeffs);
+    poly_small_basemul_invntt(tmp0, scp, stmp0);
+
+    polyw_sub(tmp0, wcomp[k_idx], tmp0);
+    poly_reduce(tmp0);
+
+    polyw_pack(wcomp[k_idx], tmp0);
+
+    poly_lowbits(tmp0, tmp0);
+    poly_reduce(tmp0);
+    if(poly_chknorm(tmp0, GAMMA2 - BETA)){
+      goto rej;
+    }
+
+    poly_schoolbook(tmp0, ccomp, sk + SEEDBYTES + TRBYTES + SEEDBYTES +
+      L*POLYETA_PACKEDBYTES + K*POLYETA_PACKEDBYTES + k_idx*POLYT0_PACKEDBYTES);
+
+    /* Compute hints for w1 */
+
+    if(poly_chknorm(tmp0, GAMMA2)) {
+      goto rej;
+    }
+
+    hint_n += poly_make_hint_lowram(tmp0, tmp0, wcomp[k_idx]);
+
+    if (hint_n > OMEGA) {
+      goto rej;
+    }
+    pack_sig_h(sig, tmp0, k_idx, &hints_written);
+  }
+  pack_sig_h_zero(sig, &hints_written);
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign
+*
+* Description: Compute signed message.
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *ctx: pointer to context string
+*              - size_t ctxlen: length of context string
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success) or -1 (context string too long)
+**************************************************/
+int crypto_sign(uint8_t *sm,
+                size_t *smlen,
+                const uint8_t *m,
+                size_t mlen,
+                const uint8_t *ctx,
+                size_t ctxlen,
+                const uint8_t *sk)
+{
+  int ret;
+  size_t i;
+
+  for(i = 0; i < mlen; ++i) /* copy the last byte first, so the move is safe when sm == m */
+    sm[CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
+  ret = crypto_sign_signature(sm, smlen, sm + CRYPTO_BYTES, mlen, ctx, ctxlen, sk); /* signature is written in front of the copied message */
+  *smlen += mlen; /* runs even when ret != 0; NOTE(review): confirm *smlen is set on the failure path */
+  return ret;
+}
+
+/*************************************************
+* Name:        crypto_sign_verify
+*
+* Description: Verifies signature.
+*
+* Arguments:   - const uint8_t *sig: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *ctx: pointer to context string
+*              - size_t ctxlen: length of context string
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_verify(const uint8_t *sig,
+                       size_t siglen,
+                       const uint8_t *m,
+                       size_t mlen,
+                       const uint8_t *ctx,
+                       size_t ctxlen,
+                       const uint8_t *pk)
+{
+  unsigned int i; /* filled by unpack_sig_h_indices per row (presumably the hint count), then reused as byte index */
+  
+  poly p; /* the only full-size polynomial in this function; reused for every step */
+
+  union { /* both views share storage */
+    uint8_t w1_packed[POLYW1_PACKEDBYTES]; /* written only after wcomp has been consumed for the row */
+    uint8_t wcomp[768]; /* packed row accumulator; presumably 3 bytes per coefficient (768 = 3*256) - TODO confirm */
+  } w1_packed_comp;
+  uint8_t *w1_packed = w1_packed_comp.w1_packed;
+  uint8_t *wcomp     = w1_packed_comp.wcomp;
+
+  union { /* mu is absorbed into s256 before ccomp is written, so the two can overlap */
+    uint8_t ccomp[68];
+    uint8_t mu[TRBYTES];
+  } ccomp_mu;
+  uint8_t *ccomp = ccomp_mu.ccomp;
+  uint8_t *mu  = ccomp_mu.mu;
+
+  keccak_state s256; /* stays live across the row loop: absorbs mu, then every packed w1 row */
+
+  union { /* per-row scratch: the three members share storage and are used at different times */
+    uint8_t hint_ones[OMEGA];
+    keccak_state s128;
+    uint8_t c2[CTILDEBYTES];
+  } shake_hint;
+
+  uint8_t *hint_ones = shake_hint.hint_ones;
+  keccak_state *s128 = &shake_hint.s128;
+  uint8_t *c2        = shake_hint.c2;
+
+  if(ctxlen > 255 || siglen != CRYPTO_BYTES)
+    return -1;
+
+  /* Compute CRH(h(rho, t1), msg) */
+  shake256_init(&s256);
+  shake256_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256_finalize(&s256);
+  shake256_squeeze(mu, TRBYTES, &s256); /* mu temporarily holds the hash of the public key */
+
+  shake256_init(&s256);
+  shake256_absorb(&s256, mu, TRBYTES);
+  mu[0] = 0; /* the key hash was absorbed above, so mu can be reused for the 2-byte (0, ctxlen) prefix */
+  mu[1] = ctxlen;
+  shake256_absorb(&s256, mu, 2);
+  shake256_absorb(&s256, ctx, ctxlen);
+  shake256_absorb(&s256, m, mlen);
+  shake256_finalize(&s256);
+  shake256_squeeze(mu, CRHBYTES, &s256); /* mu now holds the final CRHBYTES-byte digest */
+
+  shake256_init(&s256); /* start of the challenge hash; it is finalized after the row loop */
+  shake256_absorb(&s256, mu, CRHBYTES);
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  poly_challenge_lowram(&p, sig);
+  poly_challenge_compress(ccomp, &p); /* ccomp overwrites mu from here on */
+
+  for (size_t k_idx = 0; k_idx < K; k_idx++) { /* one output row per iteration */
+    for(size_t widx=0;widx<768;widx++){ /* clear the row accumulator */
+        wcomp[widx] = 0;
+    }
+
+    polyz_unpack(&p, sig + CTILDEBYTES); /* first z polynomial (l_idx == 0) */
+    if(poly_chknorm(&p, GAMMA1 - BETA)) /* z is re-unpacked for every row, so this check repeats K times */
+      return -1;
+    poly_ntt(&p);
+    
+    poly_uniform_pointwise_montgomery_polywadd_lowram(wcomp, &p, pk, (k_idx << 8) + 0, s128); /* presumably samples the matrix entry for (k_idx, 0) from the seed at the start of pk and accumulates into wcomp */
+
+    for (size_t l_idx = 1; l_idx < L; l_idx++) {
+      polyz_unpack(&p, sig + CTILDEBYTES + l_idx*POLYZ_PACKEDBYTES);
+      if(poly_chknorm(&p, GAMMA1 - BETA))
+        return -1;
+      poly_ntt(&p);
+      poly_uniform_pointwise_montgomery_polywadd_lowram(wcomp, &p, pk, (k_idx << 8) + l_idx, s128);
+    }
+    polyw_unpack(&p, wcomp); /* bring the accumulated row back into p */
+    poly_reduce(&p);
+    poly_invntt_tomont(&p);
+    polyw_pack(wcomp, &p); /* park the row in wcomp so p is free for the challenge product */
+    
+    poly_schoolbook_t1(&p, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES); /* challenge times row k_idx of packed t1; helper not visible here */
+
+    polyw_sub(&p, wcomp, &p); /* presumably p = unpack(wcomp) - p - TODO confirm operand order */
+    poly_reduce(&p);
+
+    /* Reconstruct w1 */
+    poly_caddq(&p);
+
+    if (unpack_sig_h_indices(hint_ones, &i, k_idx, sig) != 0) /* reuses the storage of s128, which is idle at this point */
+    {
+      return -1;
+    }
+    poly_use_hint_lowram(&p, &p, hint_ones, i);
+
+    polyw1_pack(w1_packed, &p);
+
+    shake256_absorb(&s256, w1_packed, POLYW1_PACKEDBYTES); /* feed this row of w1 into the challenge hash */
+  }
+  /* Call random oracle and verify challenge */
+  shake256_finalize(&s256);
+  shake256_squeeze(c2, CTILDEBYTES, &s256);
+  for(i = 0; i < CTILDEBYTES; ++i) /* recomputed challenge must equal the one carried at the start of sig */
+    if(sig[i] != c2[i])
+      return -1;
+
+  return 0;
+}
+
+/*************************************************
+* Name:        crypto_sign_open
+*
+* Description: Verify signed message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *ctx: pointer to context tring
+*              - size_t ctxlen: length of context string
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signed message could be verified correctly and -1 otherwise
+**************************************************/
+int crypto_sign_open(uint8_t *m,
+                     size_t *mlen,
+                     const uint8_t *sm,
+                     size_t smlen,
+                     const uint8_t *ctx,
+                     size_t ctxlen,
+                     const uint8_t *pk)
+{
+  size_t i;
+
+  if(smlen < CRYPTO_BYTES) /* too short to contain a signature */
+    goto badsig;
+
+  *mlen = smlen - CRYPTO_BYTES; /* sm is laid out as signature followed by message */
+  if(crypto_sign_verify(sm, CRYPTO_BYTES, sm + CRYPTO_BYTES, *mlen, ctx, ctxlen, pk))
+    goto badsig;
+  else {
+    /* All good, copy msg, return 0 */
+    for(i = 0; i < *mlen; ++i) /* ascending copy from a higher address, so m == sm is safe */
+      m[i] = sm[CRYPTO_BYTES + i];
+    return 0;
+  }
+
+badsig:
+  /* Signature verification failed */
+  *mlen = 0;
+  for(i = 0; i < smlen; ++i) /* zero all smlen bytes of the output buffer */
+    m[i] = 0;
+
+  return -1;
+}
diff --git a/lowram/sign.h b/lowram/sign.h
new file mode 120000
index 0000000..200b72f
--- /dev/null
+++ b/lowram/sign.h
@@ -0,0 +1 @@
+../ref/sign.h
\ No newline at end of file
diff --git a/lowram/smallntt.h b/lowram/smallntt.h
new file mode 100644
index 0000000..df908cf
--- /dev/null
+++ b/lowram/smallntt.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright (c) 2023 Junhao Huang (jhhuang_nuaa@126.com)
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SMALLNTT_H
+#define SMALLNTT_H
+
+#include <stdint.h>
+#include "params.h"
+#include "poly.h"
+
+/* We use the Kyber prime 3329 as the modulus for the small NTT. Other choices
+such as 769 for all parameter sets or 257 for Dilithium2 and Dilithium5 are also
+viable. */
+#define SMALL_Q 3329
+#define Q_INV_SMALL (-3327) /* SMALL_Q^-1 mod 2^16; parenthesized so the sign cannot bind to a neighbouring operator */
+
+extern const int16_t small_zetas[128]; /* twiddle factors, defined in smallntt_3329.c */
+
+void small_ntt(int16_t r[N]); /* in-place forward NTT modulo SMALL_Q */
+void small_invntt_tomont(int16_t r[N]); /* in-place inverse NTT modulo SMALL_Q */
+void small_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); /* product in Zq[X]/(X^2 - zeta) */
+
+#endif
diff --git a/lowram/smallntt_3329.c b/lowram/smallntt_3329.c
new file mode 100644
index 0000000..df2f5c3
--- /dev/null
+++ b/lowram/smallntt_3329.c
@@ -0,0 +1,180 @@
+#include <stdint.h>
+#include "params.h"
+#include "ntt.h"
+#include "reduce.h"
+#include "smallntt.h"
+
+/* Code to generate zetas and zetas_inv used in the number-theoretic transform:
+
+#define KYBER_ROOT_OF_UNITY 17
+
+static const uint8_t tree[128] = {
+  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
+  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
+  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
+  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
+  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
+  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
+  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
+  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
+};
+
+void init_ntt() {
+  unsigned int i;
+  int16_t tmp[128];
+
+  tmp[0] = MONT;
+  for(i=1;i<128;i++)
+    tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);
+
+  for(i=0;i<128;i++) {
+    zetas[i] = tmp[tree[i]];
+    if(zetas[i] > KYBER_Q/2)
+      zetas[i] -= KYBER_Q;
+    if(zetas[i] < -KYBER_Q/2)
+      zetas[i] += KYBER_Q;
+  }
+}
+*/
+
+const int16_t small_zetas[128] = { /* entry 0 is -1044 = 2^16 mod 3329 (centered); generated as sketched in the comment above */
+    -1044, -758, -359, -1517, 1493, 1422, 287, 202,
+    -171, 622, 1577, 182, 962, -1202, -1474, 1468,
+    573, -1325, 264, 383, -829, 1458, -1602, -130,
+    -681, 1017, 732, 608, -1542, 411, -205, -1571,
+    1223, 652, -552, 1015, -1293, 1491, -282, -1544,
+    516, -8, -320, -666, -1618, -1162, 126, 1469,
+    -853, -90, -271, 830, 107, -1421, -247, -951,
+    -398, 961, -1508, -725, 448, -1065, 677, -1275,
+    -1103, 430, 555, 843, -1251, 871, 1550, 105,
+    422, 587, 177, -235, -291, -460, 1574, 1653,
+    -246, 778, 1159, -147, -777, 1483, -602, 1119,
+    -1590, 644, -872, 349, 418, 329, -156, -75,
+    817, 1097, 603, 610, 1322, -1285, -1465, 384,
+    -1215, -136, 1218, -1335, -874, 220, -1187, -1659,
+    -1185, -1530, -1278, 794, -1510, -854, -870, 478,
+    -108, -308, 996, 991, 958, -1460, 1522, 1628};
+
+static int16_t montgomery_reduce_small(int32_t a) /* Montgomery reduction: result is congruent to a * 2^-16 mod SMALL_Q */
+{
+  int16_t t;
+
+  t = (int16_t)a * Q_INV_SMALL; /* keeps the low 16 bits of a * q^-1 */
+  t = (a - (int32_t)t * SMALL_Q) >> 16; /* a - t*q is a multiple of 2^16, so the shift drops only zero bits */
+  return t;
+}
+
+static int16_t barrett_reduce(int16_t a) /* Barrett reduction: returns a minus a multiple of SMALL_Q */
+{
+  int16_t t;
+  const int16_t v = ((1 << 26) + SMALL_Q / 2) / SMALL_Q; /* 2^26 / q, rounded to nearest */
+
+  t = ((int32_t)v * a + (1 << 25)) >> 26; /* approximates round(a / q) */
+  t *= SMALL_Q;
+  return a - t; /* subtract that multiple of q */
+}
+
+/*************************************************
+ * Name:        fqmul
+ *
+ * Description: Multiplication followed by Montgomery reduction
+ *
+ * Arguments:   - int16_t a: first factor
+ *              - int16_t b: second factor
+ *
+ * Returns 16-bit integer congruent to a*b*R^{-1} mod q (R = 2^16, q = SMALL_Q)
+ **************************************************/
+static int16_t fqmul(int16_t a, int16_t b)
+{
+  return montgomery_reduce_small((int32_t)a * b); /* widen first so the product is computed in 32 bits */
+}
+
+/*************************************************
+ * Name:        small_ntt
+ *
+ * Description: Inplace number-theoretic transform (NTT) in Rq.
+ *              input is in standard order, output is in bitreversed order
+ *
+ * Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
+ **************************************************/
+void small_ntt(int16_t r[256])
+{
+  unsigned int len, start, j, k;
+  int16_t t, zeta;
+
+  k = 1; // small_zetas[0] is not consumed by the forward transform
+  for (len = 128; len >= 2; len >>= 1) // 7 layers: len = 128, 64, ..., 2
+  {
+    for (start = 0; start < 256; start = j + len) // j == start + len after the inner loop, so this steps by 2*len
+    {
+      zeta = small_zetas[k++]; // one twiddle per block of 2*len coefficients
+      for (j = start; j < start + len; j++)
+      {
+        t = fqmul(zeta, r[j + len]);
+        r[j + len] = r[j] - t; // butterfly; the sum and difference are stored without a further reduction
+        r[j] = r[j] + t;
+      }
+    }
+  }
+}
+
+/*************************************************
+ * Name:        small_invntt_tomont
+ *
+ * Description: Inplace inverse number-theoretic transform in Rq and
+ *              multiplication by Montgomery factor 2^16.
+ *              Input is in bitreversed order, output is in standard order
+ *
+ * Arguments:   - int16_t r[256]: pointer to input/output vector of elements of Zq
+ **************************************************/
+void small_invntt_tomont(int16_t r[256])
+{
+  unsigned int start, len, j, k;
+  int16_t t, zeta;
+  const int16_t f = 1441; // mont^2/128
+
+  k = 127; // twiddles are consumed in the reverse order of small_ntt (127 down to 1)
+  for (len = 2; len <= 128; len <<= 1) // 7 layers: len = 2, 4, ..., 128
+  {
+    for (start = 0; start < 256; start = j + len) // j == start + len after the inner loop, so this steps by 2*len
+    {
+      zeta = small_zetas[k--];
+      for (j = start; j < start + len; j++)
+      {
+        t = r[j];
+        r[j] = barrett_reduce(t + r[j + len]); // the sum is Barrett-reduced on every layer
+        r[j + len] = r[j + len] - t;
+        r[j + len] = fqmul(zeta, r[j + len]); // the difference is reduced by the Montgomery multiplication
+      }
+    }
+  }
+
+  for (j = 0; j < 256; j++) // final scaling of every coefficient by f
+  {
+    r[j] = barrett_reduce(fqmul(r[j], f));
+  }
+}
+
+/*************************************************
+ * Name:        small_basemul
+ *
+ * Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
+ *              used for multiplication of elements in Rq in NTT domain
+ *
+ * Arguments:   - int16_t r[2]: pointer to the output polynomial
+ *              - const int16_t a[2]: pointer to the first factor
+ *              - const int16_t b[2]: pointer to the second factor
+ *              - int16_t zeta: integer defining the reduction polynomial
+ **************************************************/
+void small_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta)
+{
+
+  int16_t a0 = a[0], a1 = a[1]; // inputs are cached before r is written; poly_small_basemul_invntt is called with r overlapping b
+  int16_t b0 = b[0];
+
+  r[0] = fqmul(a1, b[1]);
+  r[0] = fqmul(r[0], zeta); // a1*b1*zeta: the X^2 term folded back via X^2 = zeta
+  r[0] += fqmul(a0, b0);
+  r[1] = fqmul(a0, b[1]); // b[1] is read before r[1] is overwritten
+  r[1] += fqmul(a1, b0);
+}
diff --git a/lowram/smallpoly.c b/lowram/smallpoly.c
new file mode 100644
index 0000000..361fcfd
--- /dev/null
+++ b/lowram/smallpoly.c
@@ -0,0 +1,100 @@
+#include "smallpoly.h"
+#include "smallntt.h"
+
+void poly_small_ntt_copy(smallpoly *out, poly *in) // copy in's coefficients into out as int16_t, then forward small NTT on out
+{
+  for (int i = N - 1; i >= 0; i--) // descending order; presumably required because sign.c passes an out that overlaps the upper half of in
+  {
+    out->coeffs[i] = in->coeffs[i]; // value is stored into an int16_t slot
+  }
+  small_ntt(out->coeffs);
+}
+
+void polyvecl_small_ntt(smallpoly v[L])
+{
+  /* Forward small NTT, in place, on each of the L polynomials of v. */
+  smallpoly *const end = v + L;
+  for (smallpoly *p = v; p != end; ++p)
+    small_ntt(p->coeffs);
+}
+
+void polyveck_small_ntt(smallpoly v[K])
+{
+  /* Forward small NTT, in place, on each of the K polynomials of v. */
+  smallpoly *const end = v + K;
+  for (smallpoly *p = v; p != end; ++p)
+    small_ntt(p->coeffs);
+}
+
+void poly_small_basemul(int16_t r[N], const int16_t a[N], const int16_t b[N]) // coefficient-pair product of two polynomials in the small-NTT domain
+{
+  unsigned int i;
+  for (i = 0; i < N / 4; i++) // four coefficients per iteration: two degree-1 products
+  {
+    small_basemul(&r[4 * i], &a[4 * i], &b[4 * i], small_zetas[64 + i]); // first pair uses +zeta from the upper half of the table
+    small_basemul(&r[4 * i + 2], &a[4 * i + 2], &b[4 * i + 2], -small_zetas[64 + i]); // second pair uses -zeta
+  }
+}
+
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b)
+{
+  // re-use the buffer: the start of r's storage serves as the int16_t scratch polynomial
+  smallpoly *tmp = (smallpoly *)r;
+  poly_small_basemul(tmp->coeffs, a->coeffs, b->coeffs); // product in the small-NTT domain, written over r's storage
+
+  small_invntt_tomont(tmp->coeffs);
+
+#ifdef SMALL_POLY_16_BIT
+  int j;
+  // buffer is the same, so we neeed to be careful
+  for (j = N - 1; j >= 0; j--)
+  {
+    r->coeffs[j] = tmp->coeffs[j];
+  }
+#endif
+}
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L])
+{
+  unsigned int i;
+  for (i = 0; i < L; i++) // same a for every entry: r->vec[i] receives the product of a and b[i]
+  {
+    poly_small_basemul_invntt(&r->vec[i], a, &b[i]);
+  }
+}
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a) // unpack a bit-packed polynomial into int16_t coefficients ETA - field
+{
+  unsigned int i;
+
+#if ETA == 2
+  for (i = 0; i < N / 8; ++i) // 3 input bytes carry eight 3-bit fields
+  {
+    r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7;
+    r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7;
+    r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; // field straddles bytes 0 and 1
+    r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7;
+    r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7;
+    r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; // field straddles bytes 1 and 2
+    r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7;
+    r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7;
+
+    r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; // map each stored field v to ETA - v
+    r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1];
+    r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2];
+    r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3];
+    r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4];
+    r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5];
+    r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6];
+    r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7];
+  }
+#elif ETA == 4
+  for (i = 0; i < N / 2; ++i) // 1 input byte carries two 4-bit fields
+  {
+    r->coeffs[2 * i + 0] = a[i] & 0x0F; // low nibble
+    r->coeffs[2 * i + 1] = a[i] >> 4; // high nibble
+    r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; // map each stored field v to ETA - v
+    r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1];
+  }
+#endif
+}
diff --git a/lowram/smallpoly.h b/lowram/smallpoly.h
new file mode 100644
index 0000000..a547c8d
--- /dev/null
+++ b/lowram/smallpoly.h
@@ -0,0 +1,26 @@
+#ifndef SMALLPOLY_H
+#define SMALLPOLY_H
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+#define SMALL_POLY_16_BIT
+typedef struct
+{
+    int16_t coeffs[N];
+} smallpoly; /* polynomial with N 16-bit coefficients */
+
+typedef smallpoly smallhalfpoly; /* alias of smallpoly */
+
+void poly_small_ntt_copy(smallpoly *, poly *); /* copy coefficients into the smallpoly, then forward small NTT */
+void poly_small_basemul(int16_t r[N], const int16_t a[N], const int16_t b[N]); /* product in the small-NTT domain */
+
+void polyvecl_small_ntt(smallpoly v[L]); /* forward small NTT on L polynomials */
+void polyveck_small_ntt(smallpoly v[K]); /* forward small NTT on K polynomials */
+
+void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L]); /* r->vec[i] from a and b[i], for all i < L */
+void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b); /* basemul, inverse small NTT, result stored in r */
+
+void small_polyeta_unpack(smallpoly *r, const uint8_t *a); /* unpack eta-packed bytes into 16-bit coefficients */
+
+#endif
diff --git a/lowram/symmetric-shake.c b/lowram/symmetric-shake.c
new file mode 120000
index 0000000..86f8b6c
--- /dev/null
+++ b/lowram/symmetric-shake.c
@@ -0,0 +1 @@
+../ref/symmetric-shake.c
\ No newline at end of file
diff --git a/lowram/symmetric.h b/lowram/symmetric.h
new file mode 120000
index 0000000..8655364
--- /dev/null
+++ b/lowram/symmetric.h
@@ -0,0 +1 @@
+../ref/symmetric.h
\ No newline at end of file
diff --git a/lowram/test/.gitignore b/lowram/test/.gitignore
new file mode 120000
index 0000000..cde2696
--- /dev/null
+++ b/lowram/test/.gitignore
@@ -0,0 +1 @@
+../../ref/test/.gitignore
\ No newline at end of file
diff --git a/lowram/test/cpucycles.c b/lowram/test/cpucycles.c
new file mode 120000
index 0000000..4d6fc8a
--- /dev/null
+++ b/lowram/test/cpucycles.c
@@ -0,0 +1 @@
+../../ref/test/cpucycles.c
\ No newline at end of file
diff --git a/lowram/test/cpucycles.h b/lowram/test/cpucycles.h
new file mode 120000
index 0000000..269feb3
--- /dev/null
+++ b/lowram/test/cpucycles.h
@@ -0,0 +1 @@
+../../ref/test/cpucycles.h
\ No newline at end of file
diff --git a/lowram/test/speed_print.c b/lowram/test/speed_print.c
new file mode 120000
index 0000000..98f2a46
--- /dev/null
+++ b/lowram/test/speed_print.c
@@ -0,0 +1 @@
+../../ref/test/speed_print.c
\ No newline at end of file
diff --git a/lowram/test/speed_print.h b/lowram/test/speed_print.h
new file mode 120000
index 0000000..8ba4e5e
--- /dev/null
+++ b/lowram/test/speed_print.h
@@ -0,0 +1 @@
+../../ref/test/speed_print.h
\ No newline at end of file
diff --git a/lowram/test/test_dilithium.c b/lowram/test/test_dilithium.c
new file mode 120000
index 0000000..729cb5b
--- /dev/null
+++ b/lowram/test/test_dilithium.c
@@ -0,0 +1 @@
+../../ref/test/test_dilithium.c
\ No newline at end of file
diff --git a/lowram/test/test_mul.c b/lowram/test/test_mul.c
new file mode 120000
index 0000000..013f7af
--- /dev/null
+++ b/lowram/test/test_mul.c
@@ -0,0 +1 @@
+../../ref/test/test_mul.c
\ No newline at end of file
diff --git a/lowram/test/test_speed.c b/lowram/test/test_speed.c
new file mode 120000
index 0000000..7decc02
--- /dev/null
+++ b/lowram/test/test_speed.c
@@ -0,0 +1 @@
+../../ref/test/test_speed.c
\ No newline at end of file
diff --git a/lowram/test/test_vectors.c b/lowram/test/test_vectors.c
new file mode 120000
index 0000000..1dffda0
--- /dev/null
+++ b/lowram/test/test_vectors.c
@@ -0,0 +1 @@
+../../ref/test/test_vectors.c
\ No newline at end of file

From dbe6543f6e020ebcbf62d2b2a80b44471baecf8a Mon Sep 17 00:00:00 2001
From: Amin Abdulrahman <amin@abdulrahman.de>
Date: Thu, 31 Oct 2024 11:34:21 +0100
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
---
 lowram/sign.c          | 4 ++--
 lowram/smallntt_3329.c | 2 ++
 lowram/smallpoly.c     | 2 --
 lowram/smallpoly.h     | 1 -
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/lowram/sign.c b/lowram/sign.c
index 0849272..4d7075b 100644
--- a/lowram/sign.c
+++ b/lowram/sign.c
@@ -393,7 +393,7 @@ int crypto_sign_verify(const uint8_t *sig,
   if(ctxlen > 255 || siglen != CRYPTO_BYTES)
     return -1;
 
-  /* Compute CRH(h(rho, t1), msg) */
+  /* Compute mu = CRH(H(rho, t1), 0, ctxlen, ctx, msg) */
   shake256_init(&s256);
   shake256_absorb(&s256, pk, CRYPTO_PUBLICKEYBYTES);
   shake256_finalize(&s256);
@@ -478,7 +478,7 @@ int crypto_sign_verify(const uint8_t *sig,
 *              - size_t *mlen: pointer to output length of message
 *              - const uint8_t *sm: pointer to signed message
 *              - size_t smlen: length of signed message
-*              - const uint8_t *ctx: pointer to context tring
+*              - const uint8_t *ctx: pointer to context string
 *              - size_t ctxlen: length of context string
 *              - const uint8_t *pk: pointer to bit-packed public key
 *
diff --git a/lowram/smallntt_3329.c b/lowram/smallntt_3329.c
index df2f5c3..97eb7be 100644
--- a/lowram/smallntt_3329.c
+++ b/lowram/smallntt_3329.c
@@ -7,6 +7,8 @@
 /* Code to generate zetas and zetas_inv used in the number-theoretic transform:
 
 #define KYBER_ROOT_OF_UNITY 17
+#define KYBER_Q 3329
+#define MONT -1044
 
 static const uint8_t tree[128] = {
   0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
diff --git a/lowram/smallpoly.c b/lowram/smallpoly.c
index 361fcfd..3e0f9e5 100644
--- a/lowram/smallpoly.c
+++ b/lowram/smallpoly.c
@@ -44,14 +44,12 @@ void poly_small_basemul_invntt(poly *r, const smallpoly *a, const smallpoly *b)
 
   small_invntt_tomont(tmp->coeffs);
 
-#ifdef SMALL_POLY_16_BIT
   int j;
   // buffer is the same, so we neeed to be careful
   for (j = N - 1; j >= 0; j--)
   {
     r->coeffs[j] = tmp->coeffs[j];
   }
-#endif
 }
 
 void polyvecl_small_basemul_invntt(polyvecl *r, const smallpoly *a, const smallpoly b[L])
diff --git a/lowram/smallpoly.h b/lowram/smallpoly.h
index a547c8d..72cc403 100644
--- a/lowram/smallpoly.h
+++ b/lowram/smallpoly.h
@@ -4,7 +4,6 @@
 #include "poly.h"
 #include "polyvec.h"
 
-#define SMALL_POLY_16_BIT
 typedef struct
 {
     int16_t coeffs[N];