Skip to content

Commit

Permalink
RDMA: Protect RDMA memory regions (valkey-io#1602)
Browse files Browse the repository at this point in the history
Use the Linux mmap/munmap syscalls to manage an RDMA memory region, so that we
get a guard-page-protected VMA like this (cat /proc/PID/maps):
 785018afe000-785018aff000 ---p 00000000 00:00 0  -> top guard page
 785018aff000-785018bff000 rw-p 00000000 00:00 0  -> RDMA memory region
 785018bff000-785018c00000 ---p 00000000 00:00 0  -> bottom guard page

If any code accesses this memory unexpectedly, a segmentation fault occurs.

Signed-off-by: zhenwei pi <[email protected]>
Signed-off-by: zhenwei pi <[email protected]>
  • Loading branch information
pizhenwei authored Jan 28, 2025
1 parent ad60d6b commit d72a97e
Showing 1 changed file with 51 additions and 22 deletions.
73 changes: 51 additions & 22 deletions src/rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <sys/mman.h>

#define CONN_TYPE_RDMA "rdma"

Expand Down Expand Up @@ -134,6 +135,8 @@ static list *pending_list;
static rdma_listener *rdma_listeners;
static serverRdmaContextConfig *rdma_config;

static size_t page_size;

static ConnectionType CT_RDMA;

static void serverRdmaError(char *err, const char *fmt, ...) {
Expand Down Expand Up @@ -191,31 +194,56 @@ static int rdmaPostRecv(RdmaContext *ctx, struct rdma_cm_id *cm_id, ValkeyRdmaCm
return C_OK;
}

/* To make Valkey forkable, a buffer that is registered as an RDMA memory region must be
 * aligned to the page size, and its length must also be a multiple of the page size.
 * Otherwise a random segmentation fault can occur, for example:
 * 0x7f2764ac5000 - 0x7f2764ac7000
 * |ptr0 128| ... |ptr1 4096| ... |ptr2 512|
 *
 * After ibv_reg_mr(pd, ptr1, 4096, access), the full 8K range becomes DONTFORK, and
 * the child process hits a segmentation fault when it accesses ptr0/ptr2.
 *
 * The portable posix_memalign(&tmp, page_size, aligned_size) would work too. However,
 * since RDMA is supported on Linux only, using the raw mmap syscall does not break
 * anything, and it places the buffer in a separate virtual memory area (VMA) that is
 * protected by two guard pages (one above and one below).
 * Note that this memory must be freed by rdmaMemoryFree() only.
 */
static void *page_aligned_zalloc(size_t size) {
void *tmp;
size_t aligned_size, page_size = sysconf(_SC_PAGESIZE);
static void *rdmaMemoryAlloc(size_t size) {
size_t real_size, aligned_size = (size + page_size - 1) & (~(page_size - 1));
uint8_t *ptr;

aligned_size = (size + page_size - 1) & (~(page_size - 1));
if (posix_memalign(&tmp, page_size, aligned_size)) {
serverPanic("posix_memalign failed");
real_size = aligned_size + 2 * page_size;
ptr = mmap(NULL, real_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED) {
serverPanic("failed to allocate memory for RDMA region");
}

madvise(ptr, real_size, MADV_DONTDUMP); /* no need to dump this VMA on coredump */
mprotect(ptr, page_size, PROT_NONE); /* top page of this VMA */
mprotect(ptr + size + page_size, page_size, PROT_NONE); /* bottom page of this VMA */

return ptr + page_size;
}

static void rdmaMemoryFree(void *ptr, size_t size) {
uint8_t *real_ptr;
size_t real_size, aligned_size;

if (!ptr) {
return;
}

memset(tmp, 0x00, aligned_size);
if ((unsigned long)ptr & (page_size - 1)) {
serverPanic("unaligned memory in use for RDMA region");
}

return tmp;
aligned_size = (size + page_size - 1) & (~(page_size - 1));
real_size = aligned_size + 2 * page_size;
real_ptr = (uint8_t *)ptr - page_size;

if (munmap(real_ptr, real_size)) {
serverPanic("failed to free memory for RDMA region");
}
}

static void rdmaDestroyIoBuf(RdmaContext *ctx) {
Expand All @@ -224,23 +252,23 @@ static void rdmaDestroyIoBuf(RdmaContext *ctx) {
ctx->rx.mr = NULL;
}

zlibc_free(ctx->rx.addr);
rdmaMemoryFree(ctx->rx.addr, ctx->rx.length);
ctx->rx.addr = NULL;

if (ctx->tx.mr) {
ibv_dereg_mr(ctx->tx.mr);
ctx->tx.mr = NULL;
}

zlibc_free(ctx->tx.addr);
rdmaMemoryFree(ctx->tx.addr, ctx->tx.length);
ctx->tx.addr = NULL;

if (ctx->cmd_mr) {
ibv_dereg_mr(ctx->cmd_mr);
ctx->cmd_mr = NULL;
}

zlibc_free(ctx->cmd_buf);
rdmaMemoryFree(ctx->cmd_buf, sizeof(ValkeyRdmaCmd) * VALKEY_RDMA_MAX_WQE * 2);
ctx->cmd_buf = NULL;
}

Expand All @@ -251,7 +279,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) {
int i;

/* setup CMD buf & MR */
ctx->cmd_buf = page_aligned_zalloc(length);
ctx->cmd_buf = rdmaMemoryAlloc(length);
ctx->cmd_mr = ibv_reg_mr(ctx->pd, ctx->cmd_buf, length, access);
if (!ctx->cmd_mr) {
serverLog(LL_WARNING, "RDMA: reg mr for CMD failed");
Expand All @@ -275,7 +303,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) {
/* setup recv buf & MR */
access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
length = rdma_config->rx_size;
ctx->rx.addr = page_aligned_zalloc(length);
ctx->rx.addr = rdmaMemoryAlloc(length);
ctx->rx.length = length;
ctx->rx.mr = ibv_reg_mr(ctx->pd, ctx->rx.addr, length, access);
if (!ctx->rx.mr) {
Expand Down Expand Up @@ -387,7 +415,7 @@ static int rdmaAdjustSendbuf(RdmaContext *ctx, unsigned int length) {
}

/* create a new buffer & MR */
ctx->tx.addr = page_aligned_zalloc(length);
ctx->tx.addr = rdmaMemoryAlloc(length);
ctx->tx_length = length;
ctx->tx.mr = ibv_reg_mr(ctx->pd, ctx->tx.addr, length, access);
if (!ctx->tx.mr) {
Expand Down Expand Up @@ -1705,6 +1733,7 @@ static int connRdmaAddr(connection *conn, char *ip, size_t ip_len, int *port, in

static void rdmaInit(void) {
pending_list = listCreate();
page_size = sysconf(_SC_PAGESIZE);

VALKEY_BUILD_BUG_ON(sizeof(ValkeyRdmaFeature) != 32);
VALKEY_BUILD_BUG_ON(sizeof(ValkeyRdmaKeepalive) != 32);
Expand Down

0 comments on commit d72a97e

Please sign in to comment.