Skip to content

Commit

Permalink
prov/opx: Add runtime parameters for SDMA, RZV, MP egr disable
Browse files Browse the repository at this point in the history
Add the ability for users to specify the minimum message length
at which SDMA is used (FI_OPX_SDMA_MIN_PAYLOAD_BYTES),
the minimum length at which rendezvous is used (FI_OPX_RZV_MIN_PAYLOAD_BYTES),
and a multi-packet eager option (FI_OPX_MP_EAGER_DISABLE) to enable
or disable multi-packet eager. This allows the user to override the
default values. Tuning these parameters may affect performance.

Signed-off-by: Lindsay Reiser <[email protected]>
  • Loading branch information
lsavers authored and charlesshereda committed Jul 18, 2024
1 parent 791c6e0 commit 7f7fa3f
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 82 deletions.
13 changes: 13 additions & 0 deletions man/fi_opx.7.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,19 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL.
*FI_OPX_SDMA_DISABLE*
: Integer. Disables SDMA offload hardware. Defaults to 0.

*FI_OPX_SDMA_MIN_PAYLOAD_BYTES*
: Integer. The minimum length in bytes where SDMA will be used.
For messages smaller than this threshold, the send will be completed using PIO.
Value must be between 64 and 2147483646. Defaults to 16385.

*FI_OPX_RZV_MIN_PAYLOAD_BYTES*
: Integer. The minimum length in bytes where rendezvous will be used.
For messages smaller than this threshold, the send will first try to be completed using eager or multi-packet eager.
Value must be between 64 and 65536. Defaults to 16385.

*FI_OPX_MP_EAGER_DISABLE*
: Boolean (0/1, on/off, true/false, yes/no). Disables multi-packet eager. Defaults to 0.

*FI_OPX_EXPECTED_RECEIVE_ENABLE*
: Boolean (0/1, on/off, true/false, yes/no). Enables expected receive rendezvous using Token ID (TID).
Defaults to "No". This feature is not currently supported.
Expand Down
6 changes: 3 additions & 3 deletions prov/opx/include/rdma/opx/fi_opx_domain.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ struct fi_opx_node {
#define OPX_JOB_KEY_STR_SIZE 33
#define OPX_DEFAULT_JOB_KEY_STR "00112233445566778899aabbccddeeff"

#define OPX_SDMA_BOUNCE_BUF_MIN FI_OPX_SDMA_MIN_LENGTH
#define OPX_SDMA_BOUNCE_BUF_THRESHOLD FI_OPX_SDMA_DC_MIN
#define OPX_SDMA_BOUNCE_BUF_MAX (INT_MAX - 1)
#define OPX_SDMA_BOUNCE_BUF_MIN FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN
#define OPX_SDMA_BOUNCE_BUF_THRESHOLD FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT
#define OPX_SDMA_BOUNCE_BUF_MAX FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX

struct fi_opx_domain {
struct fid_domain domain_fid;
Expand Down
90 changes: 49 additions & 41 deletions prov/opx/include/rdma/opx/fi_opx_endpoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ struct fi_opx_ep_tx {
uint64_t cq_bind_flags;
struct fi_opx_context_slist * cq_completed_ptr;
uint32_t do_cq_completion;
uint16_t mp_eager_max_payload_bytes;
uint16_t unused_cacheline1;
uint8_t force_credit_return;
uint8_t use_sdma;

Expand Down Expand Up @@ -301,7 +301,10 @@ struct fi_opx_ep_tx {
struct ofi_bufpool *rma_payload_pool;
struct ofi_bufpool *rma_request_pool;
struct ofi_bufpool *sdma_work_pool;
uint64_t unused_cacheline6[2];
uint32_t sdma_min_payload_bytes;
uint32_t rzv_min_payload_bytes;
uint16_t mp_eager_max_payload_bytes;
uint8_t unused_cacheline6[6];

/* == CACHE LINE 7 == */
struct opx_sdma_queue sdma_request_queue;
Expand All @@ -328,7 +331,7 @@ OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending) == (FI_OPX_C
OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending_completion) == (FI_OPX_CACHE_LINE_SIZE * 6),
"Offset of fi_opx_ep_tx->work_pending_completion should start at cacheline 6!");
OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, sdma_request_queue) == (FI_OPX_CACHE_LINE_SIZE * 7),
"Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 7!");
"Offset of fi_opx_ep_tx->sdma_request_queue should start at cacheline 7!");
OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, ref_cnt) == (FI_OPX_CACHE_LINE_SIZE * 8),
"Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 8!");

Expand Down Expand Up @@ -3899,7 +3902,8 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep,
const enum ofi_reliability_kind reliability,
const uint64_t do_cq_completion,
const enum fi_hmem_iface hmem_iface,
const uint64_t hmem_device)
const uint64_t hmem_device,
const bool mp_eager_fallback)
{
ssize_t rc;

Expand All @@ -3922,7 +3926,7 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep,
if (OFI_LIKELY(rc == FI_SUCCESS)) {
return rc;
#ifndef FI_OPX_MP_EGR_DISABLE
} else if (rc == -FI_ENOBUFS && len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE) {
} else if (rc == -FI_ENOBUFS && mp_eager_fallback) {
/* Insufficient credits. If the payload is big enough,
fall back to Multi-packet eager to try sending this in
smaller chunks. */
Expand Down Expand Up @@ -4116,48 +4120,52 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep,
const uint64_t do_cq_completion =
fi_opx_ep_tx_do_cq_completion(opx_ep, override_flags, tx_op_flags);

if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) {

rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov,
niov, total_len, data, lock_required, is_contiguous,
override_flags, tx_op_flags, caps, reliability,
do_cq_completion, hmem_iface, hmem_device);
if (OFI_LIKELY(rc == FI_SUCCESS)) {
OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND");
return rc;
if (total_len < opx_ep->tx->rzv_min_payload_bytes) {
const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE &&
total_len <= opx_ep->tx->mp_eager_max_payload_bytes);
if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) {

rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov,
niov, total_len, data, lock_required, is_contiguous,
override_flags, tx_op_flags, caps, reliability,
do_cq_completion, hmem_iface, hmem_device,
mp_eager_fallback);
if (OFI_LIKELY(rc == FI_SUCCESS)) {
OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND");
return rc;
}
OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"===================================== SEND -- Eager send failed, trying next method\n");
}
OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"===================================== SEND -- Eager send failed, trying next method\n");
}

#ifndef FI_OPX_MP_EGR_DISABLE
/* If hmem_iface != FI_HMEM_SYSTEM, we skip MP EGR because RZV yields better performance for devices */
if (is_contiguous &&
total_len <= opx_ep->tx->mp_eager_max_payload_bytes &&
total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE &&
!fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps) &&
(caps & FI_TAGGED) && hmem_iface == FI_HMEM_SYSTEM) {

rc = fi_opx_hfi1_tx_send_try_mp_egr(ep, buf, len, desc, addr.fi, tag,
context, data, lock_required, override_flags,
tx_op_flags, caps, reliability, do_cq_completion,
FI_HMEM_SYSTEM, 0ul);
if (OFI_LIKELY(rc == FI_SUCCESS)) {
OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND");
return rc;
/* If hmem_iface != FI_HMEM_SYSTEM, we skip MP EGR because RZV yields better performance for devices */
if (is_contiguous &&
mp_eager_fallback &&
!fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps) &&
(caps & FI_TAGGED) && hmem_iface == FI_HMEM_SYSTEM) {

rc = fi_opx_hfi1_tx_send_try_mp_egr(ep, buf, len, desc, addr.fi, tag,
context, data, lock_required, override_flags,
tx_op_flags, caps, reliability, do_cq_completion,
FI_HMEM_SYSTEM, 0ul);
if (OFI_LIKELY(rc == FI_SUCCESS)) {
OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND");
return rc;
}
OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"===================================== SEND -- MP-Eager send failed, trying next method\n");
}
OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"===================================== SEND -- MP-Eager send failed, trying next method\n");
}
#endif

if (OFI_UNLIKELY(total_len < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES)) {
OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN,"SEND");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"===================================== SEND -- FI_EAGAIN Can't do RZV with payload length = %ld\n",len);
return -FI_EAGAIN;
if (OFI_UNLIKELY(total_len < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES)) {
OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN,"SEND");
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"===================================== SEND -- FI_EAGAIN Can't do RZV with payload length = %ld\n",len);
return -FI_EAGAIN;
}
}

rc = fi_opx_ep_tx_send_rzv(ep,
Expand Down
18 changes: 11 additions & 7 deletions prov/opx/include/rdma/opx/fi_opx_hfi1.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@

#define OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT (16384) /* Default for max payload size for using Multi-packet Eager */
#define OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX (65535) /* Max value (set to fit within uint16_t) */
#define OPX_MP_EGR_DISABLE_SET (1)
#define OPX_MP_EGR_DISABLE_NOT_SET (0)
#define OPX_MP_EGR_DISABLE_DEFAULT (OPX_MP_EGR_DISABLE_NOT_SET)

#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) /* Default for payload threshold size for RZV */
#define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */
#define OPX_RZV_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min value */

/* The total size for a single packet used in a multi-packet eager send.
This is packet payload plus 64 bytes for the PBC and packet header.
Expand Down Expand Up @@ -176,15 +183,12 @@ static_assert(OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX >= OPX_MP_EGR_MAX_PAYLOAD_BYTES_D

#define FI_OPX_HFI1_SDMA_MAX_COMP_INDEX (128) // This should be what opx_ep->hfi->info.sdma.queue_size is set to.

#ifndef FI_OPX_SDMA_MIN_LENGTH
#define FI_OPX_SDMA_MIN_LENGTH (16385)
#ifndef FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT
#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385)
#endif
#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES)
#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1)

/*
* The minimum payload size threshold for which we will use delivery completion
* instead of copying the payload for reliability.
*/
#define FI_OPX_SDMA_DC_MIN FI_OPX_SDMA_MIN_LENGTH

static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n");
static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, "FI_OPX_HFI1_SDMA_MAX_WE must be >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX!\n");
Expand Down
2 changes: 1 addition & 1 deletion prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ bool fi_opx_hfi1_sdma_use_sdma(struct fi_opx_ep *opx_ep,

return !is_intranode &&
(is_hmem || opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID
|| total_bytes >= FI_OPX_SDMA_MIN_LENGTH) &&
|| total_bytes >= opx_ep->tx->sdma_min_payload_bytes) &&
opx_ep->tx->use_sdma;
}

Expand Down
28 changes: 16 additions & 12 deletions prov/opx/src/fi_opx_domain.c
Original file line number Diff line number Diff line change
Expand Up @@ -380,22 +380,26 @@ int fi_opx_domain(struct fid_fabric *fabric,
size_t env_var_threshold;
get_param_check = fi_param_get_size_t(fi_opx_global.prov, "dev_reg_send_threshold",
&env_var_threshold);
if ((get_param_check == FI_SUCCESS) && (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX)) {
opx_domain->hmem_domain->devreg_copy_from_threshold = env_var_threshold;
} else {
FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN,
"FI_OPX_DEV_REG_SEND_THRESHOLD must be an integer >= %u and <= %u. Using default value (%u) instead of %zu\n",
OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_SEND_THRESHOLD_DEFAULT, env_var_threshold);
if (get_param_check == FI_SUCCESS) {
if (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX) {
opx_domain->hmem_domain->devreg_copy_from_threshold = env_var_threshold;
} else {
FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN,
"FI_OPX_DEV_REG_SEND_THRESHOLD must be an integer >= %u and <= %u. Using default value (%u) instead of %zu\n",
OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_SEND_THRESHOLD_DEFAULT, env_var_threshold);
}
}

get_param_check = fi_param_get_size_t(fi_opx_global.prov, "dev_reg_recv_threshold",
&env_var_threshold);
if ((get_param_check == FI_SUCCESS) && (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX)) {
opx_domain->hmem_domain->devreg_copy_to_threshold = env_var_threshold;
} else {
FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN,
"FI_OPX_DEV_REG_RECV_THRESHOLD must be an integer >= %u and <= %u. Using default value (%u) instead of %zu\n",
OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_RECV_THRESHOLD_DEFAULT, env_var_threshold);
if (get_param_check == FI_SUCCESS) {
if (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX) {
opx_domain->hmem_domain->devreg_copy_to_threshold = env_var_threshold;
} else {
FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN,
"FI_OPX_DEV_REG_RECV_THRESHOLD must be an integer >= %u and <= %u. Using default value (%u) instead of %zu\n",
OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_RECV_THRESHOLD_DEFAULT, env_var_threshold);
}
}
#endif

Expand Down
69 changes: 53 additions & 16 deletions prov/opx/src/fi_opx_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,25 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep,
opx_ep->tx->pio_scb_first = hfi->info.pio.scb_first;
opx_ep->tx->pio_credits_addr = hfi->info.pio.credits_addr;

// Retrieve the parameter for RZV min message length
int l_rzv_min_payload_bytes;
ssize_t rc = fi_param_get_int(fi_opx_global.prov, "rzv_min_payload_bytes", &l_rzv_min_payload_bytes);
if (rc != FI_SUCCESS) {
l_rzv_min_payload_bytes = OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_RZV_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n",
l_rzv_min_payload_bytes);
} else if (l_rzv_min_payload_bytes < OPX_RZV_MIN_PAYLOAD_BYTES_MIN ||
l_rzv_min_payload_bytes > OPX_RZV_MIN_PAYLOAD_BYTES_MAX) {
l_rzv_min_payload_bytes = OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT;
FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,
"Error: FI_OPX_RZV_MIN_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n",
OPX_RZV_MIN_PAYLOAD_BYTES_MIN, OPX_RZV_MIN_PAYLOAD_BYTES_MAX, l_rzv_min_payload_bytes);
} else {
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_RZV_MIN_PAYLOAD_BYTES was specified. Set to %d\n",
l_rzv_min_payload_bytes);
}
opx_ep->tx->rzv_min_payload_bytes = l_rzv_min_payload_bytes;

/* Now that we know how many PIO Tx send credits we have, calculate the threshold to switch from EAGER send to RTS/CTS
* With max credits, there should be enough PIO Eager buffer to send 1 full-size message and 1 credit leftover for min reliability.
*/
Expand Down Expand Up @@ -892,22 +911,22 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep,
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Set pio_flow_eager_tx_bytes to %d \n", opx_ep->tx->pio_flow_eager_tx_bytes);

// Set the multi-packet eager max message length
int l_mp_eager_max_payload_bytes;
ssize_t rc = fi_param_get_int(fi_opx_global.prov, "mp_eager_max_payload_bytes", &l_mp_eager_max_payload_bytes);
if (rc != FI_SUCCESS) {
opx_ep->tx->mp_eager_max_payload_bytes = OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_MAX_PAYLOAD_BYTES not set. Using default setting of %d\n",
opx_ep->tx->mp_eager_max_payload_bytes);
} else if (l_mp_eager_max_payload_bytes < opx_ep->tx->pio_flow_eager_tx_bytes || l_mp_eager_max_payload_bytes > OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX) {
opx_ep->tx->mp_eager_max_payload_bytes = OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT;
FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,
"Error: FI_OPX_MP_EAGER_MAX_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n",
opx_ep->tx->pio_flow_eager_tx_bytes, OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX, opx_ep->tx->mp_eager_max_payload_bytes);
int l_mp_eager_disable;
if (fi_param_get_bool(fi_opx_global.prov, "mp_eager_disable", &l_mp_eager_disable) != FI_SUCCESS) {
l_mp_eager_disable = OPX_MP_EGR_DISABLE_DEFAULT;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_DISABLE not set. Using default setting of %d\n",
l_mp_eager_disable);
} else {
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_DISABLE was specified. Set to %d\n",
l_mp_eager_disable);
}

if (l_mp_eager_disable == OPX_MP_EGR_DISABLE_SET) {
opx_ep->tx->mp_eager_max_payload_bytes = 0;
} else {
opx_ep->tx->mp_eager_max_payload_bytes = l_mp_eager_max_payload_bytes;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_MAX_PAYLOAD_BYTES was specified. Set to %d\n",
opx_ep->tx->mp_eager_max_payload_bytes);
opx_ep->tx->mp_eager_max_payload_bytes = l_rzv_min_payload_bytes - 1;
}
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Using MP eager threshold of %d\n", opx_ep->tx->mp_eager_max_payload_bytes);
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Multi-packet eager chunk-size is %d.\n", FI_OPX_MP_EGR_CHUNK_SIZE);

/* Set SDMA bounce buffer threshold. Any messages larger than this value in bytes will not be copied to
Expand All @@ -926,8 +945,8 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep,
} else if (l_sdma_bounce_buf_threshold < OPX_SDMA_BOUNCE_BUF_MIN || l_sdma_bounce_buf_threshold > (OPX_SDMA_BOUNCE_BUF_MAX)) {
opx_ep->tx->sdma_bounce_buf_threshold = OPX_SDMA_BOUNCE_BUF_THRESHOLD;
FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,
"Error: FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was set but is outside of MIN/MAX thresholds. Using default setting of %d\n",
opx_ep->tx->sdma_bounce_buf_threshold);
"Error: FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was set but is outside of min/max thresholds (%d-%d). Using default setting of %d\n",
OPX_SDMA_BOUNCE_BUF_MIN, OPX_SDMA_BOUNCE_BUF_MAX, opx_ep->tx->sdma_bounce_buf_threshold);
} else {
opx_ep->tx->sdma_bounce_buf_threshold = l_sdma_bounce_buf_threshold;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was specified. Set to %d\n",
Expand All @@ -951,6 +970,24 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep,
opx_ep->tx->use_sdma = 1;
}

// Set the SDMA minimum message length
int l_sdma_min_payload_bytes;
rc = fi_param_get_int(fi_opx_global.prov, "sdma_min_payload_bytes", &l_sdma_min_payload_bytes);
if (rc != FI_SUCCESS) {
opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n",
opx_ep->tx->sdma_min_payload_bytes);
} else if (l_sdma_min_payload_bytes < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES || l_sdma_min_payload_bytes > INT_MAX) {
opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT;
FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,
"Error: FI_OPX_SDMA_MIN_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n",
FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES, INT_MAX, opx_ep->tx->sdma_min_payload_bytes);
} else {
opx_ep->tx->sdma_min_payload_bytes = l_sdma_min_payload_bytes;
OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES was specified. Set to %d\n",
opx_ep->tx->sdma_min_payload_bytes);
}

slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_SHM]);
slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_PIO]);
slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_SDMA]);
Expand Down
2 changes: 1 addition & 1 deletion prov/opx/src/fi_opx_hfi1.c
Original file line number Diff line number Diff line change
Expand Up @@ -3093,7 +3093,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep,
/* Expected tid needs to send a leading data block and a trailing
* data block for alignment. Limit this to lengths above the SDMA
* minimum payload threshold for now */

const uint64_t immediate_block_count = (len > FI_OPX_SDMA_MIN_LENGTH && opx_ep->use_expected_tid_rzv) ? 1 : 0;
const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 1 : 0;
FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,
"immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, "
"*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n",
Expand Down
Loading

0 comments on commit 7f7fa3f

Please sign in to comment.