path: root/sys/dev/mlx5
author:    hselasky <hselasky@FreeBSD.org>  2016-05-20 06:54:58 +0000
committer: hselasky <hselasky@FreeBSD.org>  2016-05-20 06:54:58 +0000
commit:    64d010d53317655ede6d212985d499e1b7c7f583 (patch)
tree:      5b13ba2b91ef24aa9fc4103002682befe2d95de1 /sys/dev/mlx5
parent:    ad10425ff7c11b0f9841a9ff53859ca0d7800109 (diff)
Implement TX completion event interleaving.
This patch implements a sysctl which allows setting a factor, N, for how many work queue elements may be generated before a completion event is required. When a completion event happens, the code simulates N completion events instead of only one. When draining a transmit queue, at most N-1 NOPs are transmitted to force generation of the final completion event. Further, a timer runs every hz ticks (once a second) to flush any remaining data off the transmit queue when tx_completion_fact > 1.

The goal of this feature is to reduce the PCI bandwidth needed when transmitting data.

Sponsored by:	Mellanox Technologies
Tested by:	Netflix
MFC after:	1 week
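As a usage sketch (hedged: the OID "dev.mce.0.conf.tx_completion_fact" is an assumption about where mlx5en(4) roots its configuration sysctls, inferred from the params_ethtool list in the diff below rather than stated in this commit), the new u64 knob could be set from userland with sysctlbyname(3):

	/* Hypothetical userland example; the OID name is an assumption. */
	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t fact = 4;	/* one completion event per four WQEs */

		/* all params_ethtool entries are u64, per the macro list below */
		if (sysctlbyname("dev.mce.0.conf.tx_completion_fact",
		    NULL, NULL, &fact, sizeof(fact)) != 0) {
			perror("sysctlbyname");
			return (1);
		}
		printf("tx_completion_fact set to %ju\n", (uintmax_t)fact);
		return (0);
	}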
Diffstat (limited to 'sys/dev/mlx5')
-rw-r--r--sys/dev/mlx5/mlx5_en/en.h10
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c45
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_main.c97
-rw-r--r--sys/dev/mlx5/mlx5_en/mlx5_en_tx.c72
4 files changed, 185 insertions, 39 deletions
diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index 12a6922..693259d 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -391,6 +391,8 @@ struct mlx5e_params {
m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \
m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \
m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
+ m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
+ m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
m(+1, u64 hw_lro, "hw_lro", "set to enable hw_lro") \
m(+1, u64 cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled")
@@ -496,6 +498,13 @@ struct mlx5e_sq {
/* dirtied @xmit */
u16 pc __aligned(MLX5E_CACHELINE_SIZE);
u16 bf_offset;
+ u16 cev_counter; /* completion event counter */
+ u16 cev_factor; /* completion event factor */
+ u32 cev_next_state; /* next completion event state */
+#define MLX5E_CEV_STATE_INITIAL 0 /* timer not started */
+#define MLX5E_CEV_STATE_SEND_NOPS 1 /* send NOPs */
+#define MLX5E_CEV_STATE_HOLD_NOPS 2 /* don't send NOPs yet */
+ struct callout cev_callout;
struct mlx5e_sq_stats stats;
struct mlx5e_cq cq;
@@ -787,6 +796,7 @@ void mlx5e_create_stats(struct sysctl_ctx_list *,
struct sysctl_oid_list *, const char *,
const char **, unsigned, u64 *);
void mlx5e_send_nop(struct mlx5e_sq *, u32, bool);
+void mlx5e_sq_cev_timeout(void *);
int mlx5e_refresh_channel_params(struct mlx5e_priv *);
#endif /* _MLX5_EN_H_ */
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
index f7993e9..647e622 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
@@ -48,6 +48,42 @@ mlx5e_create_stats(struct sysctl_ctx_list *ctx,
}
}
+static void
+mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv)
+{
+ /*
+ * Limit the maximum distance between completion events to
+ * half of the currently set TX queue size.
+ *
+ * The maximum number of queue entries a single IP packet can
+ * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
+ *
+ * The worst case max value is then given as below:
+ */
+ uint64_t max = priv->params_ethtool.tx_queue_size /
+ (2 * MLX5_SEND_WQE_MAX_WQEBBS);
+
+ /*
+ * Update the maximum completion factor value in case the
+ * tx_queue_size field changed. Ensure we don't overflow
+ * 16-bits.
+ */
+ if (max < 1)
+ max = 1;
+ else if (max > 65535)
+ max = 65535;
+ priv->params_ethtool.tx_completion_fact_max = max;
+
+ /*
+ * Verify that the current TX completion factor is within the
+ * given limits:
+ */
+ if (priv->params_ethtool.tx_completion_fact < 1)
+ priv->params_ethtool.tx_completion_fact = 1;
+ else if (priv->params_ethtool.tx_completion_fact > max)
+ priv->params_ethtool.tx_completion_fact = max;
+}
+
static int
mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
{
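A standalone check of the clamp in mlx5e_ethtool_sync_tx_completion_fact() above may help; assuming MLX5_SEND_WQE_MAX_WQEBBS is 16 (its usual value in mlx5 headers, not shown in this diff), a 1024-entry TX ring limits the factor to 32, and very small rings clamp to 1:

	/* Userspace sketch of the clamp; the WQEBBS value is an assumption. */
	#include <stdio.h>
	#include <stdint.h>

	#define MLX5_SEND_WQE_MAX_WQEBBS 16	/* assumed for illustration */

	static uint64_t
	completion_fact_max(uint64_t tx_queue_size)
	{
		uint64_t max = tx_queue_size / (2 * MLX5_SEND_WQE_MAX_WQEBBS);

		if (max < 1)
			max = 1;
		else if (max > 65535)
			max = 65535;
		return (max);
	}

	int
	main(void)
	{
		printf("%ju\n", (uintmax_t)completion_fact_max(1024)); /* 32 */
		printf("%ju\n", (uintmax_t)completion_fact_max(16));   /* clamps to 1 */
		return (0);
	}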
@@ -206,6 +242,14 @@ mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
priv->params_ethtool.cqe_zipping = 0;
}
}
+
+ if (&priv->params_ethtool.arg[arg2] ==
+ &priv->params_ethtool.tx_completion_fact ||
+ &priv->params_ethtool.arg[arg2] ==
+ &priv->params_ethtool.tx_queue_size) {
+ /* verify parameter */
+ mlx5e_ethtool_sync_tx_completion_fact(priv);
+ }
if (was_opened)
mlx5e_open_locked(priv->ifp);
done:
@@ -475,6 +519,7 @@ mlx5e_create_ethtool(struct mlx5e_priv *priv)
priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts;
priv->params_ethtool.hw_lro = priv->params.hw_lro_en;
priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en;
+ mlx5e_ethtool_sync_tx_completion_fact(priv);
/* create root node */
node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index a76d32e..0a898cf 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -1185,24 +1185,82 @@ err_destroy_sq:
}
static void
-mlx5e_close_sq(struct mlx5e_sq *sq)
+mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
+{
+ /* fill up remainder with NOPs */
+ while (sq->cev_counter != 0) {
+ while (!mlx5e_sq_has_room_for(sq, 1)) {
+ if (can_sleep != 0) {
+ mtx_unlock(&sq->lock);
+ msleep(4);
+ mtx_lock(&sq->lock);
+ } else {
+ goto done;
+ }
+ }
+ mlx5e_send_nop(sq, 1, true);
+ }
+done:
+ return;
+}
+
+void
+mlx5e_sq_cev_timeout(void *arg)
{
+ struct mlx5e_sq *sq = arg;
- /* ensure hw is notified of all pending wqes */
- if (mlx5e_sq_has_room_for(sq, 1))
- mlx5e_send_nop(sq, 1, true);
+ mtx_assert(&sq->lock, MA_OWNED);
- mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+ /* check next state */
+ switch (sq->cev_next_state) {
+ case MLX5E_CEV_STATE_SEND_NOPS:
+ /* fill TX ring with NOPs, if any */
+ mlx5e_sq_send_nops_locked(sq, 0);
+
+ /* check if completed */
+ if (sq->cev_counter == 0) {
+ sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
+ return;
+ }
+ break;
+ default:
+ /* send NOPs on next timeout */
+ sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
+ break;
+ }
+
+ /* restart timer */
+ callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
}
static void
mlx5e_close_sq_wait(struct mlx5e_sq *sq)
{
+
+ mtx_lock(&sq->lock);
+ /* teardown event factor timer, if any */
+ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
+ callout_stop(&sq->cev_callout);
+
+ /* send dummy NOPs in order to flush the transmit ring */
+ mlx5e_sq_send_nops_locked(sq, 1);
+ mtx_unlock(&sq->lock);
+
+ /* make sure it is safe to free the callout */
+ callout_drain(&sq->cev_callout);
+
+ /* error out remaining requests */
+ mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+
/* wait till SQ is empty */
+ mtx_lock(&sq->lock);
while (sq->cc != sq->pc) {
+ mtx_unlock(&sq->lock);
msleep(4);
sq->cq.mcq.comp(&sq->cq.mcq);
+ mtx_lock(&sq->lock);
}
+ mtx_unlock(&sq->lock);
mlx5e_disable_sq(sq);
mlx5e_destroy_sq(sq);
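The hunk above amounts to a small state machine; a minimal userspace sketch of it (simplified: no locking, no per-WQE counter) shows why a send queue must sit idle for one full callout period before the NOP flush fires:

	/* Sketch only; mirrors the cev_next_state transitions, not driver code. */
	#include <stdio.h>

	enum cev_state { CEV_INITIAL, CEV_SEND_NOPS, CEV_HOLD_NOPS };

	static enum cev_state state = CEV_INITIAL;
	static int pending = 3;		/* WQEs not yet covered by a CQE */

	static void
	xmit(void)			/* transmit path */
	{
		if (state == CEV_INITIAL)
			state = CEV_SEND_NOPS;	/* arm the flush timer */
		else
			state = CEV_HOLD_NOPS;	/* active: defer the flush */
	}

	static void
	timer_tick(void)		/* one callout period (hz ticks) */
	{
		if (state == CEV_SEND_NOPS) {
			printf("idle a full tick: flush %d WQE(s) with NOPs\n",
			    pending);
			pending = 0;
			state = CEV_INITIAL;	/* timer stops here */
		} else {
			state = CEV_SEND_NOPS;	/* flush on the next tick */
		}
	}

	int
	main(void)
	{
		xmit();		/* traffic arrives, timer armed */
		xmit();		/* still transmitting: hold the NOPs */
		timer_tick();	/* recently active, so just re-arm */
		timer_tick();	/* idle for a whole period: flush */
		return (0);
	}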
@@ -1412,24 +1470,13 @@ mlx5e_open_sqs(struct mlx5e_channel *c,
return (0);
err_close_sqs:
- for (tc--; tc >= 0; tc--) {
- mlx5e_close_sq(&c->sq[tc]);
+ for (tc--; tc >= 0; tc--)
mlx5e_close_sq_wait(&c->sq[tc]);
- }
return (err);
}
static void
-mlx5e_close_sqs(struct mlx5e_channel *c)
-{
- int tc;
-
- for (tc = 0; tc < c->num_tc; tc++)
- mlx5e_close_sq(&c->sq[tc]);
-}
-
-static void
mlx5e_close_sqs_wait(struct mlx5e_channel *c)
{
int tc;
@@ -1446,9 +1493,19 @@ mlx5e_chan_mtx_init(struct mlx5e_channel *c)
mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
for (tc = 0; tc < c->num_tc; tc++) {
- mtx_init(&c->sq[tc].lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
- mtx_init(&c->sq[tc].comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
+ struct mlx5e_sq *sq = c->sq + tc;
+
+ mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
+ mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
MTX_DEF);
+
+ callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+
+ sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
+
+ /* ensure the TX completion event factor is not zero */
+ if (sq->cev_factor == 0)
+ sq->cev_factor = 1;
}
}
@@ -1529,7 +1586,6 @@ mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
return (0);
err_close_sqs:
- mlx5e_close_sqs(c);
mlx5e_close_sqs_wait(c);
err_close_rx_cq:
@@ -1554,7 +1610,6 @@ mlx5e_close_channel(struct mlx5e_channel *volatile *pp)
if (c == NULL)
return;
mlx5e_close_rq(&c->rq);
- mlx5e_close_sqs(c);
}
static void
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
index 483a7e1..2a18f1f 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
@@ -28,6 +28,18 @@
#include "en.h"
#include <machine/atomic.h>
+static inline bool
+mlx5e_do_send_cqe(struct mlx5e_sq *sq)
+{
+ sq->cev_counter++;
+ /* interleave the CQEs */
+ if (sq->cev_counter >= sq->cev_factor) {
+ sq->cev_counter = 0;
+ return (1);
+ }
+ return (0);
+}
+
void
mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
{
@@ -38,7 +50,10 @@ mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
- wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+ if (mlx5e_do_send_cqe(sq))
+ wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+ else
+ wqe->ctrl.fm_ce_se = 0;
sq->mbuf[pi].mbuf = NULL;
sq->mbuf[pi].num_bytes = 0;
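A standalone illustration of the mlx5e_do_send_cqe() rule introduced above, assuming a completion factor of 4:

	/* Userspace simulation of the interleaving counter; not driver code. */
	#include <stdio.h>
	#include <stdint.h>

	struct sq_sim {
		uint16_t cev_counter;	/* WQEs since the last requested CQE */
		uint16_t cev_factor;	/* request a CQE every N WQEs */
	};

	static int
	do_send_cqe(struct sq_sim *sq)
	{
		if (++sq->cev_counter >= sq->cev_factor) {
			sq->cev_counter = 0;
			return (1);
		}
		return (0);
	}

	int
	main(void)
	{
		struct sq_sim sq = { 0, 4 };
		int i;

		for (i = 1; i <= 10; i++)
			printf("WQE %2d: %s\n", i,
			    do_send_cqe(&sq) ? "CQ_UPDATE" : "no CQE");
		/* WQEs 4 and 8 request a CQE; 9 and 10 stay pending until
		 * the NOP/timer path forces the final completion event. */
		return (0);
	}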
@@ -340,7 +355,10 @@ skip_dma:
wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
- wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+ if (mlx5e_do_send_cqe(sq))
+ wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+ else
+ wqe->ctrl.fm_ce_se = 0;
/* Store pointer to mbuf */
sq->mbuf[pi].mbuf = mb;
@@ -374,9 +392,10 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
*/
sqcc = sq->cc;
- while (budget--) {
+ while (budget > 0) {
struct mlx5_cqe64 *cqe;
struct mbuf *mb;
+ u16 x;
u16 ci;
cqe = mlx5e_get_cqe(&sq->cq);
@@ -385,24 +404,29 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget)
mlx5_cqwq_pop(&sq->cq.wq);
- ci = sqcc & sq->wq.sz_m1;
- mb = sq->mbuf[ci].mbuf;
- sq->mbuf[ci].mbuf = NULL; /* Safety clear */
+ /* update budget according to the event factor */
+ budget -= sq->cev_factor;
- if (mb == NULL) {
- if (sq->mbuf[ci].num_bytes == 0) {
- /* NOP */
- sq->stats.nop++;
- }
- } else {
- bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
- BUS_DMASYNC_POSTWRITE);
- bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
+ for (x = 0; x != sq->cev_factor; x++) {
+ ci = sqcc & sq->wq.sz_m1;
+ mb = sq->mbuf[ci].mbuf;
+ sq->mbuf[ci].mbuf = NULL; /* Safety clear */
- /* Free transmitted mbuf */
- m_freem(mb);
+ if (mb == NULL) {
+ if (sq->mbuf[ci].num_bytes == 0) {
+ /* NOP */
+ sq->stats.nop++;
+ }
+ } else {
+ bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
+ BUS_DMASYNC_POSTWRITE);
+ bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
+
+ /* Free transmitted mbuf */
+ m_freem(mb);
+ }
+ sqcc += sq->mbuf[ci].num_wqebbs;
}
- sqcc += sq->mbuf[ci].num_wqebbs;
}
mlx5_cqwq_update_db_record(&sq->cq.wq);
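One way to read the budget change above: a single CQE now represents cev_factor completed WQEs, so the poll loop charges a whole group against its budget per event and reclaims that many ring slots at once. A minimal sketch of the accounting:

	/* Sketch of the revised budget accounting; not driver code. */
	#include <stdio.h>

	int
	main(void)
	{
		int budget = 16;	/* WQE completions we will process */
		const int cev_factor = 4;
		int cqes = 0, wqes = 0;

		while (budget > 0) {		/* one iteration per received CQE */
			budget -= cev_factor;	/* charge the whole group */
			wqes += cev_factor;	/* free cev_factor mbuf slots */
			cqes++;
		}
		printf("%d CQEs retired %d WQEs\n", cqes, wqes);
		return (0);
	}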
@@ -450,6 +474,18 @@ mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb)
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
break;
}
+ /*
+ * Check if we need to start the event timer which flushes the
+ * transmit ring on timeout:
+ */
+ if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
+ sq->cev_factor != 1)) {
+ /* start the timer */
+ mlx5e_sq_cev_timeout(sq);
+ } else {
+ /* don't send NOPs yet */
+ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
+ }
return (err);
}