From da91309e0a7e8966d916a74cce42ed170fde06bf Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 9 Jun 2014 10:24:38 +0300 Subject: [PATCH 1/2] cpumask: Utility function to set n'th cpu - local cpu first This function sets the n'th cpu - local cpu's first. For example: in a 16 cores server with even cpu's local, will get the following values: cpumask_set_cpu_local_first(0, numa, cpumask) => cpu 0 is set cpumask_set_cpu_local_first(1, numa, cpumask) => cpu 2 is set ... cpumask_set_cpu_local_first(7, numa, cpumask) => cpu 14 is set cpumask_set_cpu_local_first(8, numa, cpumask) => cpu 1 is set cpumask_set_cpu_local_first(9, numa, cpumask) => cpu 3 is set ... cpumask_set_cpu_local_first(15, numa, cpumask) => cpu 15 is set Curently this function will be used by multi queue networking devices to calculate the irq affinity mask, such that as many local cpu's as possible will be utilized to handle the mq device irq's. Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/cpumask.h | 8 ++++++ lib/cpumask.c | 63 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d08e4d2a9b92..d5ef249735d2 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -142,6 +142,13 @@ static inline unsigned int cpumask_any_but(const struct cpumask *mask, return 1; } +static inline int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +{ + set_bit(0, cpumask_bits(dstp)); + + return 0; +} + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ @@ -192,6 +199,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/lib/cpumask.c b/lib/cpumask.c index b810b753c607..c101230658eb 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -164,3 +164,66 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask) memblock_free_early(__pa(mask), cpumask_size()); } #endif + +/** + * cpumask_set_cpu_local_first - set i'th cpu with local numa cpu's first + * + * @i: index number + * @numa_node: local numa_node + * @dstp: cpumask with the relevant cpu bit set according to the policy + * + * This function sets the cpumask according to a numa aware policy. + * cpumask could be used as an affinity hint for the IRQ related to a + * queue. When the policy is to spread queues across cores - local cores + * first. + * + * Returns 0 on success, -ENOMEM for no memory, and -EAGAIN when failed to set + * the cpu bit and need to re-call the function. + */ +int cpumask_set_cpu_local_first(int i, int numa_node, cpumask_t *dstp) +{ + cpumask_var_t mask; + int cpu; + int ret = 0; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + i %= num_online_cpus(); + + if (!cpumask_of_node(numa_node)) { + /* Use all online cpu's for non numa aware system */ + cpumask_copy(mask, cpu_online_mask); + } else { + int n; + + cpumask_and(mask, + cpumask_of_node(numa_node), cpu_online_mask); + + n = cpumask_weight(mask); + if (i >= n) { + i -= n; + + /* If index > number of local cpu's, mask out local + * cpu's + */ + cpumask_andnot(mask, cpu_online_mask, mask); + } + } + + for_each_cpu(cpu, mask) { + if (--i < 0) + goto out; + } + + ret = -EAGAIN; + +out: + free_cpumask_var(mask); + + if (!ret) + cpumask_set_cpu(cpu, dstp); + + return ret; +} +EXPORT_SYMBOL(cpumask_set_cpu_local_first); From 9e311e77a85e37b5caec3d64c3593cd52b2cdb71 Mon Sep 17 00:00:00 2001 From: Yuval Atias Date: Mon, 9 Jun 2014 10:24:39 +0300 Subject: [PATCH 2/2] net/mlx4_en: Use affinity hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The “affinity hint” mechanism is used by the user space daemon, irqbalancer, to indicate a preferred CPU mask for irqs. Irqbalancer can use this hint to balance the irqs between the cpus indicated by the mask. We wish the HCA to preferentially map the IRQs it uses to numa cores close to it. To accomplish this, we use cpumask_set_cpu_local_first(), that sets the affinity hint according the following policy: First it maps IRQs to “close” numa cores. If these are exhausted, the remaining IRQs are mapped to “far” numa cores. Signed-off-by: Yuval Atias Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 12 ++++++- .../net/ethernet/mellanox/mlx4/en_netdev.c | 35 ++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 + 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 636963db598a..4b2130760eed 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -163,6 +163,13 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, NAPI_POLL_WEIGHT); } else { + struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring]; + + err = irq_set_affinity_hint(cq->mcq.irq, + ring->affinity_mask); + if (err) + mlx4_warn(mdev, "Failed setting affinity hint\n"); + netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); napi_hash_add(&cq->napi); } @@ -179,8 +186,11 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq) mlx4_en_unmap_buffer(&cq->wqres.buf); mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); - if (priv->mdev->dev->caps.comp_pool && cq->vector) + if (priv->mdev->dev->caps.comp_pool && cq->vector) { + if (!cq->is_tx) + irq_set_affinity_hint(cq->mcq.irq, NULL); mlx4_release_eq(priv->mdev->dev, cq->vector); + } cq->vector = 0; cq->buf_size = 0; cq->buf = NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 58209bd0c94c..7d4fb7bf2593 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1526,6 +1526,27 @@ static void mlx4_en_linkstate(struct work_struct *work) mutex_unlock(&mdev->state_lock); } +static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx]; + int numa_node = priv->mdev->dev->numa_node; + int ret = 0; + + if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_set_cpu_local_first(ring_idx, numa_node, + ring->affinity_mask); + if (ret) + free_cpumask_var(ring->affinity_mask); + + return ret; +} + +static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx) +{ + free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask); +} int mlx4_en_start_port(struct net_device *dev) { @@ -1567,9 +1588,16 @@ int mlx4_en_start_port(struct net_device *dev) mlx4_en_cq_init_lock(cq); + err = mlx4_en_init_affinity_hint(priv, i); + if (err) { + en_err(priv, "Failed preparing IRQ affinity hint\n"); + goto cq_err; + } + err = mlx4_en_activate_cq(priv, cq, i); if (err) { en_err(priv, "Failed activating Rx CQ\n"); + mlx4_en_free_affinity_hint(priv, i); goto cq_err; } for (j = 0; j < cq->size; j++) @@ -1578,6 +1606,7 @@ int mlx4_en_start_port(struct net_device *dev) if (err) { en_err(priv, "Failed setting cq moderation parameters\n"); mlx4_en_deactivate_cq(priv, cq); + mlx4_en_free_affinity_hint(priv, i); goto cq_err; } mlx4_en_arm_cq(priv, cq); @@ -1715,8 +1744,10 @@ rss_err: mac_err: mlx4_en_put_qp(priv); cq_err: - while (rx_index--) + while (rx_index--) { mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]); + mlx4_en_free_affinity_hint(priv, i); + } for (i = 0; i < priv->rx_ring_num; i++) mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); @@ -1847,6 +1878,8 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) msleep(1); mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]); mlx4_en_deactivate_cq(priv, cq); + + mlx4_en_free_affinity_hint(priv, i); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index b5db1bf361dc..0e15295bedd6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -313,6 +313,7 @@ struct mlx4_en_rx_ring { unsigned long csum_ok; unsigned long csum_none; int hwtstamp_rx_filter; + cpumask_var_t affinity_mask; }; struct mlx4_en_cq {