From 0211e12dd0a5385ecffd3557bc570dbad7fcf245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Apr 2018 12:40:07 +0200 Subject: [PATCH 1/5] genirq/affinity: Don't return with empty affinity masks on error When the allocation of node_to_possible_cpumask fails, then irq_create_affinity_masks() returns with a pointer to the empty affinity masks array, which will cause malfunction. Reorder the allocations so the masks array allocation comes last and every failure path returns NULL. Fixes: 9a0ef98e186d ("genirq/affinity: Assign vectors to all present CPUs") Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Ming Lei --- kernel/irq/affinity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a37a3b4b6342..e0665549af59 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -108,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks; + struct cpumask *masks = NULL; cpumask_var_t nmsk, *node_to_possible_cpumask; /* @@ -121,13 +121,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto out; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); if (!node_to_possible_cpumask) - goto out; + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) @@ -192,8 +192,9 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); +outnodemsk: free_node_to_possible_cpumask(node_to_possible_cpumask); -out: +outcpumsk: free_cpumask_var(nmsk); return masks; } From 47778f33dcba7feb92031643b37e477892f82b62 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:55 +0800 Subject: [PATCH 2/5] genirq/affinity: Rename *node_to_possible_cpumask as *node_to_cpumask The following patches will introduce two stage irq spreading for improving irq spread on all possible CPUs. No functional change. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-2-ming.lei@redhat.com --- kernel/irq/affinity.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e0665549af59..272c968d9ef1 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_possible_cpumask(void) +static cpumask_var_t *alloc_node_to_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_possible_cpumask(cpumask_var_t *masks) +static void free_node_to_cpumask(cpumask_var_t *masks) { int node; @@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_possible_cpumask(cpumask_var_t *masks) +static void build_node_to_cpumask(cpumask_var_t *masks) { int cpu; @@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_possible_cpumask; + cpumask_var_t nmsk, *node_to_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -121,8 +121,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - node_to_possible_cpumask = alloc_node_to_possible_cpumask(); - if (!node_to_possible_cpumask) + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) goto outcpumsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_possible_cpumask(node_to_possible_cpumask); - nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, + build_node_to_cpumask(node_to_cpumask); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_possible_cpumask[n]); + node_to_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -193,7 +193,7 @@ done: for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); outnodemsk: - free_node_to_possible_cpumask(node_to_possible_cpumask); + free_node_to_cpumask(node_to_cpumask); outcpumsk: free_cpumask_var(nmsk); return masks; From b3e6aaa8d94d618e685c4df08bef991a4fb43923 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:56 +0800 Subject: [PATCH 3/5] genirq/affinity: Move actual irq vector spreading into a helper function No functional change, just prepare for converting to 2-stage irq vector spreading. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-3-ming.lei@redhat.com --- kernel/irq/affinity.c | 97 ++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 272c968d9ef1..a9c36904500c 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,50 +94,19 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -/** - * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @nvecs: The total number of vectors - * @affd: Description of the affinity requirements - * - * Returns the masks pointer or NULL if allocation failed. - */ -struct cpumask * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct cpumask *masks) { - int n, nodes, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; int last_affv = affv + affd->pre_vectors; + int curvec = affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; - struct cpumask *masks = NULL; - cpumask_var_t nmsk, *node_to_cpumask; + int n, nodes, cpus_per_vec, extra_vecs; - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. - */ - if (!affv) - return NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto outcpumsk; - - masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto outnodemsk; - - /* Fill out vectors at the beginning that don't need affinity */ - for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, irq_default_affinity); - - /* Stabilize the cpumasks */ - get_online_cpus(); - build_node_to_cpumask(node_to_cpumask); - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_possible_mask, - &nodemsk); + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* * If the number of nodes in the mask is greater than or equal the @@ -150,7 +119,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (++curvec == last_affv) break; } - goto done; + goto out; } for_each_node_mask(n, nodemsk) { @@ -160,7 +129,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_possible_mask, node_to_cpumask[n]); + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -186,7 +155,51 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) --nodes; } -done: +out: + return curvec - affd->pre_vectors; +} + +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements + * + * Returns the masks pointer or NULL if allocation failed. + */ +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +{ + cpumask_var_t nmsk, *node_to_cpumask; + struct cpumask *masks = NULL; + int curvec; + + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (nvecs == affd->pre_vectors + affd->post_vectors) + return NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto outcpumsk; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto outnodemsk; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, irq_default_affinity); + + /* Stabilize the cpumasks */ + get_online_cpus(); + build_node_to_cpumask(node_to_cpumask); + curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, + cpu_possible_mask, nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ From 1a2d0914e23aab386f5d5acb689777e24151c2c8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:57 +0800 Subject: [PATCH 4/5] genirq/affinity: Allow irq spreading from a given starting point To support two stage irq vector spreading, it's required to add a starting point to the spreading function. No functional change, just preparatory work for the actual two stage change. [ tglx: Renamed variables, tidied up the code and massaged changelog ] Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Laurence Oberman Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-4-ming.lei@redhat.com --- kernel/irq/affinity.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index a9c36904500c..213695a27ddb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,17 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, +static int irq_build_affinity_masks(const struct irq_affinity *affd, + int startvec, int numvecs, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { - int affv = nvecs - affd->pre_vectors - affd->post_vectors; - int last_affv = affv + affd->pre_vectors; - int curvec = affd->pre_vectors; + int n, nodes, cpus_per_vec, extra_vecs, done = 0; + int last_affv = affd->pre_vectors + numvecs; + int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; - int n, nodes, cpus_per_vec, extra_vecs; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); @@ -112,12 +112,13 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ - if (affv <= nodes) { + if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, - node_to_cpumask[n]); - if (++curvec == last_affv) + cpumask_copy(masks + curvec, node_to_cpumask[n]); + if (++done == numvecs) break; + if (++curvec == last_affv) + curvec = affd->pre_vectors; } goto out; } @@ -126,7 +127,7 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; + vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); @@ -150,13 +151,16 @@ static int irq_build_affinity_masks(int nvecs, const struct irq_affinity *affd, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= last_affv) + done += v; + if (done >= numvecs) break; + if (curvec >= last_affv) + curvec = affd->pre_vectors; --nodes; } out: - return curvec - affd->pre_vectors; + return done; } /** @@ -169,9 +173,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { + int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; cpumask_var_t nmsk, *node_to_cpumask; struct cpumask *masks = NULL; - int curvec; /* * If there aren't any vectors left after applying the pre/post @@ -198,8 +202,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(nvecs, affd, node_to_cpumask, - cpu_possible_mask, nmsk, masks); + curvec += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_possible_mask, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ From d3056812e7dfe6bf4f8ad9e397a9116dd5d32d15 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 8 Mar 2018 18:53:58 +0800 Subject: [PATCH 5/5] genirq/affinity: Spread irq vectors among present CPUs as far as possible Commit 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") tried to spread the interrupts accross all possible CPUs to make sure that in case of phsyical hotplug (e.g. virtualization) the CPUs which get plugged in after the device was initialized are targeted by a hardware queue and the corresponding interrupt. This has a downside in cases where the ACPI tables claim that there are more possible CPUs than present CPUs and the number of interrupts to spread out is smaller than the number of possible CPUs. These bogus ACPI tables are unfortunately not uncommon. In such a case the vector spreading algorithm assigns interrupts to CPUs which can never be utilized and as a consequence these interrupts are unused instead of being mapped to present CPUs. As a result the performance of the device is suboptimal. To fix this spread the interrupt vectors in two stages: 1) Spread as many interrupts as possible among the present CPUs 2) Spread the remaining vectors among non present CPUs On a 8 core system, where CPU 0-3 are present and CPU 4-7 are not present, for a device with 4 queues the resulting interrupt affinity is: 1) Before 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") irq 39, cpu list 0 irq 40, cpu list 1 irq 41, cpu list 2 irq 42, cpu list 3 2) With 84676c1f21 ("genirq/affinity: assign vectors to all possible CPUs") irq 39, cpu list 0-2 irq 40, cpu list 3-4,6 irq 41, cpu list 5 irq 42, cpu list 7 3) With the refined vector spread applied: irq 39, cpu list 0,4 irq 40, cpu list 1,6 irq 41, cpu list 2,5 irq 42, cpu list 3,7 On a 8 core system, where all CPUs are present the resulting interrupt affinity for the 4 queues is: irq 39, cpu list 0,1 irq 40, cpu list 2,3 irq 41, cpu list 4,5 irq 42, cpu list 6,7 This is independent of the number of CPUs which are online at the point of initialization because in such a system the offline CPUs can be easily onlined afterwards, while in non-present CPUs need to be plugged physically or virtually which requires external interaction. The downside of this approach is that in case of physical hotplug the interrupt vector spreading might be suboptimal when CPUs 4-7 are physically plugged. Suboptimal from a NUMA point of view and due to the single target nature of interrupt affinities the later plugged CPUs might not be targeted by interrupts at all. Though, physical hotplug systems are not the common case while the broken ACPI table disease is wide spread. So it's preferred to have as many interrupts as possible utilized at the point where the device is initialized. Block multi-queue devices like NVME create a hardware queue per possible CPU, so the goal of commit 84676c1f21 to assign one interrupt vector per possible CPU is still achieved even with physical/virtual hotplug. [ tglx: Changed from online to present CPUs for the first spreading stage, renamed variables for readability sake, added comments and massaged changelog ] Reported-by: Laurence Oberman Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20180308105358.1506-5-ming.lei@redhat.com --- kernel/irq/affinity.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 213695a27ddb..f4f29b9d90ee 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -106,6 +106,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; + if (!cpumask_weight(cpu_mask)) + return 0; + nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); /* @@ -173,8 +176,9 @@ out: struct cpumask * irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int curvec, affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - cpumask_var_t nmsk, *node_to_cpumask; + int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + int curvec, usedvecs; + cpumask_var_t nmsk, npresmsk, *node_to_cpumask; struct cpumask *masks = NULL; /* @@ -187,9 +191,12 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto outcpumsk; + node_to_cpumask = alloc_node_to_cpumask(); if (!node_to_cpumask) - goto outcpumsk; + goto outnpresmsk; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) @@ -202,16 +209,40 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); build_node_to_cpumask(node_to_cpumask); - curvec += irq_build_affinity_masks(affd, curvec, affvecs, - node_to_cpumask, cpu_possible_mask, - nmsk, masks); + + /* Spread on present CPUs starting from affd->pre_vectors */ + usedvecs = irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, cpu_present_mask, + nmsk, masks); + + /* + * Spread on non present CPUs starting from the next vector to be + * handled. If the spreading of present CPUs already exhausted the + * vector space, assign the non present CPUs to the already spread + * out vectors. + */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors; + else + curvec = affd->pre_vectors + usedvecs; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + usedvecs += irq_build_affinity_masks(affd, curvec, affvecs, + node_to_cpumask, npresmsk, + nmsk, masks); put_online_cpus(); /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); + outnodemsk: free_node_to_cpumask(node_to_cpumask); +outnpresmsk: + free_cpumask_var(npresmsk); outcpumsk: free_cpumask_var(nmsk); return masks;