linux/net/ipv4/netfilter/ipt_CLUSTERIP.c

912 lines
22 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/* Cluster IP hashmark target
* (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
* based on ideas of Fabio Olive Leite <olive@unixforge.org>
*
* Development of this code funded by SuSE Linux AG, http://www.suse.com/
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/jhash.h>
#include <linux/bitops.h>
#include <linux/skbuff.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 09:04:11 +01:00
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/netfilter_arp.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/checksum.h>
#include <net/ip.h>
#define CLUSTERIP_VERSION "0.8"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
struct clusterip_config {
struct list_head list; /* list of all configs */
refcount_t refcount; /* reference count */
refcount_t entries; /* number of entries/rules
* referencing us */
__be32 clusterip; /* the IP address */
u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
int ifindex; /* device ifindex */
u_int16_t num_total_nodes; /* total number of nodes */
unsigned long local_nodes; /* node number array */
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde; /* proc dir entry */
#endif
enum clusterip_hashmode hash_mode; /* which hashing mode */
u_int32_t hash_initval; /* hash initialization */
struct rcu_head rcu; /* for call_rcu */
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct net *net; /* netns for pernet list */
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
char ifname[IFNAMSIZ]; /* device ifname */
};
#ifdef CONFIG_PROC_FS
static const struct file_operations clusterip_proc_fops;
#endif
struct clusterip_net {
struct list_head configs;
/* lock protects the configs list */
spinlock_t lock;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *procdir;
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
/* mutex protects the config->pde*/
struct mutex mutex;
#endif
};
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
static unsigned int clusterip_net_id __read_mostly;
static inline struct clusterip_net *clusterip_pernet(struct net *net)
{
return net_generic(net, clusterip_net_id);
}
static inline void
clusterip_config_get(struct clusterip_config *c)
{
refcount_inc(&c->refcount);
}
static void clusterip_config_rcu_free(struct rcu_head *head)
{
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct clusterip_config *config;
struct net_device *dev;
config = container_of(head, struct clusterip_config, rcu);
dev = dev_get_by_name(config->net, config->ifname);
if (dev) {
dev_mc_del(dev, config->clustermac);
dev_put(dev);
}
kfree(config);
}
static inline void
clusterip_config_put(struct clusterip_config *c)
{
if (refcount_dec_and_test(&c->refcount))
call_rcu(&c->rcu, clusterip_config_rcu_free);
}
/* decrease the count of entries using/referencing this config. If last
* entry(rule) is removed, remove the config from lists, but don't free it
* yet, since proc-files could still be holding references */
static inline void
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
clusterip_config_entry_put(struct clusterip_config *c)
{
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct clusterip_net *cn = clusterip_pernet(c->net);
local_bh_disable();
if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
list_del_rcu(&c->list);
spin_unlock(&cn->lock);
local_bh_enable();
/* In case anyone still accesses the file, the open/close
* functions are also incrementing the refcount on their own,
* so it's safe to remove the entry even if it's in use. */
#ifdef CONFIG_PROC_FS
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_lock(&cn->mutex);
if (cn->procdir)
proc_remove(c->pde);
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_unlock(&cn->mutex);
#endif
return;
}
local_bh_enable();
}
static struct clusterip_config *
__clusterip_config_find(struct net *net, __be32 clusterip)
{
struct clusterip_config *c;
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct clusterip_net *cn = clusterip_pernet(net);
list_for_each_entry_rcu(c, &cn->configs, list) {
if (c->clusterip == clusterip)
return c;
}
return NULL;
}
static inline struct clusterip_config *
clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
c = __clusterip_config_find(net, clusterip);
if (c) {
#ifdef CONFIG_PROC_FS
if (!c->pde)
c = NULL;
else
#endif
if (unlikely(!refcount_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry) {
if (unlikely(!refcount_inc_not_zero(&c->entries))) {
clusterip_config_put(c);
c = NULL;
}
}
}
rcu_read_unlock_bh();
return c;
}
static void
clusterip_config_init_nodelist(struct clusterip_config *c,
const struct ipt_clusterip_tgt_info *i)
{
int n;
for (n = 0; n < i->num_local_nodes; n++)
set_bit(i->local_nodes[n] - 1, &c->local_nodes);
}
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
static int
clusterip_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct net *net = dev_net(dev);
struct clusterip_net *cn = clusterip_pernet(net);
struct clusterip_config *c;
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
spin_lock_bh(&cn->lock);
list_for_each_entry_rcu(c, &cn->configs, list) {
switch (event) {
case NETDEV_REGISTER:
if (!strcmp(dev->name, c->ifname)) {
c->ifindex = dev->ifindex;
dev_mc_add(dev, c->clustermac);
}
break;
case NETDEV_UNREGISTER:
if (dev->ifindex == c->ifindex) {
dev_mc_del(dev, c->clustermac);
c->ifindex = -1;
}
break;
case NETDEV_CHANGENAME:
if (!strcmp(dev->name, c->ifname)) {
c->ifindex = dev->ifindex;
dev_mc_add(dev, c->clustermac);
} else if (dev->ifindex == c->ifindex) {
dev_mc_del(dev, c->clustermac);
c->ifindex = -1;
}
break;
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
}
}
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
spin_unlock_bh(&cn->lock);
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
return NOTIFY_DONE;
}
static struct clusterip_config *
clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
__be32 ip, const char *iniface)
{
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct clusterip_net *cn = clusterip_pernet(net);
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
struct clusterip_config *c;
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct net_device *dev;
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
int err;
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
if (iniface[0] == '\0') {
pr_info("Please specify an interface name\n");
return ERR_PTR(-EINVAL);
}
c = kzalloc(sizeof(*c), GFP_ATOMIC);
if (!c)
return ERR_PTR(-ENOMEM);
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
dev = dev_get_by_name(net, iniface);
if (!dev) {
pr_info("no such interface %s\n", iniface);
kfree(c);
return ERR_PTR(-ENOENT);
}
c->ifindex = dev->ifindex;
strcpy(c->ifname, dev->name);
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
dev_mc_add(dev, c->clustermac);
dev_put(dev);
c->clusterip = ip;
c->num_total_nodes = i->num_total_nodes;
clusterip_config_init_nodelist(c, i);
c->hash_mode = i->hash_mode;
c->hash_initval = i->hash_initval;
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
c->net = net;
refcount_set(&c->refcount, 1);
spin_lock_bh(&cn->lock);
if (__clusterip_config_find(net, ip)) {
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
err = -EBUSY;
goto out_config_put;
}
list_add_rcu(&c->list, &cn->configs);
spin_unlock_bh(&cn->lock);
#ifdef CONFIG_PROC_FS
{
char buffer[16];
/* create proc dir entry */
sprintf(buffer, "%pI4", &ip);
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_lock(&cn->mutex);
c->pde = proc_create_data(buffer, 0600,
cn->procdir,
&clusterip_proc_fops, c);
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_unlock(&cn->mutex);
if (!c->pde) {
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
err = -ENOMEM;
goto err;
}
}
#endif
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
refcount_set(&c->entries, 1);
return c;
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
#ifdef CONFIG_PROC_FS
err:
#endif
spin_lock_bh(&cn->lock);
list_del_rcu(&c->list);
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
out_config_put:
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
spin_unlock_bh(&cn->lock);
clusterip_config_put(c);
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
return ERR_PTR(err);
}
#ifdef CONFIG_PROC_FS
static int
clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
{
if (nodenum == 0 ||
nodenum > c->num_total_nodes)
return 1;
/* check if we already have this number in our bitfield */
if (test_and_set_bit(nodenum - 1, &c->local_nodes))
return 1;
return 0;
}
static bool
clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
{
if (nodenum == 0 ||
nodenum > c->num_total_nodes)
return true;
if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
return false;
return true;
}
#endif
static inline u_int32_t
clusterip_hashfn(const struct sk_buff *skb,
const struct clusterip_config *config)
{
const struct iphdr *iph = ip_hdr(skb);
unsigned long hashval;
u_int16_t sport = 0, dport = 0;
int poff;
poff = proto_ports_offset(iph->protocol);
if (poff >= 0) {
const u_int16_t *ports;
u16 _ports[2];
ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
if (ports) {
sport = ports[0];
dport = ports[1];
}
} else {
net_info_ratelimited("unknown protocol %u\n", iph->protocol);
}
switch (config->hash_mode) {
case CLUSTERIP_HASHMODE_SIP:
hashval = jhash_1word(ntohl(iph->saddr),
config->hash_initval);
break;
case CLUSTERIP_HASHMODE_SIP_SPT:
hashval = jhash_2words(ntohl(iph->saddr), sport,
config->hash_initval);
break;
case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
config->hash_initval);
break;
default:
/* to make gcc happy */
hashval = 0;
/* This cannot happen, unless the check function wasn't called
* at rule load time */
pr_info("unknown mode %u\n", config->hash_mode);
BUG();
break;
}
/* node numbers are 1..n, not 0..n */
return reciprocal_scale(hashval, config->num_total_nodes) + 1;
}
static inline int
clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
{
return test_bit(hash - 1, &config->local_nodes);
}
/***********************************************************************
* IPTABLES TARGET
***********************************************************************/
static unsigned int
clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
u_int32_t hash;
/* don't need to clusterip_config_get() here, since refcount
* is only decremented by destroy() - and ip_tables guarantees
* that the ->target() function isn't called after ->destroy() */
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
return NF_DROP;
/* special case: ICMP error handling. conntrack distinguishes between
* error messages (RELATED) and information requests (see below) */
if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
(ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY))
return XT_CONTINUE;
/* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO,
* TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here
* on, which all have an ID field [relevant for hashing]. */
hash = clusterip_hashfn(skb, cipinfo->config);
switch (ctinfo) {
case IP_CT_NEW:
ct->mark = hash;
break;
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
/* FIXME: we don't handle expectations at the moment.
* They can arrive on a different node than
* the master connection (e.g. FTP passive mode) */
case IP_CT_ESTABLISHED:
case IP_CT_ESTABLISHED_REPLY:
break;
default: /* Prevent gcc warnings */
break;
}
#ifdef DEBUG
nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
#endif
pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
if (!clusterip_responsible(cipinfo->config, hash)) {
pr_debug("not responsible\n");
return NF_DROP;
}
pr_debug("responsible\n");
/* despite being received via linklayer multicast, this is
* actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
skb->pkt_type = PACKET_HOST;
return XT_CONTINUE;
}
static int clusterip_tg_check(const struct xt_tgchk_param *par)
{
struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
struct clusterip_config *config;
int ret, i;
if (par->nft_compat) {
pr_err("cannot use CLUSTERIP target from nftables compat\n");
return -EOPNOTSUPP;
}
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
pr_info("unknown mode %u\n", cipinfo->hash_mode);
return -EINVAL;
}
if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
e->ip.dst.s_addr == 0) {
pr_info("Please specify destination IP\n");
return -EINVAL;
}
if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) {
pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes);
return -EINVAL;
}
for (i = 0; i < cipinfo->num_local_nodes; i++) {
if (cipinfo->local_nodes[i] - 1 >=
sizeof(config->local_nodes) * 8) {
pr_info("bad local_nodes[%d] %u\n",
i, cipinfo->local_nodes[i]);
return -EINVAL;
}
}
config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
pr_info("no config found for %pI4, need 'new'\n",
&e->ip.dst.s_addr);
return -EINVAL;
} else {
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
config = clusterip_config_init(par->net, cipinfo,
e->ip.dst.s_addr,
e->ip.iniface);
if (IS_ERR(config))
return PTR_ERR(config);
}
} else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) {
clusterip_config_entry_put(config);
clusterip_config_put(config);
return -EINVAL;
}
ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0) {
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
clusterip_config_entry_put(config);
clusterip_config_put(config);
return ret;
}
if (!par->net->xt.clusterip_deprecated_warning) {
pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
"use xt_cluster instead\n");
par->net->xt.clusterip_deprecated_warning = true;
}
cipinfo->config = config;
return ret;
}
/* drop reference count of cluster config when rule is deleted */
static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
{
const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
/* if no more entries are referencing the config, remove it
* from the list and destroy the proc entry */
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
clusterip_config_entry_put(cipinfo->config);
clusterip_config_put(cipinfo->config);
nf_ct_netns_put(par->net, par->family);
}
#ifdef CONFIG_COMPAT
struct compat_ipt_clusterip_tgt_info
{
u_int32_t flags;
u_int8_t clustermac[6];
u_int16_t num_total_nodes;
u_int16_t num_local_nodes;
u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
u_int32_t hash_mode;
u_int32_t hash_initval;
compat_uptr_t config;
};
#endif /* CONFIG_COMPAT */
static struct xt_target clusterip_tg_reg __read_mostly = {
.name = "CLUSTERIP",
.family = NFPROTO_IPV4,
.target = clusterip_tg,
.checkentry = clusterip_tg_check,
.destroy = clusterip_tg_destroy,
.targetsize = sizeof(struct ipt_clusterip_tgt_info),
.usersize = offsetof(struct ipt_clusterip_tgt_info, config),
#ifdef CONFIG_COMPAT
.compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
#endif /* CONFIG_COMPAT */
.me = THIS_MODULE
};
/***********************************************************************
* ARP MANGLING CODE
***********************************************************************/
/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
struct arp_payload {
u_int8_t src_hw[ETH_ALEN];
__be32 src_ip;
u_int8_t dst_hw[ETH_ALEN];
__be32 dst_ip;
} __packed;
#ifdef DEBUG
static void arp_print(struct arp_payload *payload)
{
#define HBUFFERLEN 30
char hbuffer[HBUFFERLEN];
int j, k;
for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) {
hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
hbuffer[k++] = ':';
}
hbuffer[--k] = '\0';
pr_debug("src %pI4@%s, dst %pI4\n",
&payload->src_ip, hbuffer, &payload->dst_ip);
}
#endif
static unsigned int
arp_mangle(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct arphdr *arp = arp_hdr(skb);
struct arp_payload *payload;
struct clusterip_config *c;
struct net *net = state->net;
/* we don't care about non-ethernet and non-ipv4 ARP */
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
arp->ar_pro != htons(ETH_P_IP) ||
arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
return NF_ACCEPT;
/* we only want to mangle arp requests and replies */
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
return NF_ACCEPT;
payload = (void *)(arp+1);
/* if there is no clusterip configuration for the arp reply's
* source ip, we don't want to mangle it */
c = clusterip_config_find_get(net, payload->src_ip, 0);
if (!c)
return NF_ACCEPT;
/* normally the linux kernel always replies to arp queries of
* addresses on different interfacs. However, in the CLUSTERIP case
* this wouldn't work, since we didn't subscribe the mcast group on
* other interfaces */
netfilter: ipt_CLUSTERIP: do not hold dev It's a terrible thing to hold dev in iptables target. When the dev is being removed, unregister_netdevice has to wait for the dev to become free. dmesg will keep logging the err: kernel:unregister_netdevice: waiting for veth0_in to become free. \ Usage count = 1 until iptables rules with this target are removed manually. The worse thing is when deleting a netns, a virtual nic will be deleted instead of reset to init_net in default_device_ops exit/exit_batch. As it is earlier than to flush the iptables rules in iptable_filter_net_ops exit, unregister_netdevice will block to wait for the nic to become free. As unregister_netdevice is actually waiting for iptables rules flushing while iptables rules have to be flushed after unregister_netdevice. This 'dead lock' will cause unregister_netdevice to block there forever. As the netns is not available to operate at that moment, iptables rules can not even be flushed manually either. The reproducer can be: # ip netns add test # ip link add veth0_in type veth peer name veth0_out # ip link set veth0_in netns test # ip netns exec test ip link set lo up # ip netns exec test ip link set veth0_in up # ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport # ip netns del test This issue can be triggered by all virtual nics with ipt_CLUSTERIP. This patch is to fix it by not holding dev in ipt_CLUSTERIP, but saving the dev->ifindex instead of the dev. As Pablo Neira Ayuso's suggestion, it will refresh c->ifindex and dev's mc by registering a netdevice notifier, just as what xt_TEE does. So it removes the old codes updating dev's mc, and also no need to initialize c->ifindex with dev->ifindex. But as one config can be shared by more than one targets, and the netdev notifier is per config, not per target. It couldn't get e->ip.iniface in the notifier handler. So e->ip.iniface has to be saved into config. Note that for backwards compatibility, this patch doesn't remove the codes checking if the dev exists before creating a config. v1->v2: - As Pablo Neira Ayuso's suggestion, register a netdevice notifier to manage c->ifindex and dev's mc. Reported-by: Jianlin Shi <jishi@redhat.com> Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2017-05-24 15:24:37 +02:00
if (c->ifindex != state->out->ifindex) {
pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n",
c->ifindex, state->out->ifindex);
clusterip_config_put(c);
return NF_ACCEPT;
}
/* mangle reply hardware address */
memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
#ifdef DEBUG
pr_debug("mangled arp reply: ");
arp_print(payload);
#endif
clusterip_config_put(c);
return NF_ACCEPT;
}
static const struct nf_hook_ops cip_arp_ops = {
.hook = arp_mangle,
.pf = NFPROTO_ARP,
.hooknum = NF_ARP_OUT,
.priority = -1
};
/***********************************************************************
* PROC DIR HANDLING
***********************************************************************/
#ifdef CONFIG_PROC_FS
struct clusterip_seq_position {
unsigned int pos; /* position */
unsigned int weight; /* number of bits set == size */
unsigned int bit; /* current bit */
unsigned long val; /* current value */
};
static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
{
struct clusterip_config *c = s->private;
unsigned int weight;
u_int32_t local_nodes;
struct clusterip_seq_position *idx;
/* FIXME: possible race */
local_nodes = c->local_nodes;
weight = hweight32(local_nodes);
if (*pos >= weight)
return NULL;
idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
if (!idx)
return ERR_PTR(-ENOMEM);
idx->pos = *pos;
idx->weight = weight;
idx->bit = ffs(local_nodes);
idx->val = local_nodes;
clear_bit(idx->bit - 1, &idx->val);
return idx;
}
static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
struct clusterip_seq_position *idx = v;
*pos = ++idx->pos;
if (*pos >= idx->weight) {
kfree(v);
return NULL;
}
idx->bit = ffs(idx->val);
clear_bit(idx->bit - 1, &idx->val);
return idx;
}
static void clusterip_seq_stop(struct seq_file *s, void *v)
{
if (!IS_ERR(v))
kfree(v);
}
static int clusterip_seq_show(struct seq_file *s, void *v)
{
struct clusterip_seq_position *idx = v;
if (idx->pos != 0)
seq_putc(s, ',');
seq_printf(s, "%u", idx->bit);
if (idx->pos == idx->weight - 1)
seq_putc(s, '\n');
return 0;
}
static const struct seq_operations clusterip_seq_ops = {
.start = clusterip_seq_start,
.next = clusterip_seq_next,
.stop = clusterip_seq_stop,
.show = clusterip_seq_show,
};
static int clusterip_proc_open(struct inode *inode, struct file *file)
{
int ret = seq_open(file, &clusterip_seq_ops);
if (!ret) {
struct seq_file *sf = file->private_data;
struct clusterip_config *c = PDE_DATA(inode);
sf->private = c;
clusterip_config_get(c);
}
return ret;
}
static int clusterip_proc_release(struct inode *inode, struct file *file)
{
struct clusterip_config *c = PDE_DATA(inode);
int ret;
ret = seq_release(inode, file);
if (!ret)
clusterip_config_put(c);
return ret;
}
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *ofs)
{
struct clusterip_config *c = PDE_DATA(file_inode(file));
#define PROC_WRITELEN 10
char buffer[PROC_WRITELEN+1];
unsigned long nodenum;
int rc;
if (size > PROC_WRITELEN)
return -EIO;
if (copy_from_user(buffer, input, size))
return -EFAULT;
buffer[size] = 0;
if (*buffer == '+') {
rc = kstrtoul(buffer+1, 10, &nodenum);
if (rc)
return rc;
if (clusterip_add_node(c, nodenum))
return -ENOMEM;
} else if (*buffer == '-') {
rc = kstrtoul(buffer+1, 10, &nodenum);
if (rc)
return rc;
if (clusterip_del_node(c, nodenum))
return -ENOENT;
} else
return -EIO;
return size;
}
static const struct file_operations clusterip_proc_fops = {
.open = clusterip_proc_open,
.read = seq_read,
.write = clusterip_proc_write,
.llseek = seq_lseek,
.release = clusterip_proc_release,
};
#endif /* CONFIG_PROC_FS */
static int clusterip_net_init(struct net *net)
{
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct clusterip_net *cn = clusterip_pernet(net);
int ret;
INIT_LIST_HEAD(&cn->configs);
spin_lock_init(&cn->lock);
ret = nf_register_net_hook(net, &cip_arp_ops);
if (ret < 0)
return ret;
#ifdef CONFIG_PROC_FS
cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
if (!cn->procdir) {
nf_unregister_net_hook(net, &cip_arp_ops);
pr_err("Unable to proc dir entry\n");
return -ENOMEM;
}
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_init(&cn->mutex);
#endif /* CONFIG_PROC_FS */
return 0;
}
static void clusterip_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
struct clusterip_net *cn = clusterip_pernet(net);
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_lock(&cn->mutex);
proc_remove(cn->procdir);
cn->procdir = NULL;
netfilter: ipt_CLUSTERIP: fix sleep-in-atomic bug in clusterip_config_entry_put() A proc_remove() can sleep. so that it can't be inside of spin_lock. Hence proc_remove() is moved to outside of spin_lock. and it also adds mutex to sync create and remove of proc entry(config->pde). test commands: SHELL#1 %while :; do iptables -A INPUT -p udp -i enp2s0 -d 192.168.1.100 \ --dport 9000 -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 3 --local-node 3; \ iptables -F; done SHELL#2 %while :; do echo +1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; \ echo -1 > /proc/net/ipt_CLUSTERIP/192.168.1.100; done [ 2949.569864] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 2949.579944] in_atomic(): 1, irqs_disabled(): 0, pid: 5472, name: iptables [ 2949.587920] 1 lock held by iptables/5472: [ 2949.592711] #0: 000000008f0ebcf2 (&(&cn->lock)->rlock){+...}, at: refcount_dec_and_lock+0x24/0x50 [ 2949.603307] CPU: 1 PID: 5472 Comm: iptables Tainted: G W 4.19.0-rc5+ #16 [ 2949.604212] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [ 2949.604212] Call Trace: [ 2949.604212] dump_stack+0xc9/0x16b [ 2949.604212] ? show_regs_print_info+0x5/0x5 [ 2949.604212] ___might_sleep+0x2eb/0x420 [ 2949.604212] ? set_rq_offline.part.87+0x140/0x140 [ 2949.604212] ? _rcu_barrier_trace+0x400/0x400 [ 2949.604212] wait_for_completion+0x94/0x710 [ 2949.604212] ? wait_for_completion_interruptible+0x780/0x780 [ 2949.604212] ? __kernel_text_address+0xe/0x30 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __lockdep_init_map+0x10e/0x5c0 [ 2949.604212] ? __init_waitqueue_head+0x86/0x130 [ 2949.604212] ? init_wait_entry+0x1a0/0x1a0 [ 2949.604212] proc_entry_rundown+0x208/0x270 [ 2949.604212] ? proc_reg_get_unmapped_area+0x370/0x370 [ 2949.604212] ? __lock_acquire+0x4500/0x4500 [ 2949.604212] ? complete+0x18/0x70 [ 2949.604212] remove_proc_subtree+0x143/0x2a0 [ 2949.708655] ? remove_proc_entry+0x390/0x390 [ 2949.708655] clusterip_tg_destroy+0x27a/0x630 [ipt_CLUSTERIP] [ ... ] Fixes: b3e456fce9f5 ("netfilter: ipt_CLUSTERIP: fix a race condition of proc file creation") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:23:13 +01:00
mutex_unlock(&cn->mutex);
#endif
nf_unregister_net_hook(net, &cip_arp_ops);
}
static struct pernet_operations clusterip_net_ops = {
.init = clusterip_net_init,
.exit = clusterip_net_exit,
.id = &clusterip_net_id,
.size = sizeof(struct clusterip_net),
};
static struct notifier_block cip_netdev_notifier = {
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
.notifier_call = clusterip_netdev_event
};
static int __init clusterip_tg_init(void)
{
int ret;
ret = register_pernet_subsys(&clusterip_net_ops);
if (ret < 0)
return ret;
ret = xt_register_target(&clusterip_tg_reg);
if (ret < 0)
goto cleanup_subsys;
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
ret = register_netdevice_notifier(&cip_netdev_notifier);
if (ret < 0)
goto unregister_target;
pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
return 0;
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
unregister_target:
xt_unregister_target(&clusterip_tg_reg);
cleanup_subsys:
unregister_pernet_subsys(&clusterip_net_ops);
return ret;
}
static void __exit clusterip_tg_exit(void)
{
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
netfilter: ipt_CLUSTERIP: fix deadlock in netns exit routine When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier(). But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs. After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list. test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1 splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ] Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo <ap420073@gmail.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2018-11-05 10:22:44 +01:00
unregister_netdevice_notifier(&cip_netdev_notifier);
xt_unregister_target(&clusterip_tg_reg);
unregister_pernet_subsys(&clusterip_net_ops);
/* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */
rcu_barrier();
}
module_init(clusterip_tg_init);
module_exit(clusterip_tg_exit);