From 1f90c7f3470580e24da25f6a6c1fb480ed9371ac Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 4 Feb 2017 18:05:06 +0100 Subject: [PATCH 1/4] bridge: modify bridge and port to have often accessed fields in one cache line Move around net_bridge so the vlan fields are in the beginning since they're checked on every packet even if vlan filtering is disabled. For the port move flags & vlan group to the beginning, so they're in the same cache line with the port's state (both flags and state are checked on each packet). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 43 +++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 40177df45ba6..ec8560349b6f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -212,12 +212,16 @@ struct net_bridge_mdb_htable u32 ver; }; -struct net_bridge_port -{ +struct net_bridge_port { struct net_bridge *br; struct net_device *dev; struct list_head list; + unsigned long flags; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + struct net_bridge_vlan_group __rcu *vlgrp; +#endif + /* STP */ u8 priority; u8 state; @@ -238,8 +242,6 @@ struct net_bridge_port struct kobject kobj; struct rcu_head rcu; - unsigned long flags; - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; #if IS_ENABLED(CONFIG_IPV6) @@ -259,9 +261,6 @@ struct net_bridge_port #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif -#ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct net_bridge_vlan_group __rcu *vlgrp; -#endif #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif @@ -283,14 +282,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device * rtnl_dereference(dev->rx_handler_data) : NULL; } -struct net_bridge -{ +struct net_bridge { spinlock_t lock; + spinlock_t hash_lock; struct list_head port_list; struct net_device *dev; - struct pcpu_sw_netstats __percpu *stats; - spinlock_t hash_lock; + /* These fields are accessed on each packet */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + u8 vlan_enabled; + u8 vlan_stats_enabled; + __be16 vlan_proto; + u16 default_pvid; + struct net_bridge_vlan_group __rcu *vlgrp; +#endif + struct hlist_head hash[BR_HASH_SIZE]; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { @@ -308,6 +314,9 @@ struct net_bridge bridge_id designated_root; bridge_id bridge_id; u32 root_path_cost; + unsigned char topology_change; + unsigned char topology_change_detected; + u16 root_port; unsigned long max_age; unsigned long hello_time; unsigned long forward_delay; @@ -319,7 +328,6 @@ struct net_bridge u8 group_addr[ETH_ALEN]; bool group_addr_set; - u16 root_port; enum { BR_NO_STP, /* no spanning tree */ @@ -327,9 +335,6 @@ struct net_bridge BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; - unsigned char topology_change; - unsigned char topology_change_detected; - #ifdef CONFIG_BRIDGE_IGMP_SNOOPING unsigned char multicast_router; @@ -381,14 +386,6 @@ struct net_bridge #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif - -#ifdef CONFIG_BRIDGE_VLAN_FILTERING - struct net_bridge_vlan_group __rcu *vlgrp; - u8 vlan_enabled; - u8 vlan_stats_enabled; - __be16 vlan_proto; - u16 default_pvid; -#endif }; struct br_input_skb_cb { From f7cdee8a79a1cb03fa9ca71b825e72f880b344e1 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 4 Feb 2017 18:05:07 +0100 Subject: [PATCH 2/4] bridge: move to workqueue gc Move the fdb garbage collector to a workqueue which fires at least 10 milliseconds apart and cleans chain by chain allowing for other tasks to run in the meantime. When having thousands of fdbs the system is much more responsive. Most importantly remove the need to check if the matched entry has expired in __br_fdb_get that causes false-sharing and is completely unnecessary if we cleanup entries, at worst we'll get 10ms of traffic for that entry before it gets deleted. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_device.c | 1 + net/bridge/br_fdb.c | 31 +++++++++++++++++++------------ net/bridge/br_if.c | 2 +- net/bridge/br_ioctl.c | 2 +- net/bridge/br_netlink.c | 2 +- net/bridge/br_private.h | 4 ++-- net/bridge/br_stp.c | 2 +- net/bridge/br_stp_if.c | 4 ++-- net/bridge/br_stp_timer.c | 2 -- net/bridge/br_sysfs_br.c | 2 +- 10 files changed, 29 insertions(+), 23 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 5ba0b558f8ae..d208ee9ab60a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -411,4 +411,5 @@ void br_dev_setup(struct net_device *dev) br_netfilter_rtable_init(br); br_stp_timer_init(br); br_multicast_init(br); + INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e4a4176171c9..5cbed5c0db88 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -154,7 +154,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) if (f->added_by_external_learn) fdb_del_external_learn(f); - hlist_del_rcu(&f->hlist); + hlist_del_init_rcu(&f->hlist); fdb_notify(br, f, RTM_DELNEIGH); call_rcu(&f->rcu, fdb_rcu_free); } @@ -290,34 +290,43 @@ out: spin_unlock_bh(&br->hash_lock); } -void br_fdb_cleanup(unsigned long _data) +void br_fdb_cleanup(struct work_struct *work) { - struct net_bridge *br = (struct net_bridge *)_data; + struct net_bridge *br = container_of(work, struct net_bridge, + gc_work.work); unsigned long delay = hold_time(br); - unsigned long next_timer = jiffies + br->ageing_time; + unsigned long work_delay = delay; + unsigned long now = jiffies; int i; - spin_lock(&br->hash_lock); for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; struct hlist_node *n; + if (!br->hash[i].first) + continue; + + spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) { unsigned long this_timer; + if (f->is_static) continue; if (f->added_by_external_learn) continue; this_timer = f->updated + delay; - if (time_before_eq(this_timer, jiffies)) + if (time_after(this_timer, now)) + work_delay = min(work_delay, this_timer - now); + else fdb_delete(br, f); - else if (time_before(this_timer, next_timer)) - next_timer = this_timer; } + spin_unlock_bh(&br->hash_lock); + cond_resched(); } - spin_unlock(&br->hash_lock); - mod_timer(&br->gc_timer, round_jiffies_up(next_timer)); + /* Cleanup minimum 10 milliseconds apart */ + work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); + mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } /* Completely flush all dynamic entries in forwarding database.*/ @@ -382,8 +391,6 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, &br->hash[br_mac_hash(addr, vid)], hlist) { if (ether_addr_equal(fdb->addr.addr, addr) && fdb->vlan_id == vid) { - if (unlikely(has_expired(br, fdb))) - break; return fdb; } } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index ed0dd3340084..8ac1770aa222 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -313,7 +313,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) br_vlan_flush(br); br_multicast_dev_del(br); - del_timer_sync(&br->gc_timer); + cancel_delayed_work_sync(&br->gc_work); br_sysfs_delbr(br->dev); unregister_netdevice_queue(br->dev, head); diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index da8157c57eb1..7970f8540cbb 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); - b.gc_timer_value = br_timer_value(&br->gc_timer); + b.gc_timer_value = br_timer_value(&br->gc_work.timer); rcu_read_unlock(); if (copy_to_user((void __user *)args[1], &b, sizeof(b))) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index fc5d885dbb22..1cbdc5b96aa7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1250,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = br_timer_value(&br->gc_timer); + clockval = br_timer_value(&br->gc_work.timer); if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ec8560349b6f..47fd64bf5022 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -379,7 +379,7 @@ struct net_bridge { struct timer_list hello_timer; struct timer_list tcn_timer; struct timer_list topology_change_timer; - struct timer_list gc_timer; + struct delayed_work gc_work; struct kobject *ifobj; u32 auto_cnt; @@ -502,7 +502,7 @@ void br_fdb_find_delete_local(struct net_bridge *br, const unsigned char *addr, u16 vid); void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); -void br_fdb_cleanup(unsigned long arg); +void br_fdb_cleanup(struct work_struct *work); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 71fd1a4e63cc..8f56c2d1f1a7 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) br->ageing_time = t; spin_unlock_bh(&br->lock); - mod_timer(&br->gc_timer, jiffies); + mod_delayed_work(system_long_wq, &br->gc_work, 0); return 0; } diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 6c1e21411125..08341d2aa9c9 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br) spin_lock_bh(&br->lock); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, jiffies + br->hello_time); - mod_timer(&br->gc_timer, jiffies + HZ/10); + mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10); br_config_bpdu_generation(br); @@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br) del_timer_sync(&br->hello_timer); del_timer_sync(&br->topology_change_timer); del_timer_sync(&br->tcn_timer); - del_timer_sync(&br->gc_timer); + cancel_delayed_work_sync(&br->gc_work); } /* called under bridge lock */ diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 7ddb38e0a06e..c98b3e5c140a 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br) setup_timer(&br->topology_change_timer, br_topology_change_timer_expired, (unsigned long) br); - - setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br); } void br_stp_port_timer_init(struct net_bridge_port *p) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index a18148213b08..0f4034934d56 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -263,7 +263,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); + return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); From 1214628cb1868254e107230c9052f28ff9899b6a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 4 Feb 2017 18:05:08 +0100 Subject: [PATCH 3/4] bridge: move write-heavy fdb members in their own cache line Fdb's used and updated fields are written to on every packet forward and packet receive respectively. Thus if we are receiving packets from a particular fdb, they'll cause false-sharing with everyone who has looked it up (even if it didn't match, since mac/vid share cache line!). The "used" field is even worse since it is updated on every packet forward to that fdb, thus the standard config where X ports use a single gateway results in 100% fdb false-sharing. Note that this patch does not prevent the last scenario, but it makes it better for other bridge participants which are not using that fdb (and are only doing lookups over it). The point is with this move we make sure that only communicating parties get the false-sharing, in a later patch we'll show how to avoid that too. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 47fd64bf5022..1cbbf63a5ef7 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -160,19 +160,21 @@ struct net_bridge_vlan_group { u16 pvid; }; -struct net_bridge_fdb_entry -{ +struct net_bridge_fdb_entry { struct hlist_node hlist; struct net_bridge_port *dst; - unsigned long updated; - unsigned long used; mac_addr addr; __u16 vlan_id; unsigned char is_local:1, is_static:1, added_by_user:1, added_by_external_learn:1; + + /* write-heavy members should not affect lookups */ + unsigned long updated ____cacheline_aligned_in_smp; + unsigned long used; + struct rcu_head rcu; }; From 83a718d6294964fd1b227fa5f1ad001bc1fe7656 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 4 Feb 2017 18:05:09 +0100 Subject: [PATCH 4/4] bridge: fdb: write to used and updated at most once per jiffy Writing once per jiffy is enough to limit the bridge's false sharing. After this change the bridge doesn't show up in the local load HitM stats. Suggested-by: David S. Miller Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 3 ++- net/bridge/br_input.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 5cbed5c0db88..5028691fa68a 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -597,7 +597,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, fdb->dst = source; fdb_modified = true; } - fdb->updated = jiffies; + if (jiffies != fdb->updated) + fdb->updated = jiffies; if (unlikely(added_by_user)) fdb->added_by_user = 1; if (unlikely(fdb_modified)) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fba38d8a1a08..220943f920d2 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -198,7 +198,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (dst->is_local) return br_pass_frame_up(skb); - dst->used = jiffies; + if (jiffies != dst->used) + dst->used = jiffies; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit)