From 8142b227ef43119e19acf6122a9eea1a82492645 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 31 Mar 2014 18:14:18 +0400 Subject: [PATCH 01/33] netfilter: nf_conntrack: flush net_gre->keymap_list only from gre helper nf_ct_gre_keymap_flush() removes a nf_ct_gre_keymap object from net_gre->keymap_list and frees the object. But it doesn't clean a reference on this object from ct_pptp_info->keymap[dir]. Then nf_ct_gre_keymap_destroy() may release the same object again. So nf_ct_gre_keymap_flush() can be called only when we are sure that when nf_ct_gre_keymap_destroy will not be called. nf_ct_gre_keymap is created by nf_ct_gre_keymap_add() and the right way to destroy it is to call nf_ct_gre_keymap_destroy(). This patch marks nf_ct_gre_keymap_flush() as static, so this patch can break compilation of third party modules, which use nf_ct_gre_keymap_flush. I'm not sure this is the right way to deprecate this function. [ 226.540793] general protection fault: 0000 [#1] SMP [ 226.541750] Modules linked in: nf_nat_pptp nf_nat_proto_gre nf_conntrack_pptp nf_conntrack_proto_gre ip_gre ip_tunnel gre ppp_deflate bsd_comp ppp_async crc_ccitt ppp_generic slhc xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack veth tun bridge stp llc ppdev microcode joydev pcspkr serio_raw virtio_console virtio_balloon floppy parport_pc parport pvpanic i2c_piix4 virtio_net drm_kms_helper ttm ata_generic virtio_pci virtio_ring virtio drm i2c_core pata_acpi [last unloaded: ip_tunnel] [ 226.541776] CPU: 0 PID: 49 Comm: kworker/u4:2 Not tainted 3.14.0-rc8+ #101 [ 226.541776] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 226.541776] Workqueue: netns cleanup_net [ 226.541776] task: ffff8800371e0000 ti: ffff88003730c000 task.ti: ffff88003730c000 [ 226.541776] RIP: 0010:[] [] __list_del_entry+0x29/0xd0 [ 226.541776] RSP: 0018:ffff88003730dbd0 EFLAGS: 00010a83 [ 226.541776] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8800374e6c40 RCX: dead000000200200 [ 226.541776] RDX: 
6b6b6b6b6b6b6b6b RSI: ffff8800371e07d0 RDI: ffff8800374e6c40 [ 226.541776] RBP: ffff88003730dbd0 R08: 0000000000000000 R09: 0000000000000000 [ 226.541776] R10: 0000000000000001 R11: ffff88003730d92e R12: 0000000000000002 [ 226.541776] R13: ffff88007a4c42d0 R14: ffff88007aef0000 R15: ffff880036cf0018 [ 226.541776] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 [ 226.541776] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 226.541776] CR2: 00007f07f643f7d0 CR3: 0000000036fd2000 CR4: 00000000000006f0 [ 226.541776] Stack: [ 226.541776] ffff88003730dbe8 ffffffff81389c5d ffff8800374ffbe4 ffff88003730dc28 [ 226.541776] ffffffffa0162a43 ffffffffa01627c5 ffff88007a4c42d0 ffff88007aef0000 [ 226.541776] ffffffffa01651c0 ffff88007a4c45e0 ffff88007aef0000 ffff88003730dc40 [ 226.541776] Call Trace: [ 226.541776] [] list_del+0xd/0x30 [ 226.541776] [] nf_ct_gre_keymap_destroy+0x283/0x2d0 [nf_conntrack_proto_gre] [ 226.541776] [] ? nf_ct_gre_keymap_destroy+0x5/0x2d0 [nf_conntrack_proto_gre] [ 226.541776] [] gre_destroy+0x27/0x70 [nf_conntrack_proto_gre] [ 226.541776] [] destroy_conntrack+0x83/0x200 [nf_conntrack] [ 226.541776] [] ? destroy_conntrack+0x27/0x200 [nf_conntrack] [ 226.541776] [] ? nf_conntrack_hash_check_insert+0x2e0/0x2e0 [nf_conntrack] [ 226.541776] [] nf_conntrack_destroy+0x72/0x180 [ 226.541776] [] ? nf_conntrack_destroy+0x5/0x180 [ 226.541776] [] ? kill_l3proto+0x20/0x20 [nf_conntrack] [ 226.541776] [] nf_ct_iterate_cleanup+0x14e/0x170 [nf_conntrack] [ 226.541776] [] nf_ct_l4proto_pernet_unregister+0x5b/0x90 [nf_conntrack] [ 226.541776] [] proto_gre_net_exit+0x19/0x30 [nf_conntrack_proto_gre] [ 226.541776] [] ops_exit_list.isra.1+0x39/0x60 [ 226.541776] [] cleanup_net+0x100/0x1d0 [ 226.541776] [] process_one_work+0x1ea/0x4f0 [ 226.541776] [] ? process_one_work+0x188/0x4f0 [ 226.541776] [] worker_thread+0x11b/0x3a0 [ 226.541776] [] ? process_one_work+0x4f0/0x4f0 [ 226.541776] [] kthread+0xed/0x110 [ 226.541776] [] ? 
_raw_spin_unlock_irq+0x2c/0x40 [ 226.541776] [] ? kthread_create_on_node+0x200/0x200 [ 226.541776] [] ret_from_fork+0x7c/0xb0 [ 226.541776] [] ? kthread_create_on_node+0x200/0x200 [ 226.541776] Code: 00 00 55 48 8b 17 48 b9 00 01 10 00 00 00 ad de 48 8b 47 08 48 89 e5 48 39 ca 74 29 48 b9 00 02 20 00 00 00 ad de 48 39 c8 74 7a <4c> 8b 00 4c 39 c7 75 53 4c 8b 42 08 4c 39 c7 75 2b 48 89 42 08 [ 226.541776] RIP [] __list_del_entry+0x29/0xd0 [ 226.541776] RSP [ 226.612193] ---[ end trace 985ae23ddfcc357c ]--- Cc: Pablo Neira Ayuso Cc: Patrick McHardy Cc: Jozsef Kadlecsik Cc: "David S. Miller" Signed-off-by: Andrey Vagin Signed-off-by: Pablo Neira Ayuso --- .../linux/netfilter/nf_conntrack_proto_gre.h | 1 - net/netfilter/nf_conntrack_pptp.c | 20 +------------------ net/netfilter/nf_conntrack_proto_gre.c | 3 +-- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index ec2ffaf418c8..df78dc2b5524 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -87,7 +87,6 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, /* delete keymap entries */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct); -void nf_ct_gre_keymap_flush(struct net *net); void nf_nat_need_gre(void); #endif /* __KERNEL__ */ diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7bd03decd36c..825c3e3f8305 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -605,32 +605,14 @@ static struct nf_conntrack_helper pptp __read_mostly = { .expect_policy = &pptp_exp_policy, }; -static void nf_conntrack_pptp_net_exit(struct net *net) -{ - nf_ct_gre_keymap_flush(net); -} - -static struct pernet_operations nf_conntrack_pptp_net_ops = { - .exit = nf_conntrack_pptp_net_exit, -}; - static int __init nf_conntrack_pptp_init(void) { - int rv; - - rv = 
nf_conntrack_helper_register(&pptp); - if (rv < 0) - return rv; - rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); - if (rv < 0) - nf_conntrack_helper_unregister(&pptp); - return rv; + return nf_conntrack_helper_register(&pptp); } static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9d9c0dade602..d5665739e3b1 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -66,7 +66,7 @@ static inline struct netns_proto_gre *gre_pernet(struct net *net) return net_generic(net, proto_gre_net_id); } -void nf_ct_gre_keymap_flush(struct net *net) +static void nf_ct_gre_keymap_flush(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km, *tmp; @@ -78,7 +78,6 @@ void nf_ct_gre_keymap_flush(struct net *net) } write_unlock_bh(&net_gre->keymap_lock); } -EXPORT_SYMBOL(nf_ct_gre_keymap_flush); static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) From b04c46190219a4f845e46a459e3102137b7f6cac Mon Sep 17 00:00:00 2001 From: "Wang, Xiaoming" Date: Mon, 14 Apr 2014 12:30:45 -0400 Subject: [PATCH 02/33] net: ipv4: current group_info should be put after using. Plug a group_info refcount leak in ping_init. group_info is only needed during initialization and the code failed to release the reference on exit. While here move grabbing the reference to a place where it is actually needed. Signed-off-by: Chuansheng Liu Signed-off-by: Zhang Dongxing Signed-off-by: xiaoming wang Signed-off-by: David S. 
Miller --- net/ipv4/ping.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index f4b19e5dde54..8210964a9f19 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -252,26 +252,33 @@ int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); - struct group_info *group_info = get_current_groups(); - int i, j, count = group_info->ngroups; + struct group_info *group_info; + int i, j, count; kgid_t low, high; + int ret = 0; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; + group_info = get_current_groups(); + count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); for (j = 0; j < cp_count; j++) { kgid_t gid = group_info->blocks[i][j]; if (gid_lte(low, gid) && gid_lte(gid, high)) - return 0; + goto out_release_group; } count -= cp_count; } - return -EACCES; + ret = -EACCES; + +out_release_group: + put_group_info(group_info); + return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); From befdf8978accecac2e0739e6b5075afc62db37fe Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 14 Apr 2014 09:51:19 +0800 Subject: [PATCH 03/33] net/mlx4_core: Preserve pci_dev_data after __mlx4_remove_one() pci_match_id() just matches the static pci_device_id, which may return NULL if someone binds the driver to a device manually using /sys/bus/pci/drivers/.../new_id. This patch wraps up a helper function __mlx4_remove_one() which does the tear down function but preserves the drv_data. Functions like mlx4_pci_err_detected() and mlx4_restart_one() will call this one without releasing drvdata. Fixes: 97a5221 "net/mlx4_core: pass pci_device_id.driver_data to __mlx4_init_one during reset". CC: Bjorn Helgaas CC: Amir Vadai CC: Jack Morgenstein CC: Or Gerlitz Signed-off-by: Wei Yang Acked-by: Jack Morgenstein Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 186 ++++++++++++---------- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 1 + 2 files changed, 103 insertions(+), 84 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index f0ae95f66ceb..4b86c7af2a7a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2301,13 +2301,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) /* Allow large DMA segments, up to the firmware limit of 1 GB */ dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) { - err = -ENOMEM; - goto err_release_regions; - } - - dev = &priv->dev; + dev = pci_get_drvdata(pdev); + priv = mlx4_priv(dev); dev->pdev = pdev; INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); @@ -2535,8 +2530,7 @@ slave_start: mlx4_sense_init(dev); mlx4_start_sense(dev); - priv->pci_dev_data = pci_dev_data; - pci_set_drvdata(pdev, dev); + priv->removed = 0; return 0; @@ -2604,85 +2598,109 @@ err_disable_pdev: static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { + struct mlx4_priv *priv; + struct mlx4_dev *dev; + printk_once(KERN_INFO "%s", mlx4_version); + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + dev = &priv->dev; + pci_set_drvdata(pdev, dev); + priv->pci_dev_data = id->driver_data; + return __mlx4_init_one(pdev, id->driver_data); } +static void __mlx4_remove_one(struct pci_dev *pdev) +{ + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + int pci_dev_data; + int p; + + if (priv->removed) + return; + + pci_dev_data = priv->pci_dev_data; + + /* in SRIOV it is not allowed to unload the pf's + * driver while there are alive vf's */ + if (mlx4_is_master(dev) && mlx4_how_many_lives_vf(dev)) + printk(KERN_ERR "Removing PF when there are assigned VF's !!!\n"); + 
mlx4_stop_sense(dev); + mlx4_unregister_device(dev); + + for (p = 1; p <= dev->caps.num_ports; p++) { + mlx4_cleanup_port_info(&priv->port[p]); + mlx4_CLOSE_PORT(dev, p); + } + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_SLAVES_ONLY); + + mlx4_cleanup_counters_table(dev); + mlx4_cleanup_qp_table(dev); + mlx4_cleanup_srq_table(dev); + mlx4_cleanup_cq_table(dev); + mlx4_cmd_use_polling(dev); + mlx4_cleanup_eq_table(dev); + mlx4_cleanup_mcg_table(dev); + mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); + mlx4_cleanup_pd_table(dev); + + if (mlx4_is_master(dev)) + mlx4_free_resource_tracker(dev, + RES_TR_FREE_STRUCTS_ONLY); + + iounmap(priv->kar); + mlx4_uar_free(dev, &priv->driver_uar); + mlx4_cleanup_uar_table(dev); + if (!mlx4_is_slave(dev)) + mlx4_clear_steering(dev); + mlx4_free_eq_table(dev); + if (mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_close_hca(dev); + if (mlx4_is_slave(dev)) + mlx4_multi_func_cleanup(dev); + mlx4_cmd_cleanup(dev); + + if (dev->flags & MLX4_FLAG_MSI_X) + pci_disable_msix(pdev); + if (dev->flags & MLX4_FLAG_SRIOV) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + } + + if (!mlx4_is_slave(dev)) + mlx4_free_ownership(dev); + + kfree(dev->caps.qp0_tunnel); + kfree(dev->caps.qp0_proxy); + kfree(dev->caps.qp1_tunnel); + kfree(dev->caps.qp1_proxy); + kfree(dev->dev_vfs); + + pci_release_regions(pdev); + pci_disable_device(pdev); + memset(priv, 0, sizeof(*priv)); + priv->pci_dev_data = pci_dev_data; + priv->removed = 1; +} + static void mlx4_remove_one(struct pci_dev *pdev) { struct mlx4_dev *dev = pci_get_drvdata(pdev); struct mlx4_priv *priv = mlx4_priv(dev); - int p; - if (dev) { - /* in SRIOV it is not allowed to unload the pf's - * driver while there are alive vf's */ - if (mlx4_is_master(dev)) { - if (mlx4_how_many_lives_vf(dev)) - printk(KERN_ERR "Removing PF when there are assigned VF's !!!\n"); - } - mlx4_stop_sense(dev); - mlx4_unregister_device(dev); - - for 
(p = 1; p <= dev->caps.num_ports; p++) { - mlx4_cleanup_port_info(&priv->port[p]); - mlx4_CLOSE_PORT(dev, p); - } - - if (mlx4_is_master(dev)) - mlx4_free_resource_tracker(dev, - RES_TR_FREE_SLAVES_ONLY); - - mlx4_cleanup_counters_table(dev); - mlx4_cleanup_qp_table(dev); - mlx4_cleanup_srq_table(dev); - mlx4_cleanup_cq_table(dev); - mlx4_cmd_use_polling(dev); - mlx4_cleanup_eq_table(dev); - mlx4_cleanup_mcg_table(dev); - mlx4_cleanup_mr_table(dev); - mlx4_cleanup_xrcd_table(dev); - mlx4_cleanup_pd_table(dev); - - if (mlx4_is_master(dev)) - mlx4_free_resource_tracker(dev, - RES_TR_FREE_STRUCTS_ONLY); - - iounmap(priv->kar); - mlx4_uar_free(dev, &priv->driver_uar); - mlx4_cleanup_uar_table(dev); - if (!mlx4_is_slave(dev)) - mlx4_clear_steering(dev); - mlx4_free_eq_table(dev); - if (mlx4_is_master(dev)) - mlx4_multi_func_cleanup(dev); - mlx4_close_hca(dev); - if (mlx4_is_slave(dev)) - mlx4_multi_func_cleanup(dev); - mlx4_cmd_cleanup(dev); - - if (dev->flags & MLX4_FLAG_MSI_X) - pci_disable_msix(pdev); - if (dev->flags & MLX4_FLAG_SRIOV) { - mlx4_warn(dev, "Disabling SR-IOV\n"); - pci_disable_sriov(pdev); - } - - if (!mlx4_is_slave(dev)) - mlx4_free_ownership(dev); - - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - kfree(dev->caps.qp1_proxy); - kfree(dev->dev_vfs); - - kfree(priv); - pci_release_regions(pdev); - pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); - } + __mlx4_remove_one(pdev); + kfree(priv); + pci_set_drvdata(pdev, NULL); } int mlx4_restart_one(struct pci_dev *pdev) @@ -2692,7 +2710,7 @@ int mlx4_restart_one(struct pci_dev *pdev) int pci_dev_data; pci_dev_data = priv->pci_dev_data; - mlx4_remove_one(pdev); + __mlx4_remove_one(pdev); return __mlx4_init_one(pdev, pci_dev_data); } @@ -2747,7 +2765,7 @@ MODULE_DEVICE_TABLE(pci, mlx4_pci_table); static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - mlx4_remove_one(pdev); + __mlx4_remove_one(pdev); return state == 
pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; @@ -2755,11 +2773,11 @@ static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev, static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) { - const struct pci_device_id *id; - int ret; + struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = mlx4_priv(dev); + int ret; - id = pci_match_id(mlx4_pci_table, pdev); - ret = __mlx4_init_one(pdev, id->driver_data); + ret = __mlx4_init_one(pdev, priv->pci_dev_data); return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index cf8be41abb36..f9c465101963 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -800,6 +800,7 @@ struct mlx4_priv { spinlock_t ctx_lock; int pci_dev_data; + int removed; struct list_head pgdir_list; struct mutex pgdir_mutex; From cc6ca3023f2c2bbcd062e9d4cf6afc2ba2821ada Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 13 Apr 2014 11:15:33 +0200 Subject: [PATCH 04/33] Revert "net: mvneta: fix usage as a module on RGMII configurations" This reverts commit e3a8786c10e75903f1269474e21fe8cb49c3a670. While this commit allows using the mvneta driver as a module on some configurations, it breaks other configurations even if mvneta is used built-in. This breakage is due to the fact that on some RGMII platforms, the PCS bit has to be set, and on some other platforms, it has to be cleared. At the moment, we lack the information to know exactly the significance of this bit (the datasheet only says "enables PCS"), and so we can't produce a patch that will work on all platforms at this point. And since this change is breaking the network completely for many users, it's much better to revert it for now. We'll come back later with a proper fix that takes into account all platforms. 
Basically: * Armada XP GP is configured as RGMII-ID, and needs the PCS bit to be set. * Armada 370 Mirabox is configured as RGMII-ID, and needs the PCS bit to be cleared. And at the moment, we don't know how to make the distinction between those two cases. One hint is that the Armada XP GP appears in fact to be using a QSGMII connection with the PHY (Quad-SGMII), but configuring it as SGMII doesn't work, while RGMII-ID works. This needs more investigation, but in the mean time, let's unbreak the network for all those users. Signed-off-by: Thomas Petazzoni Reported-by: Arnaud Ebalard Reported-by: Alexander Reuter Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=73401 Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 41 +++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index d04b1c3c9b85..b248bcbdae63 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -89,9 +89,8 @@ #define MVNETA_TX_IN_PRGRS BIT(1) #define MVNETA_TX_FIFO_EMPTY BIT(8) #define MVNETA_RX_MIN_FRAME_SIZE 0x247c -#define MVNETA_SERDES_CFG 0x24A0 +#define MVNETA_SGMII_SERDES_CFG 0x24A0 #define MVNETA_SGMII_SERDES_PROTO 0x0cc7 -#define MVNETA_RGMII_SERDES_PROTO 0x0667 #define MVNETA_TYPE_PRIO 0x24bc #define MVNETA_FORCE_UNI BIT(21) #define MVNETA_TXQ_CMD_1 0x24e4 @@ -712,6 +711,35 @@ static void mvneta_rxq_bm_disable(struct mvneta_port *pp, mvreg_write(pp, MVNETA_RXQ_CONFIG_REG(rxq->id), val); } + + +/* Sets the RGMII Enable bit (RGMIIEn) in port MAC control register */ +static void mvneta_gmac_rgmii_set(struct mvneta_port *pp, int enable) +{ + u32 val; + + val = mvreg_read(pp, MVNETA_GMAC_CTRL_2); + + if (enable) + val |= MVNETA_GMAC2_PORT_RGMII; + else + val &= ~MVNETA_GMAC2_PORT_RGMII; + + mvreg_write(pp, MVNETA_GMAC_CTRL_2, val); +} + +/* Config SGMII port */ +static void 
mvneta_port_sgmii_config(struct mvneta_port *pp) +{ + u32 val; + + val = mvreg_read(pp, MVNETA_GMAC_CTRL_2); + val |= MVNETA_GMAC2_PCS_ENABLE; + mvreg_write(pp, MVNETA_GMAC_CTRL_2, val); + + mvreg_write(pp, MVNETA_SGMII_SERDES_CFG, MVNETA_SGMII_SERDES_PROTO); +} + /* Start the Ethernet port RX and TX activity */ static void mvneta_port_up(struct mvneta_port *pp) { @@ -2729,15 +2757,12 @@ static void mvneta_port_power_up(struct mvneta_port *pp, int phy_mode) mvreg_write(pp, MVNETA_UNIT_INTR_CAUSE, 0); if (phy_mode == PHY_INTERFACE_MODE_SGMII) - mvreg_write(pp, MVNETA_SERDES_CFG, MVNETA_SGMII_SERDES_PROTO); - else - mvreg_write(pp, MVNETA_SERDES_CFG, MVNETA_RGMII_SERDES_PROTO); + mvneta_port_sgmii_config(pp); - val = mvreg_read(pp, MVNETA_GMAC_CTRL_2); - - val |= MVNETA_GMAC2_PCS_ENABLE | MVNETA_GMAC2_PORT_RGMII; + mvneta_gmac_rgmii_set(pp, 1); /* Cancel Port Reset */ + val = mvreg_read(pp, MVNETA_GMAC_CTRL_2); val &= ~MVNETA_GMAC2_PORT_RESET; mvreg_write(pp, MVNETA_GMAC_CTRL_2, val); From 91146153da2feab18efab2e13b0945b6bb704ded Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 13 Apr 2014 18:08:02 +0300 Subject: [PATCH 05/33] ipv4: return valid RTA_IIF on ip route get Extend commit 13378cad02afc2adc6c0e07fca03903c7ada0b37 ("ipv4: Change rt->rt_iif encoding.") from 3.6 to return valid RTA_IIF on 'ip route get ... iif DEVICE' instead of rt_iif 0 which is displayed as 'iif *'. inet_iif is not appropriate to use because skb_iif is not set. Use the skb->dev->ifindex instead. Signed-off-by: Julian Anastasov Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34d094cadb11..20a59c388e6e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2357,7 +2357,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } else #endif - if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) + if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) goto nla_put_failure; } From 05ab8f2647e4221cbdb3856dd7d32bd5407316b3 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 13 Apr 2014 18:23:33 +0200 Subject: [PATCH 06/33] filter: prevent nla extensions to peek beyond the end of the message The BPF_S_ANC_NLATTR and BPF_S_ANC_NLATTR_NEST extensions fail to check for a minimal message length before testing the supplied offset to be within the bounds of the message. This allows the subtraction of the nla header to underflow and therefore -- as the data type is unsigned -- allowing far too big offset and length values for the search of the netlink attribute. The remainder calculation for the BPF_S_ANC_NLATTR_NEST extension is also wrong. It has the minuend and subtrahend mixed up, therefore calculates a huge length value, allowing to overrun the end of the message while looking for the netlink attribute. The following three BPF snippets will trigger the bugs when attached to a UNIX datagram socket and parsing a message with length 1, 2 or 3. ,-[ PoC for missing size check in BPF_S_ANC_NLATTR ]-- | ld #0x87654321 | ldx #42 | ld #nla | ret a `--- ,-[ PoC for the same bug in BPF_S_ANC_NLATTR_NEST ]-- | ld #0x87654321 | ldx #42 | ld #nlan | ret a `--- ,-[ PoC for wrong remainder calculation in BPF_S_ANC_NLATTR_NEST ]-- | ; (needs a fake netlink header at offset 0) | ld #0 | ldx #42 | ld #nlan | ret a `--- Fix the first issue by ensuring the message length fulfills the minimal size constraints of a nla header. Fix the second bug by getting the math for the remainder calculation right. 
Fixes: 4738c1db15 ("[SKFILTER]: Add SKF_ADF_NLATTR instruction") Fixes: d214c7537b ("filter: add SKF_AD_NLATTR_NEST to look for nested..") Cc: Patrick McHardy Cc: Pablo Neira Ayuso Signed-off-by: Mathias Krause Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index e08b3822c72a..0e0856f5d708 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -600,6 +600,9 @@ static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) if (skb_is_nonlinear(skb)) return 0; + if (skb->len < sizeof(struct nlattr)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) return 0; @@ -618,11 +621,14 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) if (skb_is_nonlinear(skb)) return 0; + if (skb->len < sizeof(struct nlattr)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[A]; - if (nla->nla_len > A - skb->len) + if (nla->nla_len > skb->len - A) return 0; nla = nla_find_nested(nla, X); From ee214d54bf3d51259adf8917e26dc84df1cab05a Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Fri, 11 Apr 2014 21:34:20 +0400 Subject: [PATCH 07/33] netfilter: nf_conntrack: initialize net.ct.generation [ 251.920788] INFO: trying to register non-static key. [ 251.921386] the code is fine but needs lockdep annotation. [ 251.921386] turning off the locking correctness validator. 
[ 251.921386] CPU: 2 PID: 15715 Comm: socket_listen Not tainted 3.14.0+ #294 [ 251.921386] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 251.921386] 0000000000000000 000000009d18c210 ffff880075f039b8 ffffffff816b7ecd [ 251.921386] ffffffff822c3b10 ffff880075f039c8 ffffffff816b36f4 ffff880075f03aa0 [ 251.921386] ffffffff810c65ff ffffffff810c4a85 00000000fffffe01 ffffffffa0075172 [ 251.921386] Call Trace: [ 251.921386] [] dump_stack+0x45/0x56 [ 251.921386] [] register_lock_class.part.24+0x38/0x3c [ 251.921386] [] __lock_acquire+0x168f/0x1b40 [ 251.921386] [] ? trace_hardirqs_on_caller+0x105/0x1d0 [ 251.921386] [] ? nf_nat_setup_info+0x252/0x3a0 [nf_nat] [ 251.921386] [] ? _raw_spin_unlock_bh+0x35/0x40 [ 251.921386] [] ? nf_nat_setup_info+0x252/0x3a0 [nf_nat] [ 251.921386] [] lock_acquire+0xa2/0x120 [ 251.921386] [] ? ipv4_confirm+0x90/0xf0 [nf_conntrack_ipv4] [ 251.921386] [] __nf_conntrack_confirm+0x129/0x410 [nf_conntrack] [ 251.921386] [] ? ipv4_confirm+0x90/0xf0 [nf_conntrack_ipv4] [ 251.921386] [] ipv4_confirm+0x90/0xf0 [nf_conntrack_ipv4] [ 251.921386] [] ? ip_fragment+0x9f0/0x9f0 [ 251.921386] [] nf_iterate+0xaa/0xc0 [ 251.921386] [] ? ip_fragment+0x9f0/0x9f0 [ 251.921386] [] nf_hook_slow+0xa4/0x190 [ 251.921386] [] ? ip_fragment+0x9f0/0x9f0 [ 251.921386] [] ip_output+0x92/0x100 [ 251.921386] [] ip_local_out+0x29/0x90 [ 251.921386] [] ip_queue_xmit+0x170/0x4c0 [ 251.921386] [] ? ip_queue_xmit+0x5/0x4c0 [ 251.921386] [] tcp_transmit_skb+0x498/0x960 [ 251.921386] [] tcp_connect+0x812/0x960 [ 251.921386] [] ? ktime_get_real+0x25/0x70 [ 251.921386] [] ? secure_tcp_sequence_number+0x6a/0xc0 [ 251.921386] [] tcp_v4_connect+0x317/0x470 [ 251.921386] [] __inet_stream_connect+0xb5/0x330 [ 251.921386] [] ? lock_sock_nested+0x33/0xa0 [ 251.921386] [] ? trace_hardirqs_on+0xd/0x10 [ 251.921386] [] ? __local_bh_enable_ip+0x75/0xe0 [ 251.921386] [] inet_stream_connect+0x38/0x50 [ 251.921386] [] SYSC_connect+0xe7/0x120 [ 251.921386] [] ? 
current_kernel_time+0x69/0xd0 [ 251.921386] [] ? trace_hardirqs_on_caller+0x105/0x1d0 [ 251.921386] [] ? trace_hardirqs_on+0xd/0x10 [ 251.921386] [] SyS_connect+0xe/0x10 [ 251.921386] [] system_call_fastpath+0x16/0x1b [ 312.014104] INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 0, t=60003 jiffies, g=42359, c=42358, q=333) [ 312.015097] INFO: Stall ended before state dump start Fixes: 93bb0ceb75be ("netfilter: conntrack: remove central spinlock nf_conntrack_lock") Cc: Jesper Dangaard Brouer Cc: Pablo Neira Ayuso Cc: Patrick McHardy Cc: Jozsef Kadlecsik Cc: "David S. Miller" Signed-off-by: Andrey Vagin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6dba48efe01e..75421f2ba8be 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1795,6 +1795,7 @@ int nf_conntrack_init_net(struct net *net) int cpu; atomic_set(&net->ct.count, 0); + seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) From b855d416dc17061ebb271ea7ef1201d100531770 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 12 Apr 2014 13:17:57 +0200 Subject: [PATCH 08/33] netfilter: nf_tables: fix nft_cmp_fast failure on big endian for size < 4 nft_cmp_fast is used for equality comparisions of size <= 4. For comparisions of size < 4 byte a mask is calculated that is applied to both the data from userspace (during initialization) and the register value (during runtime). Both values are stored using (in effect) memcpy to a memory area that is then interpreted as u32 by nft_cmp_fast. This works fine on little endian since smaller types have the same base address, however on big endian this is not true and the smaller types are interpreted as a big number with trailing zero bytes. 
The mask therefore must not include the lower bytes, but the higher bytes on big endian. Add a helper function that does a cpu_to_le32 to switch the bytes on big endian. Since we're dealing with a mask of just consecutive bits, this works out fine. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 10 ++++++++++ net/netfilter/nf_tables_core.c | 3 +-- net/netfilter/nft_cmp.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index cf2b7ae2b9d8..a75fc8e27cd6 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -13,6 +13,16 @@ struct nft_cmp_fast_expr { u8 len; }; +/* Calculate the mask for the nft_cmp_fast expression. On big endian the + * mask needs to include the *upper* bytes when interpreting that data as + * something smaller than the full u32, therefore a cpu_to_le32 is done. + */ +static inline u32 nft_cmp_fast_mask(unsigned int len) +{ + return cpu_to_le32(~0U >> (FIELD_SIZEOF(struct nft_cmp_fast_expr, + data) * BITS_PER_BYTE - len)); +} + extern const struct nft_expr_ops nft_cmp_fast_ops; int nft_cmp_module_init(void); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 90998a6ff8b9..804105391b9a 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,8 @@ static void nft_cmp_fast_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1]) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); - u32 mask; + u32 mask = nft_cmp_fast_mask(priv->len); - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - priv->len); if ((data[priv->sreg].data[0] & mask) == priv->data) return; data[NFT_REG_VERDICT].verdict = NFT_BREAK; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 954925db414d..e2b3f51c81f1 100644 --- a/net/netfilter/nft_cmp.c +++ 
b/net/netfilter/nft_cmp.c @@ -128,7 +128,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, BUG_ON(err < 0); desc.len *= BITS_PER_BYTE; - mask = ~0U >> (sizeof(priv->data) * BITS_PER_BYTE - desc.len); + mask = nft_cmp_fast_mask(desc.len); priv->data = data.data[0] & mask; priv->len = desc.len; return 0; From 30f78d8ebf7f514801e71b88a10c948275168518 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Apr 2014 21:23:36 -0700 Subject: [PATCH 09/33] ipv6: Limit mtu to 65575 bytes Francois reported that setting big mtu on loopback device could prevent tcp sessions making progress. We do not support (yet ?) IPv6 Jumbograms and cook corrupted packets. We must limit the IPv6 MTU to (65535 + 40) bytes in theory. Tested: ifconfig lo mtu 70000 netperf -H ::1 Before patch : Throughput : 0.05 Mbits After patch : Throughput : 35484 Mbits Reported-by: Francois WELLENREITER Signed-off-by: Eric Dumazet Acked-by: YOSHIFUJI Hideaki Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_route.h | 5 +++++ net/ipv6/route.c | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3c3bb184eb8f..6c4f5eac98e7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -32,6 +32,11 @@ struct route_info { #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 +/* We do not (yet ?) 
support IPv6 jumbograms (RFC 2675) + * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header + */ +#define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr)) + /* * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate * between IPV6_ADDR_PREFERENCES socket option values diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5015c50a5ba7..5ea462eacd9f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1338,7 +1338,7 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; mtu = IPV6_MIN_MTU; @@ -1348,7 +1348,8 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) mtu = idev->cnf.mtu6; rcu_read_unlock(); - return mtu; +out: + return min_t(unsigned int, mtu, IP6_MAX_MTU); } static struct dst_entry *icmp6_dst_gc_list; From 77d149c4eb8964b6bd4a929b102a867505add612 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 11 Apr 2014 16:14:26 +0800 Subject: [PATCH 10/33] bnx2: Don't build unused suspend/resume functions not enabled When CONFIG_PM_SLEEP isn't enabled, bnx2_suspend/resume are unused; don't build them when they aren't used. Signed-off-by: Daniel J Blueman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index a8efb18e42fa..0ab83708b6a1 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -8627,6 +8627,7 @@ bnx2_remove_one(struct pci_dev *pdev) pci_disable_device(pdev); } +#ifdef CONFIG_PM_SLEEP static int bnx2_suspend(struct device *device) { @@ -8665,7 +8666,6 @@ bnx2_resume(struct device *device) return 0; } -#ifdef CONFIG_PM_SLEEP static SIMPLE_DEV_PM_OPS(bnx2_pm_ops, bnx2_suspend, bnx2_resume); #define BNX2_PM_OPS (&bnx2_pm_ops) From e1a5ddc5069a0c7589a139e0422200672d965581 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 14 Apr 2014 11:17:22 +0300 Subject: [PATCH 11/33] net/mlx4_core: Defer VF initialization till PF is fully initialized Fix in commit [1] is not sufficient since a deferred VF initialization could happen after pci_enable_sriov() is finished, but before the PF is fully initialized. Need to prevent VFs from initializing till the PF is fully ready and comm channel is operational. [1] - 9798935 "net/mlx4_core: mlx4_init_slave() shouldn't access comm channel before PF is ready" CC: Stuart Hayes Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 4b86c7af2a7a..cef267e24f9c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2369,10 +2369,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) } else { atomic_inc(&pf_loading); err = pci_enable_sriov(pdev, total_vfs); - atomic_dec(&pf_loading); if (err) { mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d).\n", err); + atomic_dec(&pf_loading); err = 0; } else { mlx4_warn(dev, "Running in master mode\n"); @@ -2532,6 +2532,9 @@ slave_start: priv->removed = 0; + if (mlx4_is_master(dev) && dev->num_vfs) + atomic_dec(&pf_loading); + return 0; err_port: @@ -2582,6 +2585,9 @@ err_rel_own: if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); + if (mlx4_is_master(dev) && dev->num_vfs) + atomic_dec(&pf_loading); + kfree(priv->dev.dev_vfs); err_free_dev: @@ -2675,6 +2681,7 @@ static void __mlx4_remove_one(struct pci_dev *pdev) if (dev->flags & MLX4_FLAG_SRIOV) { mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); + dev->num_vfs = 0; } if (!mlx4_is_slave(dev)) From 1a3d0717f68345730ae939b74b952200fb165f45 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 14 Apr 2014 16:12:40 +0530 Subject: [PATCH 12/33] be2net: Fix to reap TX compls till HW doesn't respond for some time be_close() currently waits for a max of 200ms to receive all pending TX compls. This timeout value was roughly calculated based on 10G transmission speeds and the TX queue depth. This timeout may not be enough when the link is operating at lower speeds or in multi-channel/SR-IOV configs with TX-rate limiting setting. It is hard to calculate a "proper timeout value" that works in all configurations. 
This patch solves this problem by continuing to reap TX completions till the HW is completely silent for a period of 10ms or a HW error is detected. v2: implements the new scheme (as suggested by David Laight) instead of just waiting longer than 200ms for reaping all completions. Signed-off-by: Vasundhara Volam Signed-off-by: Sathya Perla Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 3e6df47b6973..80f754d7cf65 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2033,11 +2033,13 @@ static void be_tx_compl_clean(struct be_adapter *adapter) bool dummy_wrb; int i, pending_txqs; - /* Wait for a max of 200ms for all the tx-completions to arrive. */ + /* Stop polling for compls when HW has been silent for 10ms */ do { pending_txqs = adapter->num_tx_qs; for_all_tx_queues(adapter, txo, i) { + cmpl = 0; + num_wrbs = 0; txq = &txo->q; while ((txcp = be_tx_compl_get(&txo->cq))) { end_idx = @@ -2050,14 +2052,13 @@ static void be_tx_compl_clean(struct be_adapter *adapter) if (cmpl) { be_cq_notify(adapter, txo->cq.id, false, cmpl); atomic_sub(num_wrbs, &txq->used); - cmpl = 0; - num_wrbs = 0; + timeo = 0; } if (atomic_read(&txq->used) == 0) pending_txqs--; } - if (pending_txqs == 0 || ++timeo > 200) + if (pending_txqs == 0 || ++timeo > 10 || be_hw_error(adapter)) break; mdelay(1); From e1ad8e33d2e57ca64d9862b63d986fc296a7b876 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 14 Apr 2014 16:12:41 +0530 Subject: [PATCH 13/33] be2net: Fix invocation of be_close() after be_clear() In the EEH error recovery path, when a permanent failure occurs, we clean up adapter structure (i.e. destroy queues etc) by calling be_clear() and return PCI_ERS_RESULT_DISCONNECT. 
After this the stack tries to remove device from bus and calls be_remove() which invokes netdev_unregister()->be_close(). be_close() operating on destroyed queues results in a NULL dereference. This patch fixes this problem by introducing a flag to keep track of the setup state. Signed-off-by: Kalesh AP Signed-off-by: Sathya Perla Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be.h | 1 + drivers/net/ethernet/emulex/benet/be_main.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 8ccaa2520dc3..97db5a7179df 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -374,6 +374,7 @@ enum vf_state { #define BE_FLAGS_NAPI_ENABLED (1 << 9) #define BE_FLAGS_QNQ_ASYNC_EVT_RCVD (1 << 11) #define BE_FLAGS_VXLAN_OFFLOADS (1 << 12) +#define BE_FLAGS_SETUP_DONE (1 << 13) #define BE_UC_PMAC_COUNT 30 #define BE_VF_UC_PMAC_COUNT 2 diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 80f754d7cf65..a18645407d21 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2726,6 +2726,12 @@ static int be_close(struct net_device *netdev) struct be_eq_obj *eqo; int i; + /* This protection is needed as be_close() may be called even when the + * adapter is in cleared state (after eeh perm failure) + */ + if (!(adapter->flags & BE_FLAGS_SETUP_DONE)) + return 0; + be_roce_dev_close(adapter); if (adapter->flags & BE_FLAGS_NAPI_ENABLED) { @@ -3056,6 +3062,7 @@ static int be_clear(struct be_adapter *adapter) be_clear_queues(adapter); be_msix_disable(adapter); + adapter->flags &= ~BE_FLAGS_SETUP_DONE; return 0; } @@ -3560,6 +3567,7 @@ static int be_setup(struct be_adapter *adapter) adapter->phy.fc_autoneg = 1; be_schedule_worker(adapter); + adapter->flags |= BE_FLAGS_SETUP_DONE; return 0; err: be_clear(adapter); From 
463518a0cbd396aac83ee3d196897d585e173796 Mon Sep 17 00:00:00 2001 From: Sucheta Chakraborty Date: Mon, 14 Apr 2014 10:02:18 -0400 Subject: [PATCH 14/33] qlcnic: Fix panic due to uninitialized delayed_work struct in use. o AEN event was being received before initializing delayed_work struct and handlers for it. This was resulting in crash. This patch fixes it. Signed-off-by: Sucheta Chakraborty Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c index b48737dcd3c5..d211af70c7ec 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -2348,14 +2348,13 @@ int qlcnic_83xx_init(struct qlcnic_adapter *adapter, int pci_using_dac) goto disable_intr; } + INIT_DELAYED_WORK(&adapter->idc_aen_work, qlcnic_83xx_idc_aen_work); + err = qlcnic_83xx_setup_mbx_intr(adapter); if (err) goto disable_mbx_intr; qlcnic_83xx_clear_function_resources(adapter); - - INIT_DELAYED_WORK(&adapter->idc_aen_work, qlcnic_83xx_idc_aen_work); - qlcnic_83xx_initialize_nic(adapter, 1); /* Configure default, SR-IOV or Virtual NIC mode of operation */ From 4d52e1e8d1e198962dcbfabf9c06425c38eb23d0 Mon Sep 17 00:00:00 2001 From: Sucheta Chakraborty Date: Mon, 14 Apr 2014 10:02:19 -0400 Subject: [PATCH 15/33] qlcnic: Fix to send INIT_NIC_FUNC as first mailbox. o INIT_NIC_FUNC should be first mailbox sent. Sending DCB capability and parameter query commands after that command. Signed-off-by: Sucheta Chakraborty Signed-off-by: Shahed Shaikh Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c | 2 ++ drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c | 2 -- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c index d211af70c7ec..3ca3118efb3a 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -2355,7 +2355,9 @@ int qlcnic_83xx_init(struct qlcnic_adapter *adapter, int pci_using_dac) goto disable_mbx_intr; qlcnic_83xx_clear_function_resources(adapter); + qlcnic_dcb_enable(adapter->dcb); qlcnic_83xx_initialize_nic(adapter, 1); + qlcnic_dcb_get_info(adapter->dcb); /* Configure default, SR-IOV or Virtual NIC mode of operation */ err = qlcnic_83xx_configure_opmode(adapter); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c index 7d4f54912bad..a51fe18f09a8 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c @@ -330,8 +330,6 @@ static int __qlcnic_dcb_attach(struct qlcnic_dcb *dcb) goto out_free_cfg; } - qlcnic_dcb_get_info(dcb); - return 0; out_free_cfg: kfree(dcb->cfg); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 309d05640883..84d011ed7ec2 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -2528,8 +2528,6 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_free_hw; } - qlcnic_dcb_enable(adapter->dcb); - if (qlcnic_read_mac_addr(adapter)) dev_warn(&pdev->dev, "failed to read mac addr\n"); @@ -2549,7 +2547,10 @@ qlcnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) "Device does not support MSI interrupts\n"); if (qlcnic_82xx_check(adapter)) { 
+ qlcnic_dcb_enable(adapter->dcb); + qlcnic_dcb_get_info(adapter->dcb); err = qlcnic_setup_intr(adapter); + if (err) { dev_err(&pdev->dev, "Failed to setup interrupt\n"); goto err_out_disable_msi; From 7b546842b1e87f5b7929cf15e9cd1ac861b14de3 Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Mon, 14 Apr 2014 10:02:20 -0400 Subject: [PATCH 16/33] qlcnic: Fix max ring count calculation Do not read max rings count from qlcnic_get_nic_info(). Use driver defined values for 82xx adapters. In case of 83xx adapters, use minimum of firmware provided and driver defined values. Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c | 14 ++++++++------ drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 2 -- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c index 3ca3118efb3a..ba20c721ee97 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c @@ -2139,8 +2139,6 @@ static int qlcnic_83xx_get_nic_configuration(struct qlcnic_adapter *adapter) ahw->max_mac_filters = nic_info.max_mac_filters; ahw->max_mtu = nic_info.max_mtu; - adapter->max_tx_rings = ahw->max_tx_ques; - adapter->max_sds_rings = ahw->max_rx_ques; /* eSwitch capability indicates vNIC mode. * vNIC and SRIOV are mutually exclusive operational modes. 
* If SR-IOV capability is detected, SR-IOV physical function @@ -2161,6 +2159,7 @@ static int qlcnic_83xx_get_nic_configuration(struct qlcnic_adapter *adapter) int qlcnic_83xx_configure_opmode(struct qlcnic_adapter *adapter) { struct qlcnic_hardware_context *ahw = adapter->ahw; + u16 max_sds_rings, max_tx_rings; int ret; ret = qlcnic_83xx_get_nic_configuration(adapter); @@ -2173,18 +2172,21 @@ int qlcnic_83xx_configure_opmode(struct qlcnic_adapter *adapter) if (qlcnic_83xx_config_vnic_opmode(adapter)) return -EIO; - adapter->max_sds_rings = QLCNIC_MAX_VNIC_SDS_RINGS; - adapter->max_tx_rings = QLCNIC_MAX_VNIC_TX_RINGS; + max_sds_rings = QLCNIC_MAX_VNIC_SDS_RINGS; + max_tx_rings = QLCNIC_MAX_VNIC_TX_RINGS; } else if (ret == QLC_83XX_DEFAULT_OPMODE) { ahw->nic_mode = QLCNIC_DEFAULT_MODE; adapter->nic_ops->init_driver = qlcnic_83xx_init_default_driver; ahw->idc.state_entry = qlcnic_83xx_idc_ready_state_entry; - adapter->max_sds_rings = QLCNIC_MAX_SDS_RINGS; - adapter->max_tx_rings = QLCNIC_MAX_TX_RINGS; + max_sds_rings = QLCNIC_MAX_SDS_RINGS; + max_tx_rings = QLCNIC_MAX_TX_RINGS; } else { return -EIO; } + adapter->max_sds_rings = min(ahw->max_rx_ques, max_sds_rings); + adapter->max_tx_rings = min(ahw->max_tx_ques, max_tx_rings); + return 0; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index 64dcbf33d8f0..a81ad5088b07 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -883,8 +883,6 @@ int qlcnic_82xx_get_nic_info(struct qlcnic_adapter *adapter, npar_info->max_rx_ques = le16_to_cpu(nic_info->max_rx_ques); npar_info->capabilities = le32_to_cpu(nic_info->capabilities); npar_info->max_mtu = le16_to_cpu(nic_info->max_mtu); - adapter->max_tx_rings = npar_info->max_tx_ques; - adapter->max_sds_rings = npar_info->max_rx_ques; } qlcnic_free_mbx_args(&cmd); From a78b6da89f52d03997619f5a78a5325bec865977 Mon Sep 17 00:00:00 2001 From: Jitendra 
Kalsaria Date: Mon, 14 Apr 2014 10:02:21 -0400 Subject: [PATCH 17/33] qlcnic: Fix PVID configuration on eSwitch port. Clear older PVID before adding a newer PVID to the eSwitch port Signed-off-by: Jitendra Kalsaria Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c index a81ad5088b07..c1e11f5715b0 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c @@ -1354,6 +1354,7 @@ int qlcnic_config_switch_port(struct qlcnic_adapter *adapter, arg2 &= ~BIT_3; break; case QLCNIC_ADD_VLAN: + arg1 &= ~(0x0ffff << 16); arg1 |= (BIT_2 | BIT_5); arg1 |= (esw_cfg->vlan_id << 16); break; From 4f0302277718810494f8c618f28d1edb33af859a Mon Sep 17 00:00:00 2001 From: Jitendra Kalsaria Date: Mon, 14 Apr 2014 10:02:22 -0400 Subject: [PATCH 18/33] qlcnic: Fix QLogic application/driver interface for virtual NIC configuration o Application expects vNIC number as the array index but driver interface returns configuration in array index form. o Pack the vNIC information array in the buffer such that application can access it using vNIC number as the array index. Signed-off-by: Jitendra Kalsaria Signed-off-by: Shahed Shaikh Signed-off-by: David S.
Miller --- .../net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c index 448d156c3d08..cd346e27f2e1 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sysfs.c @@ -354,7 +354,7 @@ int qlcnic_is_valid_nic_func(struct qlcnic_adapter *adapter, u8 pci_func) { int i; - for (i = 0; i < adapter->ahw->max_vnic_func; i++) { + for (i = 0; i < adapter->ahw->total_nic_func; i++) { if (adapter->npars[i].pci_func == pci_func) return i; } @@ -720,6 +720,7 @@ static ssize_t qlcnic_sysfs_read_npar_config(struct file *file, struct qlcnic_adapter *adapter = dev_get_drvdata(dev); struct qlcnic_npar_func_cfg *np_cfg; struct qlcnic_info nic_info; + u8 pci_func; int i, ret; u32 count; @@ -729,26 +730,28 @@ static ssize_t qlcnic_sysfs_read_npar_config(struct file *file, count = size / sizeof(struct qlcnic_npar_func_cfg); for (i = 0; i < adapter->ahw->total_nic_func; i++) { - if (qlcnic_is_valid_nic_func(adapter, i) < 0) - continue; if (adapter->npars[i].pci_func >= count) { dev_dbg(dev, "%s: Total nic functions[%d], App sent function count[%d]\n", __func__, adapter->ahw->total_nic_func, count); continue; } - ret = qlcnic_get_nic_info(adapter, &nic_info, i); - if (ret) - return ret; if (!adapter->npars[i].eswitch_status) continue; - np_cfg[i].pci_func = i; - np_cfg[i].op_mode = (u8)nic_info.op_mode; - np_cfg[i].port_num = nic_info.phys_port; - np_cfg[i].fw_capab = nic_info.capabilities; - np_cfg[i].min_bw = nic_info.min_tx_bw; - np_cfg[i].max_bw = nic_info.max_tx_bw; - np_cfg[i].max_tx_queues = nic_info.max_tx_ques; - np_cfg[i].max_rx_queues = nic_info.max_rx_ques; + pci_func = adapter->npars[i].pci_func; + if (qlcnic_is_valid_nic_func(adapter, pci_func) < 0) + continue; + ret = qlcnic_get_nic_info(adapter, &nic_info, pci_func); + if (ret) + return ret; + 
+ np_cfg[pci_func].pci_func = pci_func; + np_cfg[pci_func].op_mode = (u8)nic_info.op_mode; + np_cfg[pci_func].port_num = nic_info.phys_port; + np_cfg[pci_func].fw_capab = nic_info.capabilities; + np_cfg[pci_func].min_bw = nic_info.min_tx_bw; + np_cfg[pci_func].max_bw = nic_info.max_tx_bw; + np_cfg[pci_func].max_tx_queues = nic_info.max_tx_ques; + np_cfg[pci_func].max_rx_queues = nic_info.max_rx_ques; } return size; } From 696f1943a1538bb448c5bf55a18793ad410da00b Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 14 Apr 2014 10:02:23 -0400 Subject: [PATCH 19/33] qlcnic: Do not disable SR-IOV when VFs are assigned to VMs o While disabling SR-IOV when VFs are assigned to VMs causes host crash so return -EPERM when user request to disable SR-IOV using pci sysfs in case of VFs are assigned to VMs. Signed-off-by: Manish Chopra Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c index 14f748cbf0de..280137991544 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c @@ -461,6 +461,16 @@ static int qlcnic_pci_sriov_disable(struct qlcnic_adapter *adapter) { struct net_device *netdev = adapter->netdev; + if (pci_vfs_assigned(adapter->pdev)) { + netdev_err(adapter->netdev, + "SR-IOV VFs belonging to port %d are assigned to VMs. 
SR-IOV can not be disabled on this port\n", + adapter->portnum); + netdev_info(adapter->netdev, + "Please detach SR-IOV VFs belonging to port %d from VMs, and then try to disable SR-IOV on this port\n", + adapter->portnum); + return -EPERM; + } + rtnl_lock(); if (netif_running(netdev)) __qlcnic_down(adapter, netdev); From 2eac7648321f4a08aa4078504d7727af0af7173b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:02:59 +0200 Subject: [PATCH 20/33] seccomp: fix populating a0-a5 syscall args in 32-bit x86 BPF Linus reports that on 32-bit x86 Chromium throws the following seccomp resp. audit log messages: audit: type=1326 audit(1397359304.356:28108): auid=500 uid=500 gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023 pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=172 compat=0 ip=0xb2dd9852 code=0x30000 audit: type=1326 audit(1397359304.356:28109): auid=500 uid=500 gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023 pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=5 compat=0 ip=0xb2dd9852 code=0x50000 These audit messages are being triggered via audit_seccomp() through __secure_computing() in seccomp mode (BPF) filter with seccomp return codes 0x30000 (== SECCOMP_RET_TRAP) and 0x50000 (== SECCOMP_RET_ERRNO) during filter runtime. Moreover, Linus reports that x86_64 Chromium seems fine. The underlying issue that explains this is that the implementation of populate_seccomp_data() is wrong. 
Our seccomp data structure sd that is being shared with user ABI is: struct seccomp_data { int nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; }; Therefore, a simple cast to 'unsigned long *' for storing the value of the syscall argument via syscall_get_arguments() is just wrong as on 32-bit x86 (or any other 32bit arch), it would result in storing a0-a5 at wrong offsets in args[] member, and thus i) could leak stack memory to user space and ii) tampers with the logic of seccomp BPF programs that read out and check for syscall arguments: syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); Tested on 32-bit x86 with Google Chrome, unfortunately only via remote test machine through slow ssh X forwarding, but it fixes the issue on my side. So fix it up by storing args in type correct variables, gcc is clever and optimizes the copy away in other cases, e.g. x86_64. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Reported-and-bisected-by: Linus Torvalds Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Linus Torvalds Cc: Eric Paris Cc: James Morris Cc: Kees Cook Signed-off-by: David S. Miller --- kernel/seccomp.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8d046c0726a..590c37925084 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -69,18 +69,17 @@ static void populate_seccomp_data(struct seccomp_data *sd) { struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); + unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); sd->arch = syscall_get_arch(); - - /* Unroll syscall_get_args to help gcc on arm. 
*/ - syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); - syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); - syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); - syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); - syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); - syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); - + syscall_get_arguments(task, regs, 0, 6, args); + sd->args[0] = args[0]; + sd->args[1] = args[1]; + sd->args[2] = args[2]; + sd->args[3] = args[3]; + sd->args[4] = args[4]; + sd->args[5] = args[5]; sd->instruction_pointer = KSTK_EIP(task); } From 8c482cdc358ef931ee02262e0a4ef0f29946aa0c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:20:12 +0200 Subject: [PATCH 21/33] net: filter: seccomp: fix wrong decoding of BPF_S_ANC_SECCOMP_LD_W While reviewing seccomp code, we found that BPF_S_ANC_SECCOMP_LD_W has been wrongly decoded by commit a8fc927780 ("sk-filter: Add ability to get socket filter program (v2)") into the opcode BPF_LD|BPF_B|BPF_ABS although it should have been decoded as BPF_LD|BPF_W|BPF_ABS. In practice, this should not have much side-effect though, as such conversion is/was being done through prctl(2) PR_SET_SECCOMP. Reverse operation PR_GET_SECCOMP will only return the current seccomp mode, but not the filter itself. Since the transition to the new BPF infrastructure, it's also not used anymore, so we can simply remove this as it's unreachable. Fixes: a8fc927780 ("sk-filter: Add ability to get socket filter program (v2)") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Pavel Emelyanov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 1 - net/core/filter.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 262dcbb75ffe..024fd03e5d18 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -220,7 +220,6 @@ enum { BPF_S_ANC_RXHASH, BPF_S_ANC_CPU, BPF_S_ANC_ALU_XOR_X, - BPF_S_ANC_SECCOMP_LD_W, BPF_S_ANC_VLAN_TAG, BPF_S_ANC_VLAN_TAG_PRESENT, BPF_S_ANC_PAY_OFFSET, diff --git a/net/core/filter.c b/net/core/filter.c index 0e0856f5d708..cd58614660cf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1743,7 +1743,6 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, - [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, From bfae23249955819a42aa6c23d93708c818eff5c9 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 14 Apr 2014 14:22:43 -0500 Subject: [PATCH 22/33] cxgb4: Save the correct mac addr for hw-loopback connections in the L2T Hardware needs the local device mac address to support hw loopback for rdma loopback connections. Signed-off-by: Steve Wise Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index 81e8402a74b4..8a96572fdde0 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -154,7 +154,7 @@ static int write_l2e(struct adapter *adap, struct l2t_entry *e, int sync) req->params = htons(L2T_W_PORT(e->lport) | L2T_W_NOREPLY(!sync)); req->l2t_idx = htons(e->idx); req->vlan = htons(e->vlan); - if (e->neigh) + if (e->neigh && !(e->neigh->dev->flags & IFF_LOOPBACK)) memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac)); memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac)); @@ -394,6 +394,8 @@ struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh, if (e) { spin_lock(&e->lock); /* avoid race with t4_l2t_free */ e->state = L2T_STATE_RESOLVING; + if (neigh->dev->flags & IFF_LOOPBACK) + memcpy(e->dmac, physdev->dev_addr, sizeof(e->dmac)); memcpy(e->addr, addr, addr_len); e->ifindex = ifidx; e->hash = hash; From 362d52040c71f6e8d8158be48c812d7729cb8df1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:45:17 +0200 Subject: [PATCH 23/33] Revert "net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer" This reverts commit ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") as it introduced a serious performance regression on SCTP over IPv4 and IPv6, though a not as dramatic on the latter. Measurements are on 10Gbit/s with ixgbe NICs. 
Current state: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.241.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 17:56:21 GMT Connecting to host 192.168.241.3, port 5201 Cookie: Lab200slot2.1397238981.812898.548918 [ 4] local 192.168.241.2 port 38616 connected to 192.168.241.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.09 sec 20.8 MBytes 161 Mbits/sec [ 4] 1.09-2.13 sec 10.8 MBytes 86.8 Mbits/sec [ 4] 2.13-3.15 sec 3.57 MBytes 29.5 Mbits/sec [ 4] 3.15-4.16 sec 4.33 MBytes 35.7 Mbits/sec [ 4] 4.16-6.21 sec 10.4 MBytes 42.7 Mbits/sec [ 4] 6.21-6.21 sec 0.00 Bytes 0.00 bits/sec [ 4] 6.21-7.35 sec 34.6 MBytes 253 Mbits/sec [ 4] 7.35-11.45 sec 22.0 MBytes 45.0 Mbits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-11.45 sec 0.00 Bytes 0.00 bits/sec [ 4] 11.45-12.51 sec 16.0 MBytes 126 Mbits/sec [ 4] 12.51-13.59 sec 20.3 MBytes 158 Mbits/sec [ 4] 13.59-14.65 sec 13.4 MBytes 107 Mbits/sec [ 4] 14.65-16.79 sec 33.3 MBytes 130 Mbits/sec [ 4] 16.79-16.79 sec 0.00 Bytes 0.00 bits/sec [ 4] 16.79-17.82 sec 5.94 MBytes 48.7 Mbits/sec (etc) [root@Lab200slot2 ~]# iperf3 --sctp -6 -c 2001:db8:0:f101::1 -V -l 1400 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0 #1 SMP Thu Apr 3 23:18:29 EDT 2014 x86_64 Time: Fri, 11 Apr 2014 19:08:41 GMT Connecting to host 2001:db8:0:f101::1, port 5201 Cookie: Lab200slot2.1397243321.714295.2b3f7c [ 4] local 2001:db8:0:f101::2 port 55804 connected to 2001:db8:0:f101::1 port 5201 Starting Test: protocol: SCTP, 1 streams, 1400 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 169 MBytes 1.42 Gbits/sec [ 4] 1.00-2.00 sec 201 MBytes 1.69 Gbits/sec [ 4] 2.00-3.00 sec 188 MBytes 1.58 Gbits/sec [ 4] 3.00-4.00 sec 174 
MBytes 1.46 Gbits/sec [ 4] 4.00-5.00 sec 165 MBytes 1.39 Gbits/sec [ 4] 5.00-6.00 sec 199 MBytes 1.67 Gbits/sec [ 4] 6.00-7.00 sec 163 MBytes 1.36 Gbits/sec [ 4] 7.00-8.00 sec 174 MBytes 1.46 Gbits/sec [ 4] 8.00-9.00 sec 193 MBytes 1.62 Gbits/sec [ 4] 9.00-10.00 sec 196 MBytes 1.65 Gbits/sec [ 4] 10.00-11.00 sec 157 MBytes 1.31 Gbits/sec [ 4] 11.00-12.00 sec 175 MBytes 1.47 Gbits/sec [ 4] 12.00-13.00 sec 192 MBytes 1.61 Gbits/sec [ 4] 13.00-14.00 sec 199 MBytes 1.67 Gbits/sec (etc) After patch: [root@Lab200slot2 ~]# iperf3 --sctp -4 -c 192.168.240.3 -V -l 1452 -t 60 iperf version 3.0.1 (10 January 2014) Linux Lab200slot2 3.14.0+ #1 SMP Mon Apr 14 12:06:40 EDT 2014 x86_64 Time: Mon, 14 Apr 2014 16:40:48 GMT Connecting to host 192.168.240.3, port 5201 Cookie: Lab200slot2.1397493648.413274.65e131 [ 4] local 192.168.240.2 port 50548 connected to 192.168.240.3 port 5201 Starting Test: protocol: SCTP, 1 streams, 1452 byte blocks, omitting 0 seconds, 60 second test [ ID] Interval Transfer Bandwidth [ 4] 0.00-1.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 1.00-2.00 sec 239 MBytes 2.01 Gbits/sec [ 4] 2.00-3.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 3.00-4.00 sec 239 MBytes 2.00 Gbits/sec [ 4] 4.00-5.00 sec 245 MBytes 2.05 Gbits/sec [ 4] 5.00-6.00 sec 240 MBytes 2.01 Gbits/sec [ 4] 6.00-7.00 sec 240 MBytes 2.02 Gbits/sec [ 4] 7.00-8.00 sec 239 MBytes 2.01 Gbits/sec With the reverted patch applied, the SCTP/IPv4 performance is back to normal on latest upstream for IPv4 and IPv6 and has same throughput as 3.4.2 test kernel, steady and interval reports are smooth again. Fixes: ef2820a735f7 ("net: sctp: Fix a_rwnd/rwnd management to reflect real state of the receiver's buffer") Reported-by: Peter Butler Reported-by: Dongsheng Song Reported-by: Fengguang Wu Tested-by: Peter Butler Signed-off-by: Daniel Borkmann Cc: Matija Glavinic Pecotic Cc: Alexander Sverdlin Cc: Vlad Yasevich Acked-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 14 ++++++- net/sctp/associola.c | 82 ++++++++++++++++++++++++++++++-------- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 6 +++ net/sctp/ulpevent.c | 8 +--- 5 files changed, 87 insertions(+), 25 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6ee76c804893..d992ca3145fe 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1653,6 +1653,17 @@ struct sctp_association { /* This is the last advertised value of rwnd over a SACK chunk. */ __u32 a_rwnd; + /* Number of bytes by which the rwnd has slopped. The rwnd is allowed + * to slop over a maximum of the association's frag_point. + */ + __u32 rwnd_over; + + /* Keeps treack of rwnd pressure. This happens when we have + * a window, but not recevie buffer (i.e small packets). This one + * is releases slowly (1 PMTU at a time ). + */ + __u32 rwnd_press; + /* This is the sndbuf size in use for the association. * This corresponds to the sndbuf size for the association, * as specified in the sk->sndbuf. @@ -1881,7 +1892,8 @@ void sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *); -void sctp_assoc_rwnd_update(struct sctp_association *, bool); +void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); +void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 4f6d6f9d1274..39579c3e0d14 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1395,35 +1395,44 @@ static inline bool sctp_peer_needs_update(struct sctp_association *asoc) return false; } -/* Update asoc's rwnd for the approximated state in the buffer, - * and check whether SACK needs to be sent. 
- */ -void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) +/* Increase asoc's rwnd by len and send any window update SACK if needed. */ +void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) { - int rx_count; struct sctp_chunk *sack; struct timer_list *timer; - if (asoc->ep->rcvbuf_policy) - rx_count = atomic_read(&asoc->rmem_alloc); - else - rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + if (asoc->rwnd_over) { + if (asoc->rwnd_over >= len) { + asoc->rwnd_over -= len; + } else { + asoc->rwnd += (len - asoc->rwnd_over); + asoc->rwnd_over = 0; + } + } else { + asoc->rwnd += len; + } - if ((asoc->base.sk->sk_rcvbuf - rx_count) > 0) - asoc->rwnd = (asoc->base.sk->sk_rcvbuf - rx_count) >> 1; - else - asoc->rwnd = 0; + /* If we had window pressure, start recovering it + * once our rwnd had reached the accumulated pressure + * threshold. The idea is to recover slowly, but up + * to the initial advertised window. + */ + if (asoc->rwnd_press && asoc->rwnd >= asoc->rwnd_press) { + int change = min(asoc->pathmtu, asoc->rwnd_press); + asoc->rwnd += change; + asoc->rwnd_press -= change; + } - pr_debug("%s: asoc:%p rwnd=%u, rx_count=%d, sk_rcvbuf=%d\n", - __func__, asoc, asoc->rwnd, rx_count, - asoc->base.sk->sk_rcvbuf); + pr_debug("%s: asoc:%p rwnd increased by %d to (%u, %u) - %u\n", + __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, + asoc->a_rwnd); /* Send a window update SACK if the rwnd has increased by at least the * minimum of the association's PMTU and half of the receive buffer. * The algorithm used is similar to the one described in * Section 4.2.3.3 of RFC 1122. */ - if (update_peer && sctp_peer_needs_update(asoc)) { + if (sctp_peer_needs_update(asoc)) { asoc->a_rwnd = asoc->rwnd; pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u " @@ -1445,6 +1454,45 @@ void sctp_assoc_rwnd_update(struct sctp_association *asoc, bool update_peer) } } +/* Decrease asoc's rwnd by len. 
*/ +void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) +{ + int rx_count; + int over = 0; + + if (unlikely(!asoc->rwnd || asoc->rwnd_over)) + pr_debug("%s: association:%p has asoc->rwnd:%u, " + "asoc->rwnd_over:%u!\n", __func__, asoc, + asoc->rwnd, asoc->rwnd_over); + + if (asoc->ep->rcvbuf_policy) + rx_count = atomic_read(&asoc->rmem_alloc); + else + rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc); + + /* If we've reached or overflowed our receive buffer, announce + * a 0 rwnd if rwnd would still be positive. Store the + * the potential pressure overflow so that the window can be restored + * back to original value. + */ + if (rx_count >= asoc->base.sk->sk_rcvbuf) + over = 1; + + if (asoc->rwnd >= len) { + asoc->rwnd -= len; + if (over) { + asoc->rwnd_press += asoc->rwnd; + asoc->rwnd = 0; + } + } else { + asoc->rwnd_over = len - asoc->rwnd; + asoc->rwnd = 0; + } + + pr_debug("%s: asoc:%p rwnd decreased by %d to (%u, %u, %u)\n", + __func__, asoc, len, asoc->rwnd, asoc->rwnd_over, + asoc->rwnd_press); +} /* Build the bind address list for the association based on info from the * local endpoint and the remote peer. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 01e002430c85..ae9fbeba40b0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6178,7 +6178,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, * PMTU. In cases, such as loopback, this might be a rather * large spill over. 
*/ - if ((!chunk->data_accepted) && (!asoc->rwnd || + if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over || (datalen > asoc->rwnd + asoc->frag_point))) { /* If this is the next TSN, consider reneging to make diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e13519e9df80..ff20e2dbbbc7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2115,6 +2115,12 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, sctp_skb_pull(skb, copied); skb_queue_head(&sk->sk_receive_queue, skb); + /* When only partial message is copied to the user, increase + * rwnd by that amount. If all the data in the skb is read, + * rwnd is updated when the event is freed. + */ + if (!sctp_ulpevent_is_notification(event)) + sctp_assoc_rwnd_increase(event->asoc, copied); goto out; } else if ((event->msg_flags & MSG_NOTIFICATION) || (event->msg_flags & MSG_EOR)) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 8d198ae03606..85c64658bd0b 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -989,7 +989,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, skb = sctp_event2skb(event); /* Set the owner and charge rwnd for bytes received. */ sctp_ulpevent_set_owner(event, asoc); - sctp_assoc_rwnd_update(asoc, false); + sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); if (!skb->data_len) return; @@ -1011,7 +1011,6 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; unsigned int len; - struct sctp_association *asoc; /* Current stack structures assume that the rcv buffer is * per socket. 
For UDP style sockets this is not true as @@ -1036,11 +1035,8 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) } done: - asoc = event->asoc; - sctp_association_hold(asoc); + sctp_assoc_rwnd_increase(event->asoc, len); sctp_ulpevent_release_owner(event); - sctp_assoc_rwnd_update(asoc, true); - sctp_association_put(asoc); } static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) From 1e785f48d29a09b6cf96db7b49b6320dada332e1 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 14 Apr 2014 17:37:26 -0400 Subject: [PATCH 24/33] net: Start with correct mac_len in skb_network_protocol Sometimes, when the packet arrives at skb_mac_gso_segment() its skb->mac_len already accounts for some of the mac length headers in the packet. This seems to happen when forwarding through an OpenSSL tunnel. When we start looking for any vlan headers in skb_network_protocol() we seem to ignore any of the already known mac headers and start with an ETH_HLEN. This results in an incorrect offset, dropped TSO frames and general slowness of the connection. We can start counting from the known skb->mac_len and return at least that much if all mac level headers are known and accounted for. Fixes: 53d6471cef17262d3ad1c7ce8982a234244f68ec (net: Account for all vlan headers in skb_mac_gso_segment) CC: Eric Dumazet CC: Daniel Borkman Tested-by: Martin Filip Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 14dac0654f28..5b3042e69f85 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2284,7 +2284,7 @@ EXPORT_SYMBOL(skb_checksum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; + int vlan_depth = skb->mac_len; /* Tunnel gso handlers can set protocol to ethernet.
*/ if (type == htons(ETH_P_TEB)) { From ea05df4e8f5d2466dbbf2e46956e9e202a22232b Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Apr 2014 15:38:49 +0200 Subject: [PATCH 25/33] net: cadence: Add architecture dependencies The Cadence ethernet chipsets are only used on specific ARM architectures. Add Kconfig dependencies so that drivers for these chipsets are only buildable on the relevant architectures. Signed-off-by: Jean Delvare Cc: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig index 751d5c7b312d..7e49c43b7af3 100644 --- a/drivers/net/ethernet/cadence/Kconfig +++ b/drivers/net/ethernet/cadence/Kconfig @@ -4,7 +4,7 @@ config NET_CADENCE bool "Cadence devices" - depends on HAS_IOMEM + depends on HAS_IOMEM && (ARM || AVR32 || COMPILE_TEST) default y ---help--- If you have a network (Ethernet) card belonging to this class, say Y. @@ -22,7 +22,7 @@ if NET_CADENCE config ARM_AT91_ETHER tristate "AT91RM9200 Ethernet support" - depends on HAS_DMA + depends on HAS_DMA && (ARCH_AT91RM9200 || COMPILE_TEST) select MACB ---help--- If you wish to compile a kernel for the AT91RM9200 and enable @@ -30,7 +30,7 @@ config ARM_AT91_ETHER config MACB tristate "Cadence MACB/GEM support" - depends on HAS_DMA + depends on HAS_DMA && (PLATFORM_AT32AP || ARCH_AT91 || ARCH_PICOXCELL || ARCH_ZYNQ || COMPILE_TEST) select PHYLIB ---help--- The Cadence MACB ethernet interface is found on many Atmel AT32 and From bb78864a0c3c55461fc757c0c4b674f409518325 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 14 Apr 2014 18:48:01 +0200 Subject: [PATCH 26/33] at86rf230: remove check if AVDD settled The AVDD regulator is only enabled when the RF section is active TX_ON (PLL_ON) state. Since commit 7dcbd22a97eb0689e6c583ad630ae0e7341e34c1 ("ieee802154: ensure that first RF212 state comes from TRX_OFF"). 
We are in TRX_OFF state at the time at86rf230_hw_init is run. Note that this test would only fail in case of a severe hardware malfunction (faulty/shorted power supply, etc.) so it wasn't all that useful in the first place. Signed-off-by: Alexander Aring Reviewed-by: Werner Almesberger Signed-off-by: David S. Miller --- drivers/net/ieee802154/at86rf230.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index 430bb0db9bc4..e102eef0b332 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -1025,14 +1025,6 @@ static int at86rf230_hw_init(struct at86rf230_local *lp) return -EINVAL; } - rc = at86rf230_read_subreg(lp, SR_AVDD_OK, &status); - if (rc) - return rc; - if (!status) { - dev_err(&lp->spi->dev, "AVDD error\n"); - return -EINVAL; - } - return 0; } From 2168746cfc075d004fd7044be706054fceb24e59 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 14 Apr 2014 18:48:02 +0200 Subject: [PATCH 27/33] at86rf230: fix __at86rf230_read_subreg function The __at86rf230_read_subreg function doesn't mask and shift the register contents, which it should do. This patch adds the necessary mask and shift operations in this function. Since we have CSMA support, this can cause trouble on state changes. Since CSMA support turned on some bits in the TRX_STATUS register that used to be zero, not masking broke checking of the TRX_STATUS field after commanding a state change. Signed-off-by: Alexander Aring Reviewed-by: Werner Almesberger Signed-off-by: David S.
Miller --- drivers/net/ieee802154/at86rf230.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index e102eef0b332..e36f194673a4 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -365,7 +365,7 @@ __at86rf230_read_subreg(struct at86rf230_local *lp, dev_vdbg(&lp->spi->dev, "buf[1] = %02x\n", buf[1]); if (status == 0) - *data = buf[1]; + *data = (buf[1] & mask) >> shift; return status; } From 1dd333f470b4e1767c9c0a14b05b5104e56b2930 Mon Sep 17 00:00:00 2001 From: "Li, Zhen-Hua" Date: Tue, 15 Apr 2014 09:53:11 +0800 Subject: [PATCH 28/33] driver/net: cosa driver uses udelay incorrectly In the cosa driver, udelay with more than 20000 may cause __bad_udelay. Use msleep instead. Signed-off-by: Li, Zhen-Hua Signed-off-by: David S. Miller --- drivers/net/wan/cosa.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 84734a805092..83c39e2858bf 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -1521,11 +1521,7 @@ static int cosa_reset_and_read_id(struct cosa_data *cosa, char *idstring) cosa_putstatus(cosa, 0); cosa_getdata8(cosa); cosa_putstatus(cosa, SR_RST); -#ifdef MODULE msleep(500); -#else - udelay(5*100000); -#endif /* Disable all IRQs from the card */ cosa_putstatus(cosa, 0); From b0270e91014dabfceaf37f5b40ad51bbf21a1302 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Apr 2014 12:58:34 -0400 Subject: [PATCH 29/33] ipv4: add a sock pointer to ip_queue_xmit() ip_queue_xmit() assumes the skb it has to transmit is attached to an inet socket. Commit 31c70d5956fc ("l2tp: keep original skb ownership") changed l2tp to not change skb ownership and thus broke this assumption. One fix is to add a new 'struct sock *sk' parameter to ip_queue_xmit(), so that we do not assume skb->sk points to the socket used by l2tp tunnel.
Fixes: 31c70d5956fc ("l2tp: keep original skb ownership") Reported-by: Zhan Jianyu Tested-by: Zhan Jianyu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet6_connection_sock.h | 2 +- include/net/inet_connection_sock.h | 2 +- include/net/ip.h | 2 +- net/dccp/output.c | 2 +- net/ipv4/ip_output.c | 5 +++-- net/ipv4/tcp_output.c | 2 +- net/ipv6/inet6_connection_sock.c | 3 +-- net/l2tp/l2tp_core.c | 4 ++-- net/l2tp/l2tp_ip.c | 2 +- net/sctp/protocol.c | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index f981ba7adeed..74af137304be 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -40,7 +40,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); -int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl); +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); #endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c55aeed41ace..7a4313887568 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -36,7 +36,7 @@ struct tcp_congestion_ops; * (i.e. 
things that depend on the address family) */ struct inet_connection_sock_af_ops { - int (*queue_xmit)(struct sk_buff *skb, struct flowi *fl); + int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void (*send_check)(struct sock *sk, struct sk_buff *skb); int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); diff --git a/include/net/ip.h b/include/net/ip.h index 25064c28e059..77e73d293e09 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -111,7 +111,7 @@ int ip_do_nat(struct sk_buff *skb); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct sk_buff *skb); int ip_local_out(struct sk_buff *skb); -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl); +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, diff --git a/net/dccp/output.c b/net/dccp/output.c index 8876078859da..0248e8a3460c 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -138,7 +138,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); return net_xmit_eval(err); } return -ENOBUFS; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1a0755fea491..7ad68b860935 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -315,9 +315,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) sizeof(fl4->saddr) + sizeof(fl4->daddr)); } -int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) +/* Note: skb->sk can be different from sk, in case of tunnels */ +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { - struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; @@ 
-389,6 +389,7 @@ packet_routed: ip_select_ident_more(skb, &rt->dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); + /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 699fb102e971..025e25093984 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -981,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); - err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) return err; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index c9138189415a..d4ade34ab375 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -224,9 +224,8 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, return dst; } -int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) +int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { - struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; struct dst_entry *dst; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 47f7a5490555..a4e37d7158dc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1131,10 +1131,10 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, skb->local_df = 1; #if IS_ENABLED(CONFIG_IPV6) if (tunnel->sock->sk_family == PF_INET6 && !tunnel->v4mapped) - error = inet6_csk_xmit(skb, NULL); + error = inet6_csk_xmit(tunnel->sock, skb, NULL); else #endif - error = ip_queue_xmit(skb, fl); + error = ip_queue_xmit(tunnel->sock, skb, fl); /* Update stats */ if (error >= 0) { diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 0b44d855269c..3397fe6897c0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -487,7 
+487,7 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m xmit: /* Queue the packet to IP for output */ - rc = ip_queue_xmit(skb, &inet->cork.fl); + rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4e1d0fcb028e..c09757fbf803 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -957,7 +957,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(skb, &transport->fl); + return ip_queue_xmit(&inet->sk, skb, &transport->fl); } static struct sctp_af sctp_af_inet; From aad88724c9d54acb1a9737cb6069d8470fa85f74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Apr 2014 13:47:15 -0400 Subject: [PATCH 30/33] ipv4: add a sock pointer to dst->output() path. In the dst->output() path for ipv4, the code assumes the skb it has to transmit is attached to an inet socket, specifically via ip_mc_output() : The sk_mc_loop() test triggers a WARN_ON() when the provider of the packet is an AF_PACKET socket. The dst->output() method gets an additional 'struct sock *sk' parameter. This needs a cascade of changes so that this parameter can be propagated from vxlan to final consumer. Fixes: 8f646c922d55 ("vxlan: keep original skb ownership") Reported-by: lucien xin Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 4 ++-- include/net/dst.h | 14 +++++++++++--- include/net/ip.h | 11 ++++++++--- include/net/ip_tunnels.h | 2 +- include/net/ipv6.h | 2 +- include/net/xfrm.h | 6 +++--- net/core/dst.c | 15 +++++++++------ net/decnet/dn_route.c | 16 ++++++++++++++-- net/ipv4/ip_output.c | 11 +++++------ net/ipv4/ip_tunnel.c | 2 +- net/ipv4/ip_tunnel_core.c | 4 ++-- net/ipv4/route.c | 4 ++-- net/ipv4/xfrm4_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/route.c | 14 +++++++------- net/ipv6/sit.c | 5 +++-- net/ipv6/xfrm6_output.c | 2 +- net/openvswitch/vport-gre.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 19 files changed, 74 insertions(+), 46 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index c55e316373a1..82355d5d155a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1755,8 +1755,8 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, if (err) return err; - return iptunnel_xmit(rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, - false); + return iptunnel_xmit(vs->sock->sk, rt, skb, src, dst, IPPROTO_UDP, + tos, ttl, df, false); } EXPORT_SYMBOL_GPL(vxlan_xmit_skb); diff --git a/include/net/dst.h b/include/net/dst.h index 46ed958e0c6e..71c60f42be48 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -45,7 +45,7 @@ struct dst_entry { void *__pad1; #endif int (*input)(struct sk_buff *); - int (*output)(struct sk_buff *); + int (*output)(struct sock *sk, struct sk_buff *skb); unsigned short flags; #define DST_HOST 0x0001 @@ -367,7 +367,11 @@ static inline struct dst_entry *skb_dst_pop(struct sk_buff *skb) return child; } -int dst_discard(struct sk_buff *skb); +int dst_discard_sk(struct sock *sk, struct sk_buff *skb); +static inline int dst_discard(struct sk_buff *skb) +{ + return dst_discard_sk(skb->sk, skb); +} void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); void __dst_free(struct dst_entry *dst); @@ -449,9 +453,13 @@ static inline void 
dst_set_expires(struct dst_entry *dst, int timeout) } /* Output packet to network from transport. */ +static inline int dst_output_sk(struct sock *sk, struct sk_buff *skb) +{ + return skb_dst(skb)->output(sk, skb); +} static inline int dst_output(struct sk_buff *skb) { - return skb_dst(skb)->output(skb); + return dst_output_sk(skb->sk, skb); } /* Input packet from network to transport. */ diff --git a/include/net/ip.h b/include/net/ip.h index 77e73d293e09..3ec2b0fb9d83 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -104,13 +104,18 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); -int ip_output(struct sk_buff *skb); -int ip_mc_output(struct sk_buff *skb); +int ip_output(struct sock *sk, struct sk_buff *skb); +int ip_mc_output(struct sock *sk, struct sk_buff *skb); int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); int ip_do_nat(struct sk_buff *skb); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct sk_buff *skb); -int ip_local_out(struct sk_buff *skb); +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb); +static inline int ip_local_out(struct sk_buff *skb) +{ + return ip_local_out_sk(skb->sk, skb); +} + int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e77c10405d51..a4daf9eb8562 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -153,7 +153,7 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, } int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, 
__be16 df, bool xnet); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4f541f11ce63..d640925bc454 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -731,7 +731,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, * skb processing functions */ -int ip6_output(struct sk_buff *skb); +int ip6_output(struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32682ae47b3f..116e9c7e19cb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -333,7 +333,7 @@ struct xfrm_state_afinfo { const xfrm_address_t *saddr); int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); - int (*output)(struct sk_buff *skb); + int (*output)(struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sk_buff *skb); int (*extract_input)(struct xfrm_state *x, struct sk_buff *skb); @@ -1540,7 +1540,7 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm4_output(struct sk_buff *skb); +int xfrm4_output(struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sk_buff *skb); int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); @@ -1565,7 +1565,7 @@ __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_output(struct sk_buff *skb); +int xfrm6_output(struct sock *sk, struct sk_buff *skb); int 
xfrm6_output_finish(struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); diff --git a/net/core/dst.c b/net/core/dst.c index ca4231ec7347..80d6286c8b62 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -142,12 +142,12 @@ loop: mutex_unlock(&dst_gc_mutex); } -int dst_discard(struct sk_buff *skb) +int dst_discard_sk(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } -EXPORT_SYMBOL(dst_discard); +EXPORT_SYMBOL(dst_discard_sk); const u32 dst_default_metrics[RTAX_MAX + 1] = { /* This initializer is needed to force linker to place this variable @@ -184,7 +184,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->xfrm = NULL; #endif dst->input = dst_discard; - dst->output = dst_discard; + dst->output = dst_discard_sk; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; @@ -209,8 +209,10 @@ static void ___dst_free(struct dst_entry *dst) /* The first case (dev==NULL) is required, when protocol module is unloaded. 
*/ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) - dst->input = dst->output = dst_discard; + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_discard_sk; + } dst->obsolete = DST_OBSOLETE_DEAD; } @@ -361,7 +363,8 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, return; if (!unregister) { - dst->input = dst->output = dst_discard; + dst->input = dst_discard; + dst->output = dst_discard_sk; } else { dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index ce0cbbfe0f43..daccc4a36d80 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -752,7 +752,7 @@ static int dn_to_neigh_output(struct sk_buff *skb) return n->output(n, skb); } -static int dn_output(struct sk_buff *skb) +static int dn_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *)dst; @@ -838,6 +838,18 @@ drop: * Used to catch bugs. This should never normally get * called. 
*/ +static int dn_rt_bug_sk(struct sock *sk, struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n", + le16_to_cpu(cb->src), le16_to_cpu(cb->dst)); + + kfree_skb(skb); + + return NET_RX_DROP; +} + static int dn_rt_bug(struct sk_buff *skb) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1463,7 +1475,7 @@ make_route: rt->n = neigh; rt->dst.lastuse = jiffies; - rt->dst.output = dn_rt_bug; + rt->dst.output = dn_rt_bug_sk; switch (res.type) { case RTN_UNICAST: rt->dst.input = dn_forward; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7ad68b860935..1cbeba5edff9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -101,17 +101,17 @@ int __ip_local_out(struct sk_buff *skb) skb_dst(skb)->dev, dst_output); } -int ip_local_out(struct sk_buff *skb) +int ip_local_out_sk(struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(skb); if (likely(err == 1)) - err = dst_output(skb); + err = dst_output_sk(sk, skb); return err; } -EXPORT_SYMBOL_GPL(ip_local_out); +EXPORT_SYMBOL_GPL(ip_local_out_sk); static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) { @@ -226,9 +226,8 @@ static int ip_finish_output(struct sk_buff *skb) return ip_finish_output2(skb); } -int ip_mc_output(struct sk_buff *skb) +int ip_mc_output(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; @@ -287,7 +286,7 @@ int ip_mc_output(struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -int ip_output(struct sk_buff *skb) +int ip_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e77381d1df9a..484d0ce27ef7 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -670,7 +670,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, return; } - err = iptunnel_xmit(rt, skb, 
fl4.saddr, fl4.daddr, protocol, + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index e0c2b1d2ea4e..bcf206c79005 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,7 +46,7 @@ #include #include -int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, +int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { @@ -76,7 +76,7 @@ int iptunnel_xmit(struct rtable *rt, struct sk_buff *skb, iph->ttl = ttl; __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - err = ip_local_out(skb); + err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) pkt_len = 0; return pkt_len; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 20a59c388e6e..1485aafcad59 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1129,7 +1129,7 @@ static void ipv4_link_failure(struct sk_buff *skb) dst_set_expires(&rt->dst, 0); } -static int ip_rt_bug(struct sk_buff *skb) +static int ip_rt_bug(struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -2218,7 +2218,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; new->dev = ort->dst.dev; if (new->dev) diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index baa0f63731fd..40e701f2e1e0 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -86,7 +86,7 @@ int xfrm4_output_finish(struct sk_buff *skb) return xfrm_output(skb); } -int xfrm4_output(struct sk_buff *skb) +int xfrm4_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct xfrm_state 
*x = dst->xfrm; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3284d61577c0..40e7581374f7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -132,7 +132,7 @@ static int ip6_finish_output(struct sk_buff *skb) return ip6_finish_output2(skb); } -int ip6_output(struct sk_buff *skb) +int ip6_output(struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5ea462eacd9f..4011617cca68 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -84,9 +84,9 @@ static void ip6_dst_ifdown(struct dst_entry *, static int ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); -static int ip6_pkt_discard_out(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); -static int ip6_pkt_prohibit_out(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); @@ -290,7 +290,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, - .output = dst_discard, + .output = dst_discard_sk, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), .rt6i_protocol = RTPROT_KERNEL, @@ -1058,7 +1058,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new->__use = 1; new->input = dst_discard; - new->output = dst_discard; + new->output = dst_discard_sk; if (dst_metrics_read_only(&ort->dst)) new->_metrics = ort->dst._metrics; @@ -1577,7 +1577,7 @@ int ip6_route_add(struct fib6_config *cfg) switch (cfg->fc_type) { case RTN_BLACKHOLE: rt->dst.error = -EINVAL; - rt->dst.output = dst_discard; + rt->dst.output = dst_discard_sk; rt->dst.input = 
dst_discard; break; case RTN_PROHIBIT: @@ -2129,7 +2129,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); @@ -2140,7 +2140,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } -static int ip6_pkt_prohibit_out(struct sk_buff *skb) +static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst(skb)->dev; return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1693c8d885f0..8da8268d65f8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -974,8 +974,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto out; } - err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, IPPROTO_IPV6, tos, - ttl, df, !net_eq(tunnel->net, dev_net(dev))); + err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, + IPPROTO_IPV6, tos, ttl, df, + !net_eq(tunnel->net, dev_net(dev))); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 6cd625e37706..19ef329bdbf8 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -163,7 +163,7 @@ static int __xfrm6_output(struct sk_buff *skb) return x->outer_mode->afinfo->output_finish(skb); } -int xfrm6_output(struct sk_buff *skb) +int xfrm6_output(struct sock *sk, struct sk_buff *skb) { return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, skb_dst(skb)->dev, __xfrm6_output); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index a3d6951602db..ebb6e2442554 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -174,7 +174,7 @@ static 
int gre_tnl_send(struct vport *vport, struct sk_buff *skb) skb->local_df = 1; - return iptunnel_xmit(rt, skb, fl.saddr, + return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, false); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f02f511b7107..c08fbd11ceff 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1842,7 +1842,7 @@ purge_queue: xfrm_pol_put(pol); } -static int xdst_queue_output(struct sk_buff *skb) +static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); From 54d63f787b652755e66eb4dd8892ee6d3f5197fc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 14 Apr 2014 17:11:38 +0200 Subject: [PATCH 31/33] ip6_gre: don't allow to remove the fb_tunnel_dev It's possible to remove the FB tunnel with the command 'ip link del ip6gre0' but this is unsafe, the module always supposes that this device exists. For example, ip6gre_tunnel_lookup() may use it unconditionally. Let's add a rtnl handler for dellink, which will never remove the FB tunnel (we let ip6gre_destroy_tunnels() do the job). Introduced by commit c12b395a4664 ("gre: Support GRE over IPv6"). CC: Dmitry Kozlov Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- net/ipv6/ip6_gre.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c98338b81d30..9d921462b57f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1559,6 +1559,15 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], return 0; } +static void ip6gre_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + + if (dev != ign->fb_tunnel_dev) + unregister_netdevice_queue(dev, head); +} + static size_t ip6gre_get_size(const struct net_device *dev) { return @@ -1636,6 +1645,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .validate = ip6gre_tunnel_validate, .newlink = ip6gre_newlink, .changelink = ip6gre_changelink, + .dellink = ip6gre_dellink, .get_size = ip6gre_get_size, .fill_info = ip6gre_fill_info, }; From 8564ae09e08340a26c5408637cc4b32dba9f1640 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 15 Apr 2014 11:37:14 +0200 Subject: [PATCH 32/33] qlcnic: Fix MSI-X initialization code Function qlcnic_setup_tss_rss_intr() might enter an endless loop if pci_enable_msix() repeatedly returns a positive number of MSI-Xs that could have been allocated. Besides, the function contains an 'err = -EIO;' assignment that could never be reached. This update fixes the aforementioned issues. Cc: Shahed Shaikh Cc: Dept-HSGLinuxNICDev@qlogic.com Cc: netdev@vger.kernel.org Cc: linux-pci@vger.kernel.org Signed-off-by: Alexander Gordeev Acked-by: Shahed Shaikh Signed-off-by: David S.
Miller --- .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 84d011ed7ec2..dbf75393f758 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -670,7 +670,7 @@ int qlcnic_setup_tss_rss_intr(struct qlcnic_adapter *adapter) else num_msix += adapter->drv_tx_rings; - if (adapter->drv_rss_rings > 0) + if (adapter->drv_rss_rings > 0) num_msix += adapter->drv_rss_rings; else num_msix += adapter->drv_sds_rings; @@ -686,19 +686,15 @@ int qlcnic_setup_tss_rss_intr(struct qlcnic_adapter *adapter) return -ENOMEM; } -restore: for (vector = 0; vector < num_msix; vector++) adapter->msix_entries[vector].entry = vector; +restore: err = pci_enable_msix(pdev, adapter->msix_entries, num_msix); - if (err == 0) { - adapter->ahw->num_msix = num_msix; - if (adapter->drv_tss_rings > 0) - adapter->drv_tx_rings = adapter->drv_tss_rings; + if (err > 0) { + if (!adapter->drv_tss_rings && !adapter->drv_rss_rings) + return -ENOSPC; - if (adapter->drv_rss_rings > 0) - adapter->drv_sds_rings = adapter->drv_rss_rings; - } else { netdev_info(adapter->netdev, "Unable to allocate %d MSI-X vectors, Available vectors %d\n", num_msix, err); @@ -716,12 +712,20 @@ restore: "Restoring %d Tx, %d SDS rings for total %d vectors.\n", adapter->drv_tx_rings, adapter->drv_sds_rings, num_msix); - goto restore; - err = -EIO; + goto restore; + } else if (err < 0) { + return err; } - return err; + adapter->ahw->num_msix = num_msix; + if (adapter->drv_tss_rings > 0) + adapter->drv_tx_rings = adapter->drv_tss_rings; + + if (adapter->drv_rss_rings > 0) + adapter->drv_sds_rings = adapter->drv_rss_rings; + + return 0; } int qlcnic_enable_msix(struct qlcnic_adapter *adapter, u32 num_msix) From 6f1d7210376727d090e04b8635e6dda4d7eb7b0c Mon Sep 17 00:00:00 2001 From: Steve Wise 
Date: Tue, 15 Apr 2014 14:22:34 -0500 Subject: [PATCH 33/33] cxgb4: use the correct max size for firmware flash The wrong max fw size was being used, causing false "too big" errors when running ethtool -f. Signed-off-by: Steve Wise Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index fb2fe65903c2..bba67681aeaa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -682,7 +682,7 @@ enum { SF_RD_ID = 0x9f, /* read ID */ SF_ERASE_SECTOR = 0xd8, /* erase sector */ - FW_MAX_SIZE = 512 * 1024, + FW_MAX_SIZE = 16 * SF_SEC_SIZE, }; /**