mlx5-updates-2019-03-01

This series adds multipath offload support and contains some small updates
 to mlx5 driver.
 
 Multipath offload support from Roi Dayan:
 
 We are going to track SW multipath route and related nexthops and reflect
 that as port affinity to the HW.
 
 1) Some patches are preparation.
 2) add the multipath mode and fib events handling.
 3) add support to handle offload failure for net error, i.e.
 port down.
 4) Small updates to match the behavior of multipath
 
 Two small updates from Eran Ben Elisha,
 5) Make a function static
 6) Update PCIe supported devices list.
 -----BEGIN PGP SIGNATURE-----
 
 iQEcBAABAgAGBQJceZBCAAoJEEg/ir3gV/o+sC0H/RWg+QKByvv0L3gqouKRvQq6
 6IL8dacbYnGFiwhXiB/087oPn4g3rOImfvnyrH+d6/clkXGPqrgSCTLhISUc7KyP
 Ig837K0fbn9LdV7a4t6OkTMiH9XUWh/Q88LpMLn0abPHIvE+blm7plbHV1x+D6pA
 +O4QM7qHcDVUYh7lV4WqQJg/caEjNML6JHDouZ0nnqqfWM9C9Jk05sp3Nj80WdWt
 0vruJRlHNTiiKcEcYCoV6z8ZUuN7OhwmyxuPiAGkK5n3Jcpjp9EQ9LfCT+kDuNKO
 SLTk1JMtsv4ZCrB6geC2QIAGUu2lGcLPUUPohgp1nY/hEbK1Vg24sPFhEdk4/r4=
 =JM/g
 -----END PGP SIGNATURE-----

Merge tag 'mlx5-updates-2019-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says:

====================
mlx5-updates-2019-03-01

This series adds multipath offload support and contains some small updates
to mlx5 driver.

Multipath offload support from Roi Dayan:

We are going to track SW multipath route and related nexthops and reflect
that as port affinity to the HW.

1) Some patches are preparation.
2) add the multipath mode and fib events handling.
3) add support to handle offload failure for net error, i.e.
port down.
4) Small updates to match the behavior of multipath

Two small updates from Eran Ben Elisha,
5) Make a function static
6) Update PCIe supported devices list.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2019-03-02 14:04:20 -08:00
commit d5fa9c55e5
15 changed files with 606 additions and 109 deletions

View File

@ -30,7 +30,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
mlx5_core-$(CONFIG_MLX5_EN_ARFS) += en_arfs.o
mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o
mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o
mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o
#
# Core extra

View File

@ -179,7 +179,7 @@ int mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq)
/* state lock cannot be grabbed within this function.
* It can cause a dead lock or a read-after-free.
*/
int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx)
static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx)
{
return err_ctx->recover(err_ctx->sq);
}

View File

@ -54,12 +54,24 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
struct neighbour *n = NULL;
#if IS_ENABLED(CONFIG_INET)
struct mlx5_core_dev *mdev = priv->mdev;
struct net_device *uplink_dev;
int ret;
if (mlx5_lag_is_multipath(mdev)) {
struct mlx5_eswitch *esw = mdev->priv.eswitch;
uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH);
fl4->flowi4_oif = uplink_dev->ifindex;
}
rt = ip_route_output_key(dev_net(mirred_dev), fl4);
ret = PTR_ERR_OR_ZERO(rt);
if (ret)
return ret;
if (mlx5_lag_is_multipath(mdev) && !rt->rt_gateway)
return -ENETUNREACH;
#else
return -EOPNOTSUPP;
#endif
@ -295,7 +307,9 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
if (!(nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
err = -EAGAIN;
/* the encap entry will be made valid on neigh update event
* and not used before that.
*/
goto out;
}
@ -408,7 +422,9 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
if (!(nud_state & NUD_VALID)) {
neigh_event_send(n, NULL);
err = -EAGAIN;
/* the encap entry will be made valid on neigh update event
* and not used before that.
*/
goto out;
}

View File

@ -1573,6 +1573,8 @@ static int mlx5e_init_rep_tx(struct mlx5e_priv *priv)
if (rpriv->rep->vport == MLX5_VPORT_UPLINK) {
uplink_priv = &rpriv->uplink_priv;
INIT_LIST_HEAD(&uplink_priv->unready_flows);
/* init shared tc flow table */
err = mlx5e_tc_esw_init(&uplink_priv->tc_ht);
if (err)
@ -1632,27 +1634,38 @@ static void mlx5e_vf_rep_enable(struct mlx5e_priv *priv)
static int uplink_rep_async_event(struct notifier_block *nb, unsigned long event, void *data)
{
struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, events_nb);
struct mlx5_eqe *eqe = data;
if (event != MLX5_EVENT_TYPE_PORT_CHANGE)
return NOTIFY_DONE;
if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
struct mlx5_eqe *eqe = data;
switch (eqe->sub_type) {
case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
queue_work(priv->wq, &priv->update_carrier_work);
break;
default:
return NOTIFY_DONE;
switch (eqe->sub_type) {
case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
queue_work(priv->wq, &priv->update_carrier_work);
break;
default:
return NOTIFY_DONE;
}
return NOTIFY_OK;
}
return NOTIFY_OK;
if (event == MLX5_DEV_EVENT_PORT_AFFINITY) {
struct mlx5e_rep_priv *rpriv = priv->ppriv;
queue_work(priv->wq, &rpriv->uplink_priv.reoffload_flows_work);
return NOTIFY_OK;
}
return NOTIFY_DONE;
}
static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
{
struct net_device *netdev = priv->netdev;
struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
u16 max_mtu;
netdev->min_mtu = ETH_MIN_MTU;
@ -1660,6 +1673,9 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
mlx5e_set_dev_port_mtu(priv);
INIT_WORK(&rpriv->uplink_priv.reoffload_flows_work,
mlx5e_tc_reoffload_flows_work);
mlx5_lag_add(mdev, netdev);
priv->events_nb.notifier_call = uplink_rep_async_event;
mlx5_notifier_register(mdev, &priv->events_nb);
@ -1672,11 +1688,13 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
#ifdef CONFIG_MLX5_CORE_EN_DCB
mlx5e_dcbnl_delete_app(priv);
#endif
mlx5_notifier_unregister(mdev, &priv->events_nb);
cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
mlx5_lag_remove(mdev);
}

View File

@ -74,6 +74,9 @@ struct mlx5_rep_uplink_priv {
struct notifier_block netdevice_nb;
struct mlx5_tun_entropy tun_entropy;
struct list_head unready_flows;
struct work_struct reoffload_flows_work;
};
struct mlx5e_rep_priv {

View File

@ -75,6 +75,7 @@ enum {
MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 2),
MLX5E_TC_FLOW_SLOW = BIT(MLX5E_TC_FLOW_BASE + 3),
MLX5E_TC_FLOW_DUP = BIT(MLX5E_TC_FLOW_BASE + 4),
MLX5E_TC_FLOW_NOT_READY = BIT(MLX5E_TC_FLOW_BASE + 5),
};
#define MLX5E_TC_MAX_SPLITS 1
@ -116,6 +117,7 @@ struct mlx5e_tc_flow {
struct list_head mod_hdr; /* flows sharing the same mod hdr ID */
struct list_head hairpin; /* flows sharing the same hairpin */
struct list_head peer; /* flows with peer flow */
struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */
union {
struct mlx5_esw_flow_attr esw_attr[0];
struct mlx5_nic_flow_attr nic_attr[0];
@ -850,12 +852,12 @@ static void mlx5e_detach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow, int out_index);
static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct ip_tunnel_info *tun_info,
struct net_device *mirred_dev,
struct net_device **encap_dev,
struct mlx5e_tc_flow *flow,
struct net_device *mirred_dev,
int out_index,
struct netlink_ext_ack *extack,
int out_index);
struct net_device **encap_dev,
bool *encap_valid);
static struct mlx5_flow_handle *
mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw,
@ -927,6 +929,26 @@ mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw,
flow->flags &= ~MLX5E_TC_FLOW_SLOW;
}
static void add_unready_flow(struct mlx5e_tc_flow *flow)
{
struct mlx5_rep_uplink_priv *uplink_priv;
struct mlx5e_rep_priv *rpriv;
struct mlx5_eswitch *esw;
esw = flow->priv->mdev->priv.eswitch;
rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
uplink_priv = &rpriv->uplink_priv;
flow->flags |= MLX5E_TC_FLOW_NOT_READY;
list_add_tail(&flow->unready, &uplink_priv->unready_flows);
}
static void remove_unready_flow(struct mlx5e_tc_flow *flow)
{
list_del(&flow->unready);
flow->flags &= ~MLX5E_TC_FLOW_NOT_READY;
}
static int
mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow,
@ -941,7 +963,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_fc *counter = NULL;
struct mlx5e_rep_priv *rpriv;
struct mlx5e_priv *out_priv;
int err = 0, encap_err = 0;
bool encap_valid = true;
int err = 0;
int out_index;
if (!mlx5_eswitch_prios_supported(esw) && attr->prio != 1) {
@ -970,14 +993,11 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
mirred_ifindex = parse_attr->mirred_ifindex[out_index];
out_dev = __dev_get_by_index(dev_net(priv->netdev),
mirred_ifindex);
err = mlx5e_attach_encap(priv,
&parse_attr->tun_info[out_index],
out_dev, &encap_dev, flow,
extack, out_index);
if (err && err != -EAGAIN)
err = mlx5e_attach_encap(priv, flow, out_dev, out_index,
extack, &encap_dev, &encap_valid);
if (err)
goto err_attach_encap;
if (err == -EAGAIN)
encap_err = err;
out_priv = netdev_priv(encap_dev);
rpriv = out_priv->ppriv;
attr->dests[out_index].rep = rpriv->rep;
@ -1005,10 +1025,11 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
attr->counter = counter;
}
/* we get here if (1) there's no error or when
* (2) there's an encap action and we're on -EAGAIN (no valid neigh)
/* we get here if one of the following takes place:
* (1) there's no error
* (2) there's an encap action and we don't have valid neigh
*/
if (encap_err == -EAGAIN) {
if (!encap_valid) {
/* continue with goto slow path rule instead */
struct mlx5_esw_flow_attr slow_attr;
@ -1048,6 +1069,12 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_esw_flow_attr slow_attr;
int out_index;
if (flow->flags & MLX5E_TC_FLOW_NOT_READY) {
remove_unready_flow(flow);
kvfree(attr->parse_attr);
return;
}
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
if (flow->flags & MLX5E_TC_FLOW_SLOW)
mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr);
@ -2333,31 +2360,37 @@ static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
peer_priv = netdev_priv(peer_netdev);
return (MLX5_CAP_ESW(priv->mdev, merged_eswitch) &&
(priv->netdev->netdev_ops == peer_netdev->netdev_ops) &&
same_hw_devs(priv, peer_priv) &&
MLX5_VPORT_MANAGER(peer_priv->mdev) &&
(peer_priv->mdev->priv.eswitch->mode == SRIOV_OFFLOADS));
mlx5e_eswitch_rep(priv->netdev) &&
mlx5e_eswitch_rep(peer_netdev) &&
same_hw_devs(priv, peer_priv));
}
static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct ip_tunnel_info *tun_info,
struct net_device *mirred_dev,
struct net_device **encap_dev,
struct mlx5e_tc_flow *flow,
struct net_device *mirred_dev,
int out_index,
struct netlink_ext_ack *extack,
int out_index)
struct net_device **encap_dev,
bool *encap_valid)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
unsigned short family = ip_tunnel_info_af(tun_info);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct ip_tunnel_key *key = &tun_info->key;
struct mlx5e_tc_flow_parse_attr *parse_attr;
struct ip_tunnel_info *tun_info;
struct ip_tunnel_key *key;
struct mlx5e_encap_entry *e;
unsigned short family;
uintptr_t hash_key;
bool found = false;
int err = 0;
parse_attr = attr->parse_attr;
tun_info = &parse_attr->tun_info[out_index];
family = ip_tunnel_info_af(tun_info);
key = &tun_info->key;
hash_key = hash_encap_info(key);
hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
@ -2388,7 +2421,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
else if (family == AF_INET6)
err = mlx5e_tc_tun_create_header_ipv6(priv, mirred_dev, e);
if (err && err != -EAGAIN)
if (err)
goto out_err;
hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
@ -2400,8 +2433,9 @@ attach_flow:
if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
attr->dests[out_index].encap_id = e->encap_id;
attr->dests[out_index].flags |= MLX5_ESW_DEST_ENCAP_VALID;
*encap_valid = true;
} else {
err = -EAGAIN;
*encap_valid = false;
}
return err;
@ -2697,8 +2731,15 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
bool esw_paired = mlx5_devcom_is_paired(attr->in_mdev->priv.devcom,
MLX5_DEVCOM_ESW_OFFLOADS);
return esw_paired && mlx5_lag_is_sriov(attr->in_mdev) &&
(is_rep_ingress || act_is_encap);
if (!esw_paired)
return false;
if ((mlx5_lag_is_sriov(attr->in_mdev) ||
mlx5_lag_is_multipath(attr->in_mdev)) &&
(is_rep_ingress || act_is_encap))
return true;
return false;
}
static int
@ -2793,8 +2834,12 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
goto err_free;
err = mlx5e_tc_add_fdb_flow(priv, flow, extack);
if (err)
goto err_free;
if (err) {
if (!(err == -ENETUNREACH && mlx5_lag_is_multipath(in_mdev)))
goto err_free;
add_unready_flow(flow);
}
return flow;
@ -2806,7 +2851,8 @@ out:
}
static int mlx5e_tc_add_fdb_peer_flow(struct tc_cls_flower_offload *f,
struct mlx5e_tc_flow *flow)
struct mlx5e_tc_flow *flow,
u16 flow_flags)
{
struct mlx5e_priv *priv = flow->priv, *peer_priv;
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch, *peer_esw;
@ -2835,7 +2881,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct tc_cls_flower_offload *f,
in_mdev = priv->mdev;
parse_attr = flow->esw_attr->parse_attr;
peer_flow = __mlx5e_add_fdb_flow(peer_priv, f, flow->flags,
peer_flow = __mlx5e_add_fdb_flow(peer_priv, f, flow_flags,
parse_attr->filter_dev,
flow->esw_attr->in_rep, in_mdev);
if (IS_ERR(peer_flow)) {
@ -2873,7 +2919,7 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
return PTR_ERR(flow);
if (is_peer_flow_needed(flow)) {
err = mlx5e_tc_add_fdb_peer_flow(f, flow);
err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags);
if (err) {
mlx5e_tc_del_fdb_flow(priv, flow);
goto out;
@ -3038,23 +3084,25 @@ int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv,
struct mlx5_eswitch *peer_esw;
struct mlx5e_tc_flow *flow;
struct mlx5_fc *counter;
u64 bytes;
u64 packets;
u64 lastuse;
u64 lastuse = 0;
u64 packets = 0;
u64 bytes = 0;
flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
if (!flow || !same_flow_direction(flow, flags))
return -EINVAL;
if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
return 0;
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
counter = mlx5e_tc_get_counter(flow);
if (!counter)
return 0;
counter = mlx5e_tc_get_counter(flow);
if (!counter)
return 0;
mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
}
/* Under multipath it's possible for one rule to be currently
* un-offloaded while the other rule is offloaded.
*/
peer_esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
if (!peer_esw)
goto out;
@ -3066,6 +3114,8 @@ int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv,
u64 lastuse2;
counter = mlx5e_tc_get_counter(flow->peer_flow);
if (!counter)
goto no_peer_counter;
mlx5_fc_query_cached(counter, &bytes2, &packets2, &lastuse2);
bytes += bytes2;
@ -3073,8 +3123,8 @@ int mlx5e_stats_flower(struct net_device *dev, struct mlx5e_priv *priv,
lastuse = max_t(u64, lastuse, lastuse2);
}
no_peer_counter:
mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
out:
flow_stats_update(&f->stats, bytes, packets, lastuse);
@ -3196,3 +3246,18 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw)
list_for_each_entry_safe(flow, tmp, &esw->offloads.peer_flows, peer)
__mlx5e_tc_del_fdb_peer_flow(flow);
}
void mlx5e_tc_reoffload_flows_work(struct work_struct *work)
{
struct mlx5_rep_uplink_priv *rpriv =
container_of(work, struct mlx5_rep_uplink_priv,
reoffload_flows_work);
struct mlx5e_tc_flow *flow, *tmp;
rtnl_lock();
list_for_each_entry_safe(flow, tmp, &rpriv->unready_flows, unready) {
if (!mlx5e_tc_add_fdb_flow(flow->priv, flow, NULL))
remove_unready_flow(flow);
}
rtnl_unlock();
}

View File

@ -72,6 +72,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
int mlx5e_tc_num_filters(struct mlx5e_priv *priv, int flags);
void mlx5e_tc_reoffload_flows_work(struct work_struct *work);
#else /* CONFIG_MLX5_ESWITCH */
static inline int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }

View File

@ -2476,3 +2476,10 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)
return false;
}
bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
struct mlx5_core_dev *dev1)
{
return (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
dev1->priv.eswitch->mode == SRIOV_OFFLOADS);
}

View File

@ -371,6 +371,8 @@ static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev
bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0,
struct mlx5_core_dev *dev1);
bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
struct mlx5_core_dev *dev1);
#define MLX5_DEBUG_ESWITCH_MASK BIT(3)

View File

@ -35,37 +35,8 @@
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"
#include "eswitch.h"
enum {
MLX5_LAG_FLAG_ROCE = 1 << 0,
MLX5_LAG_FLAG_SRIOV = 1 << 1,
};
#define MLX5_LAG_MODE_FLAGS (MLX5_LAG_FLAG_ROCE | MLX5_LAG_FLAG_SRIOV)
struct lag_func {
struct mlx5_core_dev *dev;
struct net_device *netdev;
};
/* Used for collection of netdev event info. */
struct lag_tracker {
enum netdev_lag_tx_type tx_type;
struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS];
bool is_bonded;
};
/* LAG data of a ConnectX card.
* It serves both its phys functions.
*/
struct mlx5_lag {
u8 flags;
u8 v2p_map[MLX5_MAX_PORTS];
struct lag_func pf[MLX5_MAX_PORTS];
struct lag_tracker tracker;
struct delayed_work bond_work;
struct notifier_block nb;
};
#include "lag.h"
#include "lag_mp.h"
/* General purpose, use for short periods of time.
* Beware of lock dependencies (preferably, no locks should be acquired
@ -147,13 +118,8 @@ static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
}
static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
{
return dev->priv.lag;
}
static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
struct net_device *ndev)
int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
struct net_device *ndev)
{
int i;
@ -174,11 +140,6 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
return !!(ldev->flags & MLX5_LAG_FLAG_SRIOV);
}
static bool __mlx5_lag_is_active(struct mlx5_lag *ldev)
{
return !!(ldev->flags & MLX5_LAG_MODE_FLAGS);
}
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
u8 *port1, u8 *port2)
{
@ -195,8 +156,8 @@ static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
*port2 = 1;
}
static void mlx5_modify_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker)
void mlx5_modify_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker)
{
struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
u8 v2p_port1, v2p_port2;
@ -241,9 +202,9 @@ static int mlx5_create_lag(struct mlx5_lag *ldev,
return err;
}
static int mlx5_activate_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker,
u8 flags)
int mlx5_activate_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker,
u8 flags)
{
bool roce_lag = !!(flags & MLX5_LAG_FLAG_ROCE);
struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
@ -386,7 +347,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
{
schedule_delayed_work(&ldev->bond_work, delay);
queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
}
static void mlx5_do_bond_work(struct work_struct *work)
@ -538,6 +499,12 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(void)
if (!ldev)
return NULL;
ldev->wq = create_singlethread_workqueue("mlx5_lag");
if (!ldev->wq) {
kfree(ldev);
return NULL;
}
INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
return ldev;
@ -545,6 +512,7 @@ static struct mlx5_lag *mlx5_lag_dev_alloc(void)
static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
{
destroy_workqueue(ldev->wq);
kfree(ldev);
}
@ -592,6 +560,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
{
struct mlx5_lag *ldev = NULL;
struct mlx5_core_dev *tmp_dev;
int err;
if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
!MLX5_CAP_GEN(dev, lag_master) ||
@ -619,6 +588,11 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
}
}
err = mlx5_lag_mp_init(ldev);
if (err)
mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
err);
}
int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num)
@ -664,6 +638,7 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev)
if (i == MLX5_MAX_PORTS) {
if (ldev->nb.notifier_call)
unregister_netdevice_notifier(&ldev->nb);
mlx5_lag_mp_cleanup(ldev);
cancel_delayed_work_sync(&ldev->bond_work);
mlx5_lag_dev_free(ldev);
}

View File

@ -0,0 +1,65 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */
#ifndef __MLX5_LAG_H__
#define __MLX5_LAG_H__
#include "mlx5_core.h"
#include "lag_mp.h"
enum {
MLX5_LAG_FLAG_ROCE = 1 << 0,
MLX5_LAG_FLAG_SRIOV = 1 << 1,
MLX5_LAG_FLAG_MULTIPATH = 1 << 2,
};
#define MLX5_LAG_MODE_FLAGS (MLX5_LAG_FLAG_ROCE | MLX5_LAG_FLAG_SRIOV |\
MLX5_LAG_FLAG_MULTIPATH)
struct lag_func {
struct mlx5_core_dev *dev;
struct net_device *netdev;
};
/* Used for collection of netdev event info. */
struct lag_tracker {
enum netdev_lag_tx_type tx_type;
struct netdev_lag_lower_state_info netdev_state[MLX5_MAX_PORTS];
unsigned int is_bonded:1;
};
/* LAG data of a ConnectX card.
* It serves both its phys functions.
*/
struct mlx5_lag {
u8 flags;
u8 v2p_map[MLX5_MAX_PORTS];
struct lag_func pf[MLX5_MAX_PORTS];
struct lag_tracker tracker;
struct workqueue_struct *wq;
struct delayed_work bond_work;
struct notifier_block nb;
struct lag_mp lag_mp;
};
static inline struct mlx5_lag *
mlx5_lag_dev_get(struct mlx5_core_dev *dev)
{
return dev->priv.lag;
}
static inline bool
__mlx5_lag_is_active(struct mlx5_lag *ldev)
{
return !!(ldev->flags & MLX5_LAG_MODE_FLAGS);
}
void mlx5_modify_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker);
int mlx5_activate_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker,
u8 flags);
int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
struct net_device *ndev);
#endif /* __MLX5_LAG_H__ */

View File

@ -0,0 +1,315 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include <linux/netdevice.h>
#include "lag.h"
#include "lag_mp.h"
#include "mlx5_core.h"
#include "eswitch.h"
#include "lib/mlx5.h"
static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev)
{
if (!ldev->pf[0].dev || !ldev->pf[1].dev)
return false;
return mlx5_esw_multipath_prereq(ldev->pf[0].dev, ldev->pf[1].dev);
}
static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev)
{
return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH);
}
bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev)
{
struct mlx5_lag *ldev;
bool res;
ldev = mlx5_lag_dev_get(dev);
res = ldev && __mlx5_lag_is_multipath(ldev);
return res;
}
/**
* Set lag port affinity
*
* @ldev: lag device
* @port:
* 0 - set normal affinity.
* 1 - set affinity to port 1.
* 2 - set affinity to port 2.
*
**/
static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, int port)
{
struct lag_tracker tracker;
if (!__mlx5_lag_is_multipath(ldev))
return;
switch (port) {
case 0:
tracker.netdev_state[0].tx_enabled = true;
tracker.netdev_state[1].tx_enabled = true;
tracker.netdev_state[0].link_up = true;
tracker.netdev_state[1].link_up = true;
break;
case 1:
tracker.netdev_state[0].tx_enabled = true;
tracker.netdev_state[0].link_up = true;
tracker.netdev_state[1].tx_enabled = false;
tracker.netdev_state[1].link_up = false;
break;
case 2:
tracker.netdev_state[0].tx_enabled = false;
tracker.netdev_state[0].link_up = false;
tracker.netdev_state[1].tx_enabled = true;
tracker.netdev_state[1].link_up = true;
break;
default:
mlx5_core_warn(ldev->pf[0].dev, "Invalid affinity port %d",
port);
return;
}
if (tracker.netdev_state[0].tx_enabled)
mlx5_notifier_call_chain(ldev->pf[0].dev->priv.events,
MLX5_DEV_EVENT_PORT_AFFINITY,
(void *)0);
if (tracker.netdev_state[1].tx_enabled)
mlx5_notifier_call_chain(ldev->pf[1].dev->priv.events,
MLX5_DEV_EVENT_PORT_AFFINITY,
(void *)0);
mlx5_modify_lag(ldev, &tracker);
}
static void mlx5_lag_fib_event_flush(struct notifier_block *nb)
{
struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
flush_workqueue(ldev->wq);
}
struct mlx5_fib_event_work {
struct work_struct work;
struct mlx5_lag *ldev;
unsigned long event;
union {
struct fib_entry_notifier_info fen_info;
struct fib_nh_notifier_info fnh_info;
};
};
static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
unsigned long event,
struct fib_info *fi)
{
struct lag_mp *mp = &ldev->lag_mp;
/* Handle delete event */
if (event == FIB_EVENT_ENTRY_DEL) {
/* stop track */
if (mp->mfi == fi)
mp->mfi = NULL;
return;
}
/* Handle add/replace event */
if (fi->fib_nhs == 1) {
if (__mlx5_lag_is_active(ldev)) {
struct net_device *nh_dev = fi->fib_nh[0].nh_dev;
int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev);
mlx5_lag_set_port_affinity(ldev, ++i);
}
return;
}
if (fi->fib_nhs != 2)
return;
/* Verify next hops are ports of the same hca */
if (!(fi->fib_nh[0].nh_dev == ldev->pf[0].netdev &&
fi->fib_nh[1].nh_dev == ldev->pf[1].netdev) &&
!(fi->fib_nh[0].nh_dev == ldev->pf[1].netdev &&
fi->fib_nh[1].nh_dev == ldev->pf[0].netdev)) {
mlx5_core_warn(ldev->pf[0].dev, "Multipath offload require two ports of the same HCA\n");
return;
}
/* First time we see multipath route */
if (!mp->mfi && !__mlx5_lag_is_active(ldev)) {
struct lag_tracker tracker;
tracker = ldev->tracker;
mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH);
}
mlx5_lag_set_port_affinity(ldev, 0);
mp->mfi = fi;
}
static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev,
unsigned long event,
struct fib_nh *fib_nh,
struct fib_info *fi)
{
struct lag_mp *mp = &ldev->lag_mp;
/* Check the nh event is related to the route */
if (!mp->mfi || mp->mfi != fi)
return;
/* nh added/removed */
if (event == FIB_EVENT_NH_DEL) {
int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->nh_dev);
if (i >= 0) {
i = (i + 1) % 2 + 1; /* peer port */
mlx5_lag_set_port_affinity(ldev, i);
}
} else if (event == FIB_EVENT_NH_ADD &&
fi->fib_nhs == 2) {
mlx5_lag_set_port_affinity(ldev, 0);
}
}
static void mlx5_lag_fib_update(struct work_struct *work)
{
struct mlx5_fib_event_work *fib_work =
container_of(work, struct mlx5_fib_event_work, work);
struct mlx5_lag *ldev = fib_work->ldev;
struct fib_nh *fib_nh;
/* Protect internal structures from changes */
rtnl_lock();
switch (fib_work->event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
mlx5_lag_fib_route_event(ldev, fib_work->event,
fib_work->fen_info.fi);
fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_NH_ADD: /* fall through */
case FIB_EVENT_NH_DEL:
fib_nh = fib_work->fnh_info.fib_nh;
mlx5_lag_fib_nexthop_event(ldev,
fib_work->event,
fib_work->fnh_info.fib_nh,
fib_nh->nh_parent);
fib_info_put(fib_work->fnh_info.fib_nh->nh_parent);
break;
}
rtnl_unlock();
kfree(fib_work);
}
static struct mlx5_fib_event_work *
mlx5_lag_init_fib_work(struct mlx5_lag *ldev, unsigned long event)
{
struct mlx5_fib_event_work *fib_work;
fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
if (WARN_ON(!fib_work))
return NULL;
INIT_WORK(&fib_work->work, mlx5_lag_fib_update);
fib_work->ldev = ldev;
fib_work->event = event;
return fib_work;
}
static int mlx5_lag_fib_event(struct notifier_block *nb,
unsigned long event,
void *ptr)
{
struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
struct fib_notifier_info *info = ptr;
struct mlx5_fib_event_work *fib_work;
struct fib_entry_notifier_info *fen_info;
struct fib_nh_notifier_info *fnh_info;
struct fib_info *fi;
if (info->family != AF_INET)
return NOTIFY_DONE;
if (!mlx5_lag_multipath_check_prereq(ldev))
return NOTIFY_DONE;
switch (event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
fen_info = container_of(info, struct fib_entry_notifier_info,
info);
fi = fen_info->fi;
if (fi->fib_dev != ldev->pf[0].netdev &&
fi->fib_dev != ldev->pf[1].netdev) {
return NOTIFY_DONE;
}
fib_work = mlx5_lag_init_fib_work(ldev, event);
if (!fib_work)
return NOTIFY_DONE;
fib_work->fen_info = *fen_info;
/* Take reference on fib_info to prevent it from being
* freed while work is queued. Release it afterwards.
*/
fib_info_hold(fib_work->fen_info.fi);
break;
case FIB_EVENT_NH_ADD: /* fall through */
case FIB_EVENT_NH_DEL:
fnh_info = container_of(info, struct fib_nh_notifier_info,
info);
fib_work = mlx5_lag_init_fib_work(ldev, event);
if (!fib_work)
return NOTIFY_DONE;
fib_work->fnh_info = *fnh_info;
fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent);
break;
default:
return NOTIFY_DONE;
}
queue_work(ldev->wq, &fib_work->work);
return NOTIFY_DONE;
}
int mlx5_lag_mp_init(struct mlx5_lag *ldev)
{
struct lag_mp *mp = &ldev->lag_mp;
int err;
if (mp->fib_nb.notifier_call)
return 0;
mp->fib_nb.notifier_call = mlx5_lag_fib_event;
err = register_fib_notifier(&mp->fib_nb,
mlx5_lag_fib_event_flush);
if (err)
mp->fib_nb.notifier_call = NULL;
return err;
}
void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev)
{
struct lag_mp *mp = &ldev->lag_mp;
if (!mp->fib_nb.notifier_call)
return;
unregister_fib_notifier(&mp->fib_nb);
mp->fib_nb.notifier_call = NULL;
}

View File

@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */
#ifndef __MLX5_LAG_MP_H__
#define __MLX5_LAG_MP_H__
#include "lag.h"
#include "mlx5_core.h"
struct lag_mp {
struct notifier_block fib_nb;
struct fib_info *mfi; /* used in tracking fib events */
};
#ifdef CONFIG_MLX5_ESWITCH
int mlx5_lag_mp_init(struct mlx5_lag *ldev);
void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev);
#else /* CONFIG_MLX5_ESWITCH */
static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; }
static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {}
#endif /* CONFIG_MLX5_ESWITCH */
#endif /* __MLX5_LAG_MP_H__ */

View File

@ -1486,6 +1486,8 @@ static const struct pci_device_id mlx5_core_pci_table[] = {
{ PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF}, /* ConnectX-5 Ex VF */
{ PCI_VDEVICE(MELLANOX, 0x101b) }, /* ConnectX-6 */
{ PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF}, /* ConnectX-6 VF */
{ PCI_VDEVICE(MELLANOX, 0x101d) }, /* ConnectX-6 Dx */
{ PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF}, /* ConnectX Family mlx5Gen Virtual Function */
{ PCI_VDEVICE(MELLANOX, 0xa2d2) }, /* BlueField integrated ConnectX-5 network controller */
{ PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF}, /* BlueField integrated ConnectX-5 network controller VF */
{ 0, }

View File

@ -195,6 +195,7 @@ struct mlx5_rsc_debug {
enum mlx5_dev_event {
MLX5_DEV_EVENT_SYS_ERROR = 128, /* 0 - 127 are FW events */
MLX5_DEV_EVENT_PORT_AFFINITY = 129,
};
enum mlx5_port_status {
@ -1041,6 +1042,7 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
bool mlx5_lag_is_roce(struct mlx5_core_dev *dev);
bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev);
bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev);
bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,