linux/include/net/pkt_cls.h

588 lines
13 KiB
C
Raw Normal View History

#ifndef __NET_PKT_CLS_H
#define __NET_PKT_CLS_H
#include <linux/pkt_cls.h>
#include <net/sch_generic.h>
#include <net/act_api.h>
/* Basic packet classifier frontend definitions. */
struct tcf_walker {
int stop;
int skip;
int count;
int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *);
};
int register_tcf_proto_ops(struct tcf_proto_ops *ops);
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
#ifdef CONFIG_NET_CLS
struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
bool create);
void tcf_chain_put(struct tcf_chain *chain);
int tcf_block_get(struct tcf_block **p_block,
struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q);
void tcf_block_put(struct tcf_block *block);
static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
{
return block->q;
}
static inline struct net_device *tcf_block_dev(struct tcf_block *block)
{
return tcf_block_q(block)->dev_queue->dev;
}
int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res, bool compat_mode);
#else
static inline
int tcf_block_get(struct tcf_block **p_block,
struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q)
{
return 0;
}
static inline void tcf_block_put(struct tcf_block *block)
{
}
static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
{
return NULL;
}
static inline struct net_device *tcf_block_dev(struct tcf_block *block)
{
return NULL;
}
static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res, bool compat_mode)
{
return TC_ACT_UNSPEC;
}
#endif
static inline unsigned long
__cls_set_class(unsigned long *clp, unsigned long cl)
{
net_sched: avoid calling tcf_unbind_filter() in call_rcu callback This fixes the following crash: [ 63.976822] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 63.980094] CPU: 1 PID: 15 Comm: ksoftirqd/1 Not tainted 3.17.0-rc6+ #648 [ 63.980094] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 63.980094] task: ffff880117dea690 ti: ffff880117dfc000 task.ti: ffff880117dfc000 [ 63.980094] RIP: 0010:[<ffffffff817e6d07>] [<ffffffff817e6d07>] u32_destroy_key+0x27/0x6d [ 63.980094] RSP: 0018:ffff880117dffcc0 EFLAGS: 00010202 [ 63.980094] RAX: ffff880117dea690 RBX: ffff8800d02e0820 RCX: 0000000000000000 [ 63.980094] RDX: 0000000000000001 RSI: 0000000000000002 RDI: 6b6b6b6b6b6b6b6b [ 63.980094] RBP: ffff880117dffcd0 R08: 0000000000000000 R09: 0000000000000000 [ 63.980094] R10: 00006c0900006ba8 R11: 00006ba100006b9d R12: 0000000000000001 [ 63.980094] R13: ffff8800d02e0898 R14: ffffffff817e6d4d R15: ffff880117387a30 [ 63.980094] FS: 0000000000000000(0000) GS:ffff88011a800000(0000) knlGS:0000000000000000 [ 63.980094] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 63.980094] CR2: 00007f07e6732fed CR3: 000000011665b000 CR4: 00000000000006e0 [ 63.980094] Stack: [ 63.980094] ffff88011a9cd300 ffffffff82051ac0 ffff880117dffce0 ffffffff817e6d68 [ 63.980094] ffff880117dffd70 ffffffff810cb4c7 ffffffff810cb3cd ffff880117dfffd8 [ 63.980094] ffff880117dea690 ffff880117dea690 ffff880117dfffd8 000000000000000a [ 63.980094] Call Trace: [ 63.980094] [<ffffffff817e6d68>] u32_delete_key_freepf_rcu+0x1b/0x1d [ 63.980094] [<ffffffff810cb4c7>] rcu_process_callbacks+0x3bb/0x691 [ 63.980094] [<ffffffff810cb3cd>] ? rcu_process_callbacks+0x2c1/0x691 [ 63.980094] [<ffffffff817e6d4d>] ? u32_destroy_key+0x6d/0x6d [ 63.980094] [<ffffffff810780a4>] __do_softirq+0x142/0x323 [ 63.980094] [<ffffffff810782a8>] run_ksoftirqd+0x23/0x53 [ 63.980094] [<ffffffff81092126>] smpboot_thread_fn+0x203/0x221 [ 63.980094] [<ffffffff81091f23>] ? smpboot_unpark_thread+0x33/0x33 [ 63.980094] [<ffffffff8108e44d>] kthread+0xc9/0xd1 [ 63.980094] [<ffffffff819e00ea>] ? do_wait_for_common+0xf8/0x125 [ 63.980094] [<ffffffff8108e384>] ? __kthread_parkme+0x61/0x61 [ 63.980094] [<ffffffff819e43ec>] ret_from_fork+0x7c/0xb0 [ 63.980094] [<ffffffff8108e384>] ? __kthread_parkme+0x61/0x61 tp could be freed in call_rcu callback too, the order is not guaranteed. John Fastabend says: ==================== Its worth noting why this is safe. Any running schedulers will either read the valid class field or it will be zeroed. All schedulers today when the class is 0 do a lookup using the same call used by the tcf_exts_bind(). So even if we have a running classifier hit the null class pointer it will do a lookup and get to the same result. This is particularly fragile at the moment because the only way to verify this is to audit the schedulers call sites. ==================== Cc: John Fastabend <john.r.fastabend@intel.com> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Acked-by: John Fastabend <john.r.fastabend@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-10-01 01:07:24 +02:00
return xchg(clp, cl);
}
static inline unsigned long
cls_set_class(struct tcf_proto *tp, unsigned long *clp,
unsigned long cl)
{
unsigned long old_cl;
tcf_tree_lock(tp);
old_cl = __cls_set_class(clp, cl);
tcf_tree_unlock(tp);
return old_cl;
}
static inline void
tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base)
{
unsigned long cl;
cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, r->classid);
cl = cls_set_class(tp, &r->class, cl);
if (cl)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
}
static inline void
tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
{
unsigned long cl;
if ((cl = __cls_set_class(&r->class, 0)) != 0)
tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
}
struct tcf_exts {
#ifdef CONFIG_NET_CLS_ACT
__u32 type; /* for backward compat(TCA_OLD_COMPAT) */
int nr_actions;
struct tc_action **actions;
#endif
/* Map to export classifier specific extension TLV types to the
* generic extensions API. Unsupported extensions must be set to 0.
*/
int action;
int police;
};
static inline int tcf_exts_init(struct tcf_exts *exts, int action, int police)
{
#ifdef CONFIG_NET_CLS_ACT
exts->type = 0;
exts->nr_actions = 0;
exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *),
GFP_KERNEL);
if (!exts->actions)
return -ENOMEM;
#endif
exts->action = action;
exts->police = police;
return 0;
}
static inline void tcf_exts_to_list(const struct tcf_exts *exts,
struct list_head *actions)
{
#ifdef CONFIG_NET_CLS_ACT
int i;
for (i = 0; i < exts->nr_actions; i++) {
struct tc_action *a = exts->actions[i];
list_add_tail(&a->list, actions);
}
#endif
}
static inline void
tcf_exts_stats_update(const struct tcf_exts *exts,
u64 bytes, u64 packets, u64 lastuse)
{
#ifdef CONFIG_NET_CLS_ACT
int i;
preempt_disable();
for (i = 0; i < exts->nr_actions; i++) {
struct tc_action *a = exts->actions[i];
tcf_action_stats_update(a, bytes, packets, lastuse);
}
preempt_enable();
#endif
}
/**
* tcf_exts_has_actions - check if at least one action is present
* @exts: tc filter extensions handle
*
* Returns true if at least one action is present.
*/
static inline bool tcf_exts_has_actions(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
return exts->nr_actions;
#else
return false;
#endif
}
/**
* tcf_exts_has_one_action - check if exactly one action is present
* @exts: tc filter extensions handle
*
* Returns true if exactly one action is present.
*/
static inline bool tcf_exts_has_one_action(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
return exts->nr_actions == 1;
#else
return false;
#endif
}
/**
* tcf_exts_exec - execute tc filter extensions
* @skb: socket buffer
* @exts: tc filter extensions handle
* @res: desired result
*
* Executes all configured extensions. Returns TC_ACT_OK on a normal execution,
* a negative number if the filter must be considered unmatched or
* a positive action code (TC_ACT_*) which must be returned to the
* underlying layer.
*/
static inline int
tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
return tcf_action_exec(skb, exts->actions, exts->nr_actions, res);
#endif
return TC_ACT_OK;
}
int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
struct nlattr **tb, struct nlattr *rate_tlv,
struct tcf_exts *exts, bool ovr);
void tcf_exts_destroy(struct tcf_exts *exts);
void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src);
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
/**
* struct tcf_pkt_info - packet information
*/
struct tcf_pkt_info {
unsigned char * ptr;
int nexthdr;
};
#ifdef CONFIG_NET_EMATCH
struct tcf_ematch_ops;
/**
* struct tcf_ematch - extended match (ematch)
*
* @matchid: identifier to allow userspace to reidentify a match
* @flags: flags specifying attributes and the relation to other matches
* @ops: the operations lookup table of the corresponding ematch module
* @datalen: length of the ematch specific configuration data
* @data: ematch specific data
*/
struct tcf_ematch {
struct tcf_ematch_ops * ops;
unsigned long data;
unsigned int datalen;
u16 matchid;
u16 flags;
struct net *net;
};
static inline int tcf_em_is_container(struct tcf_ematch *em)
{
return !em->ops;
}
static inline int tcf_em_is_simple(struct tcf_ematch *em)
{
return em->flags & TCF_EM_SIMPLE;
}
static inline int tcf_em_is_inverted(struct tcf_ematch *em)
{
return em->flags & TCF_EM_INVERT;
}
static inline int tcf_em_last_match(struct tcf_ematch *em)
{
return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END;
}
static inline int tcf_em_early_end(struct tcf_ematch *em, int result)
{
if (tcf_em_last_match(em))
return 1;
if (result == 0 && em->flags & TCF_EM_REL_AND)
return 1;
if (result != 0 && em->flags & TCF_EM_REL_OR)
return 1;
return 0;
}
/**
* struct tcf_ematch_tree - ematch tree handle
*
* @hdr: ematch tree header supplied by userspace
* @matches: array of ematches
*/
struct tcf_ematch_tree {
struct tcf_ematch_tree_hdr hdr;
struct tcf_ematch * matches;
};
/**
* struct tcf_ematch_ops - ematch module operations
*
* @kind: identifier (kind) of this ematch module
* @datalen: length of expected configuration data (optional)
* @change: called during validation (optional)
* @match: called during ematch tree evaluation, must return 1/0
* @destroy: called during destroyage (optional)
* @dump: called during dumping process (optional)
* @owner: owner, must be set to THIS_MODULE
* @link: link to previous/next ematch module (internal use)
*/
struct tcf_ematch_ops {
int kind;
int datalen;
int (*change)(struct net *net, void *,
int, struct tcf_ematch *);
int (*match)(struct sk_buff *, struct tcf_ematch *,
struct tcf_pkt_info *);
void (*destroy)(struct tcf_ematch *);
int (*dump)(struct sk_buff *, struct tcf_ematch *);
struct module *owner;
struct list_head link;
};
int tcf_em_register(struct tcf_ematch_ops *);
void tcf_em_unregister(struct tcf_ematch_ops *);
int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *,
struct tcf_ematch_tree *);
void tcf_em_tree_destroy(struct tcf_ematch_tree *);
int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int);
int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *,
struct tcf_pkt_info *);
/**
* tcf_em_tree_match - evaulate an ematch tree
*
* @skb: socket buffer of the packet in question
* @tree: ematch tree to be used for evaluation
* @info: packet information examined by classifier
*
* This function matches @skb against the ematch tree in @tree by going
* through all ematches respecting their logic relations returning
* as soon as the result is obvious.
*
* Returns 1 if the ematch tree as-one matches, no ematches are configured
* or ematch is not enabled in the kernel, otherwise 0 is returned.
*/
static inline int tcf_em_tree_match(struct sk_buff *skb,
struct tcf_ematch_tree *tree,
struct tcf_pkt_info *info)
{
if (tree->hdr.nmatches)
return __tcf_em_tree_match(skb, tree, info);
else
return 1;
}
#define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind))
#else /* CONFIG_NET_EMATCH */
struct tcf_ematch_tree {
};
#define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0)
#define tcf_em_tree_destroy(t) do { (void)(t); } while(0)
#define tcf_em_tree_dump(skb, t, tlv) (0)
#define tcf_em_tree_match(skb, t, info) ((void)(info), 1)
#endif /* CONFIG_NET_EMATCH */
static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
{
switch (layer) {
case TCF_LAYER_LINK:
return skb->data;
case TCF_LAYER_NETWORK:
return skb_network_header(skb);
case TCF_LAYER_TRANSPORT:
return skb_transport_header(skb);
}
return NULL;
}
static inline int tcf_valid_offset(const struct sk_buff *skb,
const unsigned char *ptr, const int len)
{
return likely((ptr + len) <= skb_tail_pointer(skb) &&
ptr >= skb->head &&
(ptr <= (ptr + len)));
}
#ifdef CONFIG_NET_CLS_IND
#include <net/net_namespace.h>
static inline int
tcf_change_indev(struct net *net, struct nlattr *indev_tlv)
{
char indev[IFNAMSIZ];
struct net_device *dev;
if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ)
return -EINVAL;
dev = __dev_get_by_name(net, indev);
if (!dev)
return -ENODEV;
return dev->ifindex;
}
static inline bool
tcf_match_indev(struct sk_buff *skb, int ifindex)
{
if (!ifindex)
return true;
if (!skb->skb_iif)
return false;
return ifindex == skb->skb_iif;
}
#endif /* CONFIG_NET_CLS_IND */
int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type,
void *type_data, bool err_stop);
struct tc_cls_common_offload {
u32 chain_index;
__be16 protocol;
u32 prio;
u32 classid;
};
static inline void
tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
const struct tcf_proto *tp)
{
cls_common->chain_index = tp->chain->index;
cls_common->protocol = tp->protocol;
cls_common->prio = tp->prio;
cls_common->classid = tp->classid;
}
struct tc_cls_u32_knode {
struct tcf_exts *exts;
struct tc_u32_sel *sel;
u32 handle;
u32 val;
u32 mask;
u32 link_handle;
u8 fshift;
};
struct tc_cls_u32_hnode {
u32 handle;
u32 prio;
unsigned int divisor;
};
enum tc_clsu32_command {
TC_CLSU32_NEW_KNODE,
TC_CLSU32_REPLACE_KNODE,
TC_CLSU32_DELETE_KNODE,
TC_CLSU32_NEW_HNODE,
TC_CLSU32_REPLACE_HNODE,
TC_CLSU32_DELETE_HNODE,
};
struct tc_cls_u32_offload {
struct tc_cls_common_offload common;
/* knode values */
enum tc_clsu32_command command;
union {
struct tc_cls_u32_knode knode;
struct tc_cls_u32_hnode hnode;
};
};
static inline bool tc_can_offload(const struct net_device *dev)
{
if (!(dev->features & NETIF_F_HW_TC))
return false;
net: sched: cls_u32 add bit to specify software only rules In the initial implementation the only way to stop a rule from being inserted into the hardware table was via the device feature flag. However this doesn't work well when working on an end host system where packets are expect to hit both the hardware and software datapaths. For example we can imagine a rule that will match an IP address and increment a field. If we install this rule in both hardware and software we may increment the field twice. To date we have only added support for the drop action so we have been able to ignore these cases. But as we extend the action support we will hit this example plus more such cases. Arguably these are not even corner cases in many working systems these cases will be common. To avoid forcing the driver to always abort (i.e. the above example) this patch adds a flag to add a rule in software only. A careful user can use this flag to build software and hardware datapaths that work together. One example we have found particularly useful is to use hardware resources to set the skb->mark on the skb when the match may be expensive to run in software but a mark lookup in a hash table is cheap. The idea here is hardware can do in one lookup what the u32 classifier may need to traverse multiple lists and hash tables to compute. The flag is only passed down on inserts. On deletion to avoid stale references in hardware we always try to remove a rule if it exists. The flags field is part of the classifier specific options. Although it is tempting to lift this into the generic structure doing this proves difficult do to how the tc netlink attributes are implemented along with how the dump/change routines are called. There is also precedence for putting seemingly generic pieces in the specific classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So although not ideal I've left FLAGS in the u32 options as well as it simplifies the code greatly and user space has already learned how to manage these bits ala 'tc' tool. Another thing if trying to update a rule we require the flags to be unchanged. This is to force user space, software u32 and the hardware u32 to keep in sync. Thanks to Simon Horman for catching this case. Signed-off-by: John Fastabend <john.r.fastabend@intel.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-02-26 16:54:39 +01:00
if (!dev->netdev_ops->ndo_setup_tc)
return false;
return true;
}
static inline bool tc_skip_hw(u32 flags)
{
return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false;
}
static inline bool tc_should_offload(const struct net_device *dev, u32 flags)
{
if (tc_skip_hw(flags))
return false;
return tc_can_offload(dev);
}
static inline bool tc_skip_sw(u32 flags)
{
return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false;
}
/* SKIP_HW and SKIP_SW are mutually exclusive flags. */
static inline bool tc_flags_valid(u32 flags)
{
if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))
return false;
if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)))
return false;
return true;
}
static inline bool tc_in_hw(u32 flags)
{
return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false;
}
enum tc_fl_command {
TC_CLSFLOWER_REPLACE,
TC_CLSFLOWER_DESTROY,
TC_CLSFLOWER_STATS,
};
struct tc_cls_flower_offload {
struct tc_cls_common_offload common;
enum tc_fl_command command;
unsigned long cookie;
struct flow_dissector *dissector;
struct fl_flow_key *mask;
struct fl_flow_key *key;
struct tcf_exts *exts;
};
enum tc_matchall_command {
TC_CLSMATCHALL_REPLACE,
TC_CLSMATCHALL_DESTROY,
};
struct tc_cls_matchall_offload {
struct tc_cls_common_offload common;
enum tc_matchall_command command;
struct tcf_exts *exts;
unsigned long cookie;
};
enum tc_clsbpf_command {
TC_CLSBPF_ADD,
TC_CLSBPF_REPLACE,
TC_CLSBPF_DESTROY,
TC_CLSBPF_STATS,
};
struct tc_cls_bpf_offload {
struct tc_cls_common_offload common;
enum tc_clsbpf_command command;
struct tcf_exts *exts;
struct bpf_prog *prog;
const char *name;
bool exts_integrated;
u32 gen_flags;
};
mqprio: Introduce new hardware offload mode and shaper in mqprio The offload types currently supported in mqprio are 0 (no offload) and 1 (offload only TCs) by setting these values for the 'hw' option. If offloads are supported by setting the 'hw' option to 1, the default offload mode is 'dcb' where only the TC values are offloaded to the device. This patch introduces a new hardware offload mode called 'channel' with 'hw' set to 1 in mqprio which makes full use of the mqprio options, the TCs, the queue configurations and the QoS parameters for the TCs. This is achieved through a new netlink attribute for the 'mode' option which takes values such as 'dcb' (default) and 'channel'. The 'channel' mode also supports QoS attributes for traffic class such as minimum and maximum values for bandwidth rate limits. This patch enables configuring additional HW shaper attributes associated with a traffic class. Currently the shaper for bandwidth rate limiting is supported which takes options such as minimum and maximum bandwidth rates and are offloaded to the hardware in the 'channel' mode. The min and max limits for bandwidth rates are provided by the user along with the TCs and the queue configurations when creating the mqprio qdisc. The interface can be extended to support new HW shapers in future through the 'shaper' attribute. Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading mqprio queue options and use this to be shared between the kernel and device driver. This contains a copy of the existing data structure for mqprio queue options. This new data structure can be extended when adding new attributes for traffic class such as mode, shaper, shaper parameters (bandwidth rate limits). The existing data structure for mqprio queue options will be shared between the kernel and userspace. Example: queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\ min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit To dump the bandwidth rates: qdisc mqprio 804a: root tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 queues:(0:3) (4:7) mode:channel shaper:bw_rlimit min_rate:1Gbit 2Gbit max_rate:4Gbit 5Gbit Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com> Tested-by: Andrew Bowers <andrewx.bowers@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
2017-09-07 13:00:06 +02:00
struct tc_mqprio_qopt_offload {
/* struct tc_mqprio_qopt must always be the first element */
struct tc_mqprio_qopt qopt;
u16 mode;
u16 shaper;
u32 flags;
u64 min_rate[TC_QOPT_MAX_QUEUE];
u64 max_rate[TC_QOPT_MAX_QUEUE];
};
/* This structure holds cookie structure that is passed from user
* to the kernel for actions and classifiers
*/
struct tc_cookie {
u8 *data;
u32 len;
};
#endif