2010-02-03 17:17:06 +01:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2010 Patrick McHardy <kaber@trash.net>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*/
|
2011-04-21 11:05:14 +02:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
2010-02-03 17:17:06 +01:00
|
|
|
#include <linux/module.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 09:04:11 +01:00
|
|
|
#include <linux/gfp.h>
|
2010-02-03 17:17:06 +01:00
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/netfilter_ipv4/ip_tables.h>
|
|
|
|
#include <linux/netfilter_ipv6/ip6_tables.h>
|
|
|
|
#include <linux/netfilter/x_tables.h>
|
|
|
|
#include <linux/netfilter/xt_CT.h>
|
|
|
|
#include <net/netfilter/nf_conntrack.h>
|
2012-03-23 00:02:07 +01:00
|
|
|
#include <net/netfilter/nf_conntrack_l4proto.h>
|
2010-02-03 17:17:06 +01:00
|
|
|
#include <net/netfilter/nf_conntrack_helper.h>
|
|
|
|
#include <net/netfilter/nf_conntrack_ecache.h>
|
2012-02-29 02:19:19 +01:00
|
|
|
#include <net/netfilter/nf_conntrack_timeout.h>
|
2010-02-15 18:13:33 +01:00
|
|
|
#include <net/netfilter/nf_conntrack_zones.h>
|
2010-02-03 17:17:06 +01:00
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
|
2010-02-03 17:17:06 +01:00
|
|
|
{
|
|
|
|
/* Previously seen (loopback)? Ignore. */
|
|
|
|
if (skb->nfct != NULL)
|
|
|
|
return XT_CONTINUE;
|
|
|
|
|
2013-05-22 13:10:57 +02:00
|
|
|
/* special case the untracked ct : we want the percpu object */
|
|
|
|
if (!ct)
|
|
|
|
ct = nf_ct_untracked_get();
|
2010-02-03 17:17:06 +01:00
|
|
|
atomic_inc(&ct->ct_general.use);
|
|
|
|
skb->nfct = &ct->ct_general;
|
|
|
|
skb->nfctinfo = IP_CT_NEW;
|
|
|
|
|
|
|
|
return XT_CONTINUE;
|
|
|
|
}
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
static unsigned int xt_ct_target_v0(struct sk_buff *skb,
|
2012-02-29 02:19:19 +01:00
|
|
|
const struct xt_action_param *par)
|
|
|
|
{
|
2013-01-30 20:20:39 +01:00
|
|
|
const struct xt_ct_target_info *info = par->targinfo;
|
2012-02-29 02:19:19 +01:00
|
|
|
struct nf_conn *ct = info->ct;
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
return xt_ct_target(skb, ct);
|
|
|
|
}
|
2012-02-29 02:19:19 +01:00
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
static unsigned int xt_ct_target_v1(struct sk_buff *skb,
|
|
|
|
const struct xt_action_param *par)
|
|
|
|
{
|
|
|
|
const struct xt_ct_target_info_v1 *info = par->targinfo;
|
|
|
|
struct nf_conn *ct = info->ct;
|
2012-02-29 02:19:19 +01:00
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
return xt_ct_target(skb, ct);
|
2012-02-29 02:19:19 +01:00
|
|
|
}
|
|
|
|
|
2010-02-03 17:17:06 +01:00
|
|
|
static u8 xt_ct_find_proto(const struct xt_tgchk_param *par)
|
|
|
|
{
|
2010-03-11 00:38:44 +01:00
|
|
|
if (par->family == NFPROTO_IPV4) {
|
2010-02-03 17:17:06 +01:00
|
|
|
const struct ipt_entry *e = par->entryinfo;
|
|
|
|
|
|
|
|
if (e->ip.invflags & IPT_INV_PROTO)
|
|
|
|
return 0;
|
|
|
|
return e->ip.proto;
|
2010-03-11 00:38:44 +01:00
|
|
|
} else if (par->family == NFPROTO_IPV6) {
|
2010-02-03 17:17:06 +01:00
|
|
|
const struct ip6t_entry *e = par->entryinfo;
|
|
|
|
|
|
|
|
if (e->ipv6.invflags & IP6T_INV_PROTO)
|
|
|
|
return 0;
|
|
|
|
return e->ipv6.proto;
|
|
|
|
} else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-08-28 02:53:14 +02:00
|
|
|
static int
|
|
|
|
xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
|
|
|
|
const struct xt_tgchk_param *par)
|
|
|
|
{
|
|
|
|
struct nf_conntrack_helper *helper;
|
|
|
|
struct nf_conn_help *help;
|
|
|
|
u8 proto;
|
|
|
|
|
|
|
|
proto = xt_ct_find_proto(par);
|
|
|
|
if (!proto) {
|
|
|
|
pr_info("You must specify a L4 protocol, and not use "
|
|
|
|
"inversions on it.\n");
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
helper = nf_conntrack_helper_try_module_get(helper_name, par->family,
|
|
|
|
proto);
|
|
|
|
if (helper == NULL) {
|
|
|
|
pr_info("No such helper \"%s\"\n", helper_name);
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
|
|
|
|
if (help == NULL) {
|
|
|
|
module_put(helper->me);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
help->helper = helper;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-04-03 14:50:07 +02:00
|
|
|
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
/*
 * Release a timeout policy reference through nf_ct_timeout_put_hook.
 * Does nothing when no hook is registered. The hook pointer is read with
 * rcu_dereference(); the caller in this file invokes this inside its
 * rcu_read_lock() section.
 */
static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout)
{
	typeof(nf_ct_timeout_put_hook) put_fn;

	put_fn = rcu_dereference(nf_ct_timeout_put_hook);
	if (!put_fn)
		return;

	put_fn(timeout);
}
#endif
|
|
|
|
|
2012-08-28 02:53:14 +02:00
|
|
|
static int
|
|
|
|
xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
|
|
|
|
const char *timeout_name)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
|
|
|
|
typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
|
|
|
|
struct ctnl_timeout *timeout;
|
|
|
|
struct nf_conn_timeout *timeout_ext;
|
|
|
|
struct nf_conntrack_l4proto *l4proto;
|
|
|
|
int ret = 0;
|
2012-10-11 10:49:12 +02:00
|
|
|
u8 proto;
|
2012-08-28 02:53:14 +02:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
|
|
|
|
if (timeout_find_get == NULL) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
pr_info("Timeout policy base is empty\n");
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-10-11 10:49:12 +02:00
|
|
|
proto = xt_ct_find_proto(par);
|
|
|
|
if (!proto) {
|
2012-08-28 02:53:14 +02:00
|
|
|
ret = -EINVAL;
|
2012-10-11 10:49:12 +02:00
|
|
|
pr_info("You must specify a L4 protocol, and not use "
|
|
|
|
"inversions on it.\n");
|
2012-08-28 02:53:14 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
timeout = timeout_find_get(timeout_name);
|
|
|
|
if (timeout == NULL) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
pr_info("No such timeout policy \"%s\"\n", timeout_name);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (timeout->l3num != par->family) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
pr_info("Timeout policy `%s' can only be used by L3 protocol "
|
|
|
|
"number %d\n", timeout_name, timeout->l3num);
|
|
|
|
goto err_put_timeout;
|
|
|
|
}
|
|
|
|
/* Make sure the timeout policy matches any existing protocol tracker,
|
|
|
|
* otherwise default to generic.
|
|
|
|
*/
|
2012-10-11 10:49:12 +02:00
|
|
|
l4proto = __nf_ct_l4proto_find(par->family, proto);
|
2012-08-28 02:53:14 +02:00
|
|
|
if (timeout->l4proto->l4proto != l4proto->l4proto) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
pr_info("Timeout policy `%s' can only be used by L4 protocol "
|
|
|
|
"number %d\n",
|
|
|
|
timeout_name, timeout->l4proto->l4proto);
|
|
|
|
goto err_put_timeout;
|
|
|
|
}
|
|
|
|
timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
|
|
|
|
if (timeout_ext == NULL)
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
|
|
|
err_put_timeout:
|
|
|
|
__xt_ct_tg_timeout_put(timeout);
|
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
|
|
#else
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
netfilter: nf_conntrack: add direction support for zones
This work adds a direction parameter to netfilter zones, so identity
separation can be performed only in original/reply or both directions
(default). This basically opens up the possibility of doing NAT with
conflicting IP address/port tuples from multiple, isolated tenants
on a host (e.g. from a netns) without requiring each tenant to NAT
twice resp. to use its own dedicated IP address to SNAT to, meaning
overlapping tuples can be made unique with the zone identifier in
original direction, where the NAT engine will then allocate a unique
tuple in the commonly shared default zone for the reply direction.
In some restricted, local DNAT cases, also port redirection could be
used for making the reply traffic unique w/o requiring SNAT.
The consensus we've reached and discussed at NFWS and since the initial
implementation [1] was to directly integrate the direction meta data
into the existing zones infrastructure, as opposed to the ct->mark
approach we proposed initially.
As we pass the nf_conntrack_zone object directly around, we don't have
to touch all call-sites, but only those, that contain equality checks
of zones. Thus, based on the current direction (original or reply),
we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID.
CT expectations are direction-agnostic entities when expectations are
being compared among themselves, so we can only use the identifier
in this case.
Note that zone identifiers can not be included into the hash mix
anymore as they don't contain a "stable" value that would be equal
for both directions at all times, f.e. if only zone->id would
unconditionally be xor'ed into the table slot hash, then replies won't
find the corresponding conntracking entry anymore.
If no particular direction is specified when configuring zones, the
behaviour is exactly as we expect currently (both directions).
Support has been added for the CT netlink interface as well as the
x_tables raw CT target, which both already offer existing interfaces
to user space for the configuration of zones.
Below a minimal, simplified collision example (script in [2]) with
netperf sessions:
+--- tenant-1 ---+ mark := 1
| netperf |--+
+----------------+ | CT zone := mark [ORIGINAL]
[ip,sport] := X +--------------+ +--- gateway ---+
| mark routing |--| SNAT |-- ... +
+--------------+ +---------------+ |
+--- tenant-2 ---+ | ~~~|~~~
| netperf |--+ +-----------+ |
+----------------+ mark := 2 | netserver |------ ... +
[ip,sport] := X +-----------+
[ip,port] := Y
On the gateway netns, example:
iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL
iptables -t nat -A POSTROUTING -o <dev> -j SNAT --to-source <ip> --random-fully
iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark
iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark
conntrack dump from gateway netns:
netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns
tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1
src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024
[ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2
src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555
[ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1
src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438
[ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2
src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889
[ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2
Taking this further, test script in [2] creates 200 tenants and runs
original-tuple colliding netperf sessions each. A conntrack -L dump in
the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED
state as expected.
I also did run various other tests with some permutations of the script,
to mention some: SNAT in random/random-fully/persistent mode, no zones (no
overlaps), static zones (original, reply, both directions), etc.
[1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/
[2] https://paste.fedoraproject.org/242835/65657871/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2015-08-14 16:03:39 +02:00
|
|
|
static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
|
|
|
|
{
|
|
|
|
switch (info->flags & (XT_CT_ZONE_DIR_ORIG |
|
|
|
|
XT_CT_ZONE_DIR_REPL)) {
|
|
|
|
case XT_CT_ZONE_DIR_ORIG:
|
|
|
|
return NF_CT_ZONE_DIR_ORIG;
|
|
|
|
case XT_CT_ZONE_DIR_REPL:
|
|
|
|
return NF_CT_ZONE_DIR_REPL;
|
|
|
|
default:
|
|
|
|
return NF_CT_DEFAULT_ZONE_DIR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
|
|
|
|
struct xt_ct_target_info_v1 *info)
|
2012-02-29 02:19:19 +01:00
|
|
|
{
|
2015-08-08 21:40:01 +02:00
|
|
|
struct nf_conntrack_zone zone;
|
2012-02-29 02:19:19 +01:00
|
|
|
struct nf_conn *ct;
|
2013-01-10 12:42:15 +01:00
|
|
|
int ret = -EOPNOTSUPP;
|
2012-08-28 02:53:14 +02:00
|
|
|
|
2012-02-29 02:19:19 +01:00
|
|
|
if (info->flags & XT_CT_NOTRACK) {
|
2013-05-22 13:10:57 +02:00
|
|
|
ct = NULL;
|
2012-02-29 02:19:19 +01:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef CONFIG_NF_CONNTRACK_ZONES
|
netfilter: nf_conntrack: add direction support for zones
This work adds a direction parameter to netfilter zones, so identity
separation can be performed only in original/reply or both directions
(default). This basically opens up the possibility of doing NAT with
conflicting IP address/port tuples from multiple, isolated tenants
on a host (e.g. from a netns) without requiring each tenant to NAT
twice resp. to use its own dedicated IP address to SNAT to, meaning
overlapping tuples can be made unique with the zone identifier in
original direction, where the NAT engine will then allocate a unique
tuple in the commonly shared default zone for the reply direction.
In some restricted, local DNAT cases, also port redirection could be
used for making the reply traffic unique w/o requiring SNAT.
The consensus we've reached and discussed at NFWS and since the initial
implementation [1] was to directly integrate the direction meta data
into the existing zones infrastructure, as opposed to the ct->mark
approach we proposed initially.
As we pass the nf_conntrack_zone object directly around, we don't have
to touch all call-sites, but only those, that contain equality checks
of zones. Thus, based on the current direction (original or reply),
we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID.
CT expectations are direction-agnostic entities when expectations are
being compared among themselves, so we can only use the identifier
in this case.
Note that zone identifiers can not be included into the hash mix
anymore as they don't contain a "stable" value that would be equal
for both directions at all times, f.e. if only zone->id would
unconditionally be xor'ed into the table slot hash, then replies won't
find the corresponding conntracking entry anymore.
If no particular direction is specified when configuring zones, the
behaviour is exactly as we expect currently (both directions).
Support has been added for the CT netlink interface as well as the
x_tables raw CT target, which both already offer existing interfaces
to user space for the configuration of zones.
Below a minimal, simplified collision example (script in [2]) with
netperf sessions:
+--- tenant-1 ---+ mark := 1
| netperf |--+
+----------------+ | CT zone := mark [ORIGINAL]
[ip,sport] := X +--------------+ +--- gateway ---+
| mark routing |--| SNAT |-- ... +
+--------------+ +---------------+ |
+--- tenant-2 ---+ | ~~~|~~~
| netperf |--+ +-----------+ |
+----------------+ mark := 2 | netserver |------ ... +
[ip,sport] := X +-----------+
[ip,port] := Y
On the gateway netns, example:
iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL
iptables -t nat -A POSTROUTING -o <dev> -j SNAT --to-source <ip> --random-fully
iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark
iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark
conntrack dump from gateway netns:
netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns
tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1
src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024
[ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2
src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555
[ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1
src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438
[ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2
src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889
[ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2
Taking this further, test script in [2] creates 200 tenants and runs
original-tuple colliding netperf sessions each. A conntrack -L dump in
the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED
state as expected.
I also did run various other tests with some permutations of the script,
to mention some: SNAT in random/random-fully/persistent mode, no zones (no
overlaps), static zones (original, reply, both directions), etc.
[1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/
[2] https://paste.fedoraproject.org/242835/65657871/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2015-08-14 16:03:39 +02:00
|
|
|
if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
|
2015-08-14 16:03:40 +02:00
|
|
|
XT_CT_ZONE_DIR_REPL |
|
|
|
|
XT_CT_ZONE_MARK))
|
2012-02-29 02:19:19 +01:00
|
|
|
goto err1;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
ret = nf_ct_l3proto_try_module_get(par->family);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err1;
|
|
|
|
|
2015-08-08 21:40:01 +02:00
|
|
|
memset(&zone, 0, sizeof(zone));
|
|
|
|
zone.id = info->zone;
|
netfilter: nf_conntrack: add direction support for zones
This work adds a direction parameter to netfilter zones, so identity
separation can be performed only in original/reply or both directions
(default). This basically opens up the possibility of doing NAT with
conflicting IP address/port tuples from multiple, isolated tenants
on a host (e.g. from a netns) without requiring each tenant to NAT
twice resp. to use its own dedicated IP address to SNAT to, meaning
overlapping tuples can be made unique with the zone identifier in
original direction, where the NAT engine will then allocate a unique
tuple in the commonly shared default zone for the reply direction.
In some restricted, local DNAT cases, also port redirection could be
used for making the reply traffic unique w/o requiring SNAT.
The consensus we've reached and discussed at NFWS and since the initial
implementation [1] was to directly integrate the direction meta data
into the existing zones infrastructure, as opposed to the ct->mark
approach we proposed initially.
As we pass the nf_conntrack_zone object directly around, we don't have
to touch all call-sites, but only those, that contain equality checks
of zones. Thus, based on the current direction (original or reply),
we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID.
CT expectations are direction-agnostic entities when expectations are
being compared among themselves, so we can only use the identifier
in this case.
Note that zone identifiers can not be included into the hash mix
anymore as they don't contain a "stable" value that would be equal
for both directions at all times, f.e. if only zone->id would
unconditionally be xor'ed into the table slot hash, then replies won't
find the corresponding conntracking entry anymore.
If no particular direction is specified when configuring zones, the
behaviour is exactly as we expect currently (both directions).
Support has been added for the CT netlink interface as well as the
x_tables raw CT target, which both already offer existing interfaces
to user space for the configuration of zones.
Below a minimal, simplified collision example (script in [2]) with
netperf sessions:
+--- tenant-1 ---+ mark := 1
| netperf |--+
+----------------+ | CT zone := mark [ORIGINAL]
[ip,sport] := X +--------------+ +--- gateway ---+
| mark routing |--| SNAT |-- ... +
+--------------+ +---------------+ |
+--- tenant-2 ---+ | ~~~|~~~
| netperf |--+ +-----------+ |
+----------------+ mark := 2 | netserver |------ ... +
[ip,sport] := X +-----------+
[ip,port] := Y
On the gateway netns, example:
iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL
iptables -t nat -A POSTROUTING -o <dev> -j SNAT --to-source <ip> --random-fully
iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark
iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark
conntrack dump from gateway netns:
netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns
tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1
src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024
[ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2
src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555
[ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1
src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438
[ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1
tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2
src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889
[ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2
Taking this further, test script in [2] creates 200 tenants and runs
original-tuple colliding netperf sessions each. A conntrack -L dump in
the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED
state as expected.
I also did run various other tests with some permutations of the script,
to mention some: SNAT in random/random-fully/persistent mode, no zones (no
overlaps), static zones (original, reply, both directions), etc.
[1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/
[2] https://paste.fedoraproject.org/242835/65657871/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2015-08-14 16:03:39 +02:00
|
|
|
zone.dir = xt_ct_flags_to_dir(info);
|
2015-08-14 16:03:40 +02:00
|
|
|
if (info->flags & XT_CT_ZONE_MARK)
|
|
|
|
zone.flags |= NF_CT_FLAG_MARK;
|
2015-08-08 21:40:01 +02:00
|
|
|
|
|
|
|
ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
|
2015-07-28 00:42:28 +02:00
|
|
|
if (!ct) {
|
|
|
|
ret = -ENOMEM;
|
2012-02-29 02:19:19 +01:00
|
|
|
goto err2;
|
2015-07-28 00:42:28 +02:00
|
|
|
}
|
2012-02-29 02:19:19 +01:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
if ((info->ct_events || info->exp_events) &&
|
|
|
|
!nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
|
2014-01-02 10:03:45 +01:00
|
|
|
GFP_KERNEL)) {
|
|
|
|
ret = -EINVAL;
|
2012-02-29 02:19:19 +01:00
|
|
|
goto err3;
|
2014-01-02 10:03:45 +01:00
|
|
|
}
|
2012-02-29 02:19:19 +01:00
|
|
|
|
|
|
|
if (info->helper[0]) {
|
2012-08-28 02:53:14 +02:00
|
|
|
ret = xt_ct_set_helper(ct, info->helper, par);
|
|
|
|
if (ret < 0)
|
2012-06-07 12:11:50 +02:00
|
|
|
goto err3;
|
2012-02-29 02:19:19 +01:00
|
|
|
}
|
|
|
|
|
2012-04-27 02:00:50 +02:00
|
|
|
if (info->timeout[0]) {
|
2012-08-28 02:53:14 +02:00
|
|
|
ret = xt_ct_set_timeout(ct, par, info->timeout);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err3;
|
2012-02-29 02:19:19 +01:00
|
|
|
}
|
netfilter: fix netns dependencies with conntrack templates
Quoting Daniel Borkmann:
"When adding connection tracking template rules to a netns, f.e. to
configure netfilter zones, the kernel will endlessly busy-loop as soon
as we try to delete the given netns in case there's at least one
template present, which is problematic i.e. if there is such bravery that
the priviledged user inside the netns is assumed untrusted.
Minimal example:
ip netns add foo
ip netns exec foo iptables -t raw -A PREROUTING -d 1.2.3.4 -j CT --zone 1
ip netns del foo
What happens is that when nf_ct_iterate_cleanup() is being called from
nf_conntrack_cleanup_net_list() for a provided netns, we always end up
with a net->ct.count > 0 and thus jump back to i_see_dead_people. We
don't get a soft-lockup as we still have a schedule() point, but the
serving CPU spins on 100% from that point onwards.
Since templates are normally allocated with nf_conntrack_alloc(), we
also bump net->ct.count. The issue why they are not yet nf_ct_put() is
because the per netns .exit() handler from x_tables (which would eventually
invoke xt_CT's xt_ct_tg_destroy() that drops reference on info->ct) is
called in the dependency chain at a *later* point in time than the per
netns .exit() handler for the connection tracker.
This is clearly a chicken'n'egg problem: after the connection tracker
.exit() handler, we've teared down all the connection tracking
infrastructure already, so rightfully, xt_ct_tg_destroy() cannot be
invoked at a later point in time during the netns cleanup, as that would
lead to a use-after-free. At the same time, we cannot make x_tables depend
on the connection tracker module, so that the xt_ct_tg_destroy() would
be invoked earlier in the cleanup chain."
Daniel confirms this has to do with the order in which modules are loaded or
having compiled nf_conntrack as modules while x_tables built-in. So we have no
guarantees regarding the order in which netns callbacks are executed.
Fix this by allocating the templates through kmalloc() from the respective
SYNPROXY and CT targets, so they don't depend on the conntrack kmem cache.
Then, release then via nf_ct_tmpl_free() from destroy_conntrack(). This branch
is marked as unlikely since conntrack templates are rarely allocated and only
from the configuration plane path.
Note that templates are not kept in any list to avoid further dependencies with
nf_conntrack anymore, thus, the tmpl larval list is removed.
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Tested-by: Daniel Borkmann <daniel@iogearbox.net>
2015-07-13 15:11:48 +02:00
|
|
|
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
|
|
|
|
nf_conntrack_get(&ct->ct_general);
|
2012-02-29 02:19:19 +01:00
|
|
|
out:
|
|
|
|
info->ct = ct;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err3:
|
netfilter: conntrack: use nf_ct_tmpl_free in CT/synproxy error paths
Commit 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack
templates") migrated templates to the new allocator api, but forgot to
update error paths for them in CT and synproxy to use nf_ct_tmpl_free()
instead of nf_conntrack_free().
Due to that, memory is being freed into the wrong kmemcache, but also
we drop the per net reference count of ct objects causing an imbalance.
In Brad's case, this leads to a wrap-around of net->ct.count and thus
lets __nf_conntrack_alloc() refuse to create a new ct object:
[ 10.340913] xt_addrtype: ipv6 does not support BROADCAST matching
[ 10.810168] nf_conntrack: table full, dropping packet
[ 11.917416] r8169 0000:07:00.0 eth0: link up
[ 11.917438] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
[ 12.815902] nf_conntrack: table full, dropping packet
[ 15.688561] nf_conntrack: table full, dropping packet
[ 15.689365] nf_conntrack: table full, dropping packet
[ 15.690169] nf_conntrack: table full, dropping packet
[ 15.690967] nf_conntrack: table full, dropping packet
[...]
With slab debugging, it also reports the wrong kmemcache (kmalloc-512 vs.
nf_conntrack_ffffffff81ce75c0) and reports poison overwrites, etc. Thus,
to fix the problem, export and use nf_ct_tmpl_free() instead.
Fixes: 0838aa7fcfcd ("netfilter: fix netns dependencies with conntrack templates")
Reported-by: Brad Jackson <bjackson0971@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2015-08-31 19:11:02 +02:00
|
|
|
nf_ct_tmpl_free(ct);
|
2012-02-29 02:19:19 +01:00
|
|
|
err2:
|
|
|
|
nf_ct_l3proto_module_put(par->family);
|
|
|
|
err1:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par)
|
2010-02-03 17:17:06 +01:00
|
|
|
{
|
|
|
|
struct xt_ct_target_info *info = par->targinfo;
|
2013-01-30 20:20:39 +01:00
|
|
|
struct xt_ct_target_info_v1 info_v1 = {
|
|
|
|
.flags = info->flags,
|
|
|
|
.zone = info->zone,
|
|
|
|
.ct_events = info->ct_events,
|
|
|
|
.exp_events = info->exp_events,
|
|
|
|
};
|
|
|
|
int ret;
|
2010-02-03 17:17:06 +01:00
|
|
|
|
2013-01-30 20:24:22 +01:00
|
|
|
if (info->flags & ~XT_CT_NOTRACK)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
memcpy(info_v1.helper, info->helper, sizeof(info->helper));
|
2010-02-03 17:17:06 +01:00
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
ret = xt_ct_tg_check(par, &info_v1);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
info->ct = info_v1.ct;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par)
|
|
|
|
{
|
2013-01-30 20:24:22 +01:00
|
|
|
struct xt_ct_target_info_v1 *info = par->targinfo;
|
|
|
|
|
|
|
|
if (info->flags & ~XT_CT_NOTRACK)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return xt_ct_tg_check(par, par->targinfo);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par)
|
|
|
|
{
|
|
|
|
struct xt_ct_target_info_v1 *info = par->targinfo;
|
|
|
|
|
|
|
|
if (info->flags & ~XT_CT_MASK)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
return xt_ct_tg_check(par, par->targinfo);
|
2010-02-03 17:17:06 +01:00
|
|
|
}
|
|
|
|
|
2012-08-28 02:53:14 +02:00
|
|
|
/*
 * Call the timeout "put" hook for the timeout extension attached to @ct.
 *
 * Compiles to an empty function unless CONFIG_NF_CONNTRACK_TIMEOUT is set.
 * The hook pointer is fetched with rcu_dereference() and invoked inside the
 * same RCU read-side section; nothing happens when no hook is installed or
 * when @ct carries no timeout extension.
 */
static void xt_ct_destroy_timeout(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
	typeof(nf_ct_timeout_put_hook) put_fn;
	struct nf_conn_timeout *ext;

	rcu_read_lock();

	put_fn = rcu_dereference(nf_ct_timeout_put_hook);
	/* Only look up the extension when there is a hook to call. */
	ext = put_fn ? nf_ct_timeout_find(ct) : NULL;
	if (ext)
		put_fn(ext->timeout);

	rcu_read_unlock();
#endif
}
|
|
|
|
|
2013-01-30 20:20:39 +01:00
|
|
|
static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
|
|
|
|
struct xt_ct_target_info_v1 *info)
|
2012-08-28 02:53:14 +02:00
|
|
|
{
|
|
|
|
struct nf_conn *ct = info->ct;
|
|
|
|
struct nf_conn_help *help;
|
|
|
|
|
2013-05-22 13:10:57 +02:00
|
|
|
if (ct && !nf_ct_is_untracked(ct)) {
|
2012-02-29 02:19:19 +01:00
|
|
|
help = nfct_help(ct);
|
|
|
|
if (help)
|
|
|
|
module_put(help->helper->me);
|
|
|
|
|
|
|
|
nf_ct_l3proto_module_put(par->family);
|
|
|
|
|
2012-08-28 02:53:14 +02:00
|
|
|
xt_ct_destroy_timeout(ct);
|
2013-05-22 13:10:57 +02:00
|
|
|
nf_ct_put(info->ct);
|
2012-02-29 02:19:19 +01:00
|
|
|
}
|
2013-01-30 20:20:39 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par)
|
|
|
|
{
|
|
|
|
struct xt_ct_target_info *info = par->targinfo;
|
|
|
|
struct xt_ct_target_info_v1 info_v1 = {
|
|
|
|
.flags = info->flags,
|
|
|
|
.zone = info->zone,
|
|
|
|
.ct_events = info->ct_events,
|
|
|
|
.exp_events = info->exp_events,
|
|
|
|
.ct = info->ct,
|
|
|
|
};
|
|
|
|
memcpy(info_v1.helper, info->helper, sizeof(info->helper));
|
|
|
|
|
|
|
|
xt_ct_tg_destroy(par, &info_v1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
|
|
|
|
{
|
|
|
|
xt_ct_tg_destroy(par, par->targinfo);
|
2012-02-29 02:19:19 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * The three registered revisions of the "CT" target.  All are restricted
 * to the "raw" table and registered for NFPROTO_UNSPEC.
 */
static struct xt_target xt_ct_tg_reg[] __read_mostly = {
	{
		/* revision 0: v0 target data layout */
		.name		= "CT",
		.family		= NFPROTO_UNSPEC,
		.revision	= 0,
		.targetsize	= sizeof(struct xt_ct_target_info),
		.checkentry	= xt_ct_tg_check_v0,
		.destroy	= xt_ct_tg_destroy_v0,
		.target		= xt_ct_target_v0,
		.table		= "raw",
		.me		= THIS_MODULE,
	},
	{
		/* revision 1: v1 target data layout, XT_CT_NOTRACK flag only */
		.name		= "CT",
		.family		= NFPROTO_UNSPEC,
		.revision	= 1,
		.targetsize	= sizeof(struct xt_ct_target_info_v1),
		.checkentry	= xt_ct_tg_check_v1,
		.destroy	= xt_ct_tg_destroy_v1,
		.target		= xt_ct_target_v1,
		.table		= "raw",
		.me		= THIS_MODULE,
	},
	{
		/* revision 2: v1 target data layout, flags within XT_CT_MASK */
		.name		= "CT",
		.family		= NFPROTO_UNSPEC,
		.revision	= 2,
		.targetsize	= sizeof(struct xt_ct_target_info_v1),
		.checkentry	= xt_ct_tg_check_v2,
		.destroy	= xt_ct_tg_destroy_v1,
		.target		= xt_ct_target_v1,
		.table		= "raw",
		.me		= THIS_MODULE,
	},
};
|
|
|
|
|
2012-12-20 02:54:51 +01:00
|
|
|
static unsigned int
|
|
|
|
notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
|
|
|
|
{
|
|
|
|
/* Previously seen (loopback)? Ignore. */
|
|
|
|
if (skb->nfct != NULL)
|
|
|
|
return XT_CONTINUE;
|
|
|
|
|
|
|
|
skb->nfct = &nf_ct_untracked_get()->ct_general;
|
|
|
|
skb->nfctinfo = IP_CT_NEW;
|
|
|
|
nf_conntrack_get(skb->nfct);
|
|
|
|
|
|
|
|
return XT_CONTINUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int notrack_chk(const struct xt_tgchk_param *par)
|
|
|
|
{
|
|
|
|
if (!par->net->xt.notrack_deprecated_warning) {
|
|
|
|
pr_info("netfilter: NOTRACK target is deprecated, "
|
|
|
|
"use CT instead or upgrade iptables\n");
|
|
|
|
par->net->xt.notrack_deprecated_warning = true;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Registration record of the deprecated "NOTRACK" target: revision 0,
 * NFPROTO_UNSPEC, restricted to the "raw" table.
 */
static struct xt_target notrack_tg_reg __read_mostly = {
	.name		= "NOTRACK",
	.family		= NFPROTO_UNSPEC,
	.revision	= 0,
	.checkentry	= notrack_chk,
	.target		= notrack_tg,
	.table		= "raw",
	.me		= THIS_MODULE,
};
|
|
|
|
|
2010-02-03 17:17:06 +01:00
|
|
|
/*
 * Module init: register the NOTRACK target first, then the CT target
 * revisions.
 *
 * If registering the CT revisions fails, the NOTRACK registration is rolled
 * back so that nothing stays registered.  Returns 0 on success or the
 * negative error code of the registration call that failed.
 */
static int __init xt_ct_tg_init(void)
{
	int ret;

	ret = xt_register_target(&notrack_tg_reg);
	if (ret < 0)
		return ret;

	ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
	if (ret < 0) {
		/* Undo the NOTRACK registration done above. */
		xt_unregister_target(&notrack_tg_reg);
		return ret;
	}

	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Module exit: unregister the CT target revisions and then the NOTRACK
 * target.
 */
static void __exit xt_ct_tg_exit(void)
{
	xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
	xt_unregister_target(&notrack_tg_reg);
}
|
|
|
|
|
|
|
|
module_init(xt_ct_tg_init);
|
|
|
|
module_exit(xt_ct_tg_exit);
|
|
|
|
|
|
|
|
MODULE_LICENSE("GPL");
|
|
|
|
MODULE_DESCRIPTION("Xtables: connection tracking target");
|
|
|
|
MODULE_ALIAS("ipt_CT");
|
|
|
|
MODULE_ALIAS("ip6t_CT");
|
2012-12-20 02:54:51 +01:00
|
|
|
MODULE_ALIAS("ipt_NOTRACK");
|
|
|
|
MODULE_ALIAS("ip6t_NOTRACK");
|