diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 93150ecf3ea4..5d10ae364b5e 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -56,6 +56,18 @@ static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) return is_a_nulls(h->first); } +static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *first = h->first; + + n->next = first; + n->pprev = &h->first; + h->first = n; + if (!is_a_nulls(first)) + first->pprev = &n->next; +} + static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; @@ -65,6 +77,12 @@ static inline void __hlist_nulls_del(struct hlist_nulls_node *n) next->pprev = pprev; } +static inline void hlist_nulls_del(struct hlist_nulls_node *n) +{ + __hlist_nulls_del(n); + n->pprev = LIST_POISON2; +} + /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index ecc79f959076..a632689b61b4 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -201,6 +201,8 @@ extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple); extern void nf_conntrack_hash_insert(struct nf_conn *ct); +extern void nf_ct_delete_from_lists(struct nf_conn *ct); +extern void nf_ct_insert_dying_list(struct nf_conn *ct); extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 1afb907e015a..4f20d58e2ab7 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -6,61 +6,54 @@ #define _NF_CONNTRACK_ECACHE_H #include -#include #include #include +#include +#include +#include -/* Connection tracking event bits */ +/* Connection tracking event types */ enum ip_conntrack_events { - /* New conntrack */ - IPCT_NEW_BIT = 0, - IPCT_NEW = (1 << IPCT_NEW_BIT), - - /* Expected connection */ - IPCT_RELATED_BIT = 1, - IPCT_RELATED = (1 << IPCT_RELATED_BIT), - - /* Destroyed conntrack */ - IPCT_DESTROY_BIT = 2, - IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), - - /* Status has changed */ - IPCT_STATUS_BIT = 3, - IPCT_STATUS = (1 << IPCT_STATUS_BIT), - - /* Update of protocol info */ - IPCT_PROTOINFO_BIT = 4, - IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), - - /* New helper for conntrack */ - IPCT_HELPER_BIT = 5, - IPCT_HELPER = (1 << IPCT_HELPER_BIT), - - /* Mark is set */ - IPCT_MARK_BIT = 6, - IPCT_MARK = (1 << IPCT_MARK_BIT), - - /* NAT sequence adjustment */ - IPCT_NATSEQADJ_BIT = 7, - IPCT_NATSEQADJ = (1 << IPCT_NATSEQADJ_BIT), - - /* Secmark is set */ - IPCT_SECMARK_BIT = 8, - IPCT_SECMARK = (1 << IPCT_SECMARK_BIT), + IPCT_NEW = 0, /* new conntrack */ + IPCT_RELATED = 1, /* related conntrack */ + IPCT_DESTROY = 2, /* destroyed conntrack */ + IPCT_STATUS = 3, /* status has changed */ + IPCT_PROTOINFO = 4, /* protocol information has changed */ + IPCT_HELPER = 5, /* new helper has been set */ + IPCT_MARK = 6, /* new mark has been set */ + IPCT_NATSEQADJ = 7, /* NAT is doing sequence adjustment */ + IPCT_SECMARK = 8, /* new security mark has been set */ }; enum ip_conntrack_expect_events { - IPEXP_NEW_BIT = 0, - IPEXP_NEW = (1 << IPEXP_NEW_BIT), + IPEXP_NEW = 0, /* new expectation */ +}; + +struct nf_conntrack_ecache { + unsigned long cache; /* bitops want long */ + unsigned long missed; /* missed events */ + u32 pid; /* netlink pid of destroyer */ +}; + +static inline struct nf_conntrack_ecache * +nf_ct_ecache_find(const struct nf_conn *ct) +{ + return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE); +} + +static inline struct nf_conntrack_ecache * +nf_ct_ecache_ext_add(struct nf_conn *ct, gfp_t gfp) +{ + struct net *net = nf_ct_net(ct); + + if (!net->ct.sysctl_events) + return NULL; + + return nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); }; #ifdef CONFIG_NF_CONNTRACK_EVENTS -struct nf_conntrack_ecache { - struct nf_conn *ct; - unsigned int events; -}; - /* This structure is passed to event handler */ struct nf_ct_event { struct nf_conn *ct; @@ -76,53 +69,88 @@ extern struct nf_ct_event_notifier *nf_conntrack_event_cb; extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb); extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb); -extern void nf_ct_deliver_cached_events(const struct nf_conn *ct); -extern void __nf_ct_event_cache_init(struct nf_conn *ct); -extern void nf_ct_event_cache_flush(struct net *net); +extern void nf_ct_deliver_cached_events(struct nf_conn *ct); static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache; + struct nf_conntrack_ecache *e; - local_bh_disable(); - ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id()); - if (ct != ecache->ct) - __nf_ct_event_cache_init(ct); - ecache->events |= event; - local_bh_enable(); + if (nf_conntrack_event_cb == NULL) + return; + + e = nf_ct_ecache_find(ct); + if (e == NULL) + return; + + set_bit(event, &e->cache); } -static inline void -nf_conntrack_event_report(enum ip_conntrack_events event, - struct nf_conn *ct, - u32 pid, - int report) +static inline int +nf_conntrack_eventmask_report(unsigned int eventmask, + struct nf_conn *ct, + u32 pid, + int report) { + int ret = 0; + struct net *net = nf_ct_net(ct); struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(nf_conntrack_event_cb); if (notify == NULL) goto out_unlock; + if (!net->ct.sysctl_events) + goto out_unlock; + + e = nf_ct_ecache_find(ct); + if (e == NULL) + goto out_unlock; + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { struct nf_ct_event item = { .ct = ct, - .pid = pid, + .pid = e->pid ? e->pid : pid, .report = report }; - notify->fcn(event, &item); + /* This is a resent of a destroy event? If so, skip missed */ + unsigned long missed = e->pid ? 0 : e->missed; + + ret = notify->fcn(eventmask | missed, &item); + if (unlikely(ret < 0 || missed)) { + spin_lock_bh(&ct->lock); + if (ret < 0) { + /* This is a destroy event that has been + * triggered by a process, we store the PID + * to include it in the retransmission. */ + if (eventmask & (1 << IPCT_DESTROY) && + e->pid == 0 && pid != 0) + e->pid = pid; + else + e->missed |= eventmask; + } else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + } } out_unlock: rcu_read_unlock(); + return ret; } -static inline void +static inline int +nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, + u32 pid, int report) +{ + return nf_conntrack_eventmask_report(1 << event, ct, pid, report); +} + +static inline int nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { - nf_conntrack_event_report(event, ct, 0, 0); + return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); } struct nf_exp_event { @@ -145,6 +173,7 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event, u32 pid, int report) { + struct net *net = nf_ct_exp_net(exp); struct nf_exp_event_notifier *notify; rcu_read_lock(); @@ -152,13 +181,16 @@ nf_ct_expect_event_report(enum ip_conntrack_expect_events event, if (notify == NULL) goto out_unlock; + if (!net->ct.sysctl_events) + goto out_unlock; + { struct nf_exp_event item = { .exp = exp, .pid = pid, .report = report }; - notify->fcn(event, &item); + notify->fcn(1 << event, &item); } out_unlock: rcu_read_unlock(); @@ -178,12 +210,16 @@ extern void nf_conntrack_ecache_fini(struct net *net); static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) {} -static inline void nf_conntrack_event(enum ip_conntrack_events event, - struct nf_conn *ct) {} -static inline void nf_conntrack_event_report(enum ip_conntrack_events event, - struct nf_conn *ct, - u32 pid, - int report) {} +static inline int nf_conntrack_eventmask_report(unsigned int eventmask, + struct nf_conn *ct, + u32 pid, + int report) { return 0; } +static inline int nf_conntrack_event(enum ip_conntrack_events event, + struct nf_conn *ct) { return 0; } +static inline int nf_conntrack_event_report(enum ip_conntrack_events event, + struct nf_conn *ct, + u32 pid, + int report) { return 0; } static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) {} static inline void nf_ct_expect_event(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp) {} @@ -191,7 +227,6 @@ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, struct nf_conntrack_expect *exp, u32 pid, int report) {} -static inline void nf_ct_event_cache_flush(struct net *net) {} static inline int nf_conntrack_ecache_init(struct net *net) { diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index da8ee52613a5..7f8fc5d123c5 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -8,12 +8,14 @@ enum nf_ct_ext_id NF_CT_EXT_HELPER, NF_CT_EXT_NAT, NF_CT_EXT_ACCT, + NF_CT_EXT_ECACHE, NF_CT_EXT_NUM, }; #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter +#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index ee2a4b369a04..1b7068000927 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -50,6 +50,8 @@ extern struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags); +extern void nf_ct_helper_destroy(struct nf_conn *ct); + static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) { return nf_ct_ext_find(ct, NF_CT_EXT_HELPER); diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 9dc58402bc09..ba1ba0c5efd1 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -14,16 +14,17 @@ struct netns_ct { struct hlist_nulls_head *hash; struct hlist_head *expect_hash; struct hlist_nulls_head unconfirmed; + struct hlist_nulls_head dying; struct ip_conntrack_stat *stat; -#ifdef CONFIG_NF_CONNTRACK_EVENTS - struct nf_conntrack_ecache *ecache; -#endif + int sysctl_events; + unsigned int sysctl_events_retry_timeout; int sysctl_acct; int sysctl_checksum; unsigned int sysctl_log_invalid; /* Log invalid packets */ #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; struct ctl_table_header *acct_sysctl_header; + struct ctl_table_header *event_sysctl_header; #endif int hash_vmalloc; int expect_vmalloc; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index edf95695e0aa..5f72b94b4918 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -182,10 +183,6 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); - if (!test_bit(IPS_DYING_BIT, &ct->status)) - nf_conntrack_event(IPCT_DESTROY, ct); - set_bit(IPS_DYING_BIT, &ct->status); - /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ @@ -219,27 +216,70 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_conntrack_free(ct); } -static void death_by_timeout(unsigned long ul_conntrack) +void nf_ct_delete_from_lists(struct nf_conn *ct) { - struct nf_conn *ct = (void *)ul_conntrack; struct net *net = nf_ct_net(ct); - struct nf_conn_help *help = nfct_help(ct); - struct nf_conntrack_helper *helper; - - if (help) { - rcu_read_lock(); - helper = rcu_dereference(help->helper); - if (helper && helper->destroy) - helper->destroy(ct); - rcu_read_unlock(); - } + nf_ct_helper_destroy(ct); spin_lock_bh(&nf_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ NF_CT_STAT_INC(net, delete_list); clean_from_lists(ct); spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); + +static void death_by_event(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); + + if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { + /* bad luck, let's retry again */ + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); + return; + } + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + spin_lock(&nf_conntrack_lock); + hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&nf_conntrack_lock); + nf_ct_put(ct); +} + +void nf_ct_insert_dying_list(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + /* add this conntrack to the dying list */ + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.dying); + spin_unlock_bh(&nf_conntrack_lock); + /* set a new timer to retry event delivery */ + setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); +} +EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + + if (!test_bit(IPS_DYING_BIT, &ct->status) && + unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { + /* destroy event was not delivered */ + nf_ct_delete_from_lists(ct); + nf_ct_insert_dying_list(ct); + return; + } + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_delete_from_lists(ct); nf_ct_put(ct); } @@ -577,6 +617,7 @@ init_conntrack(struct net *net, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(net, tuple); @@ -807,8 +848,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); - spin_lock_bh(&nf_conntrack_lock); - /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) goto acct; @@ -822,11 +861,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, /* Only update the timeout if the new timeout is at least HZ jiffies from the old timeout. Need del_timer for race avoidance (may already be dying). */ - if (newtime - ct->timeout.expires >= HZ - && del_timer(&ct->timeout)) { - ct->timeout.expires = newtime; - add_timer(&ct->timeout); - } + if (newtime - ct->timeout.expires >= HZ) + mod_timer_pending(&ct->timeout, newtime); } acct: @@ -835,13 +871,13 @@ acct: acct = nf_conn_acct_find(ct); if (acct) { + spin_lock_bh(&ct->lock); acct[CTINFO2DIR(ctinfo)].packets++; acct[CTINFO2DIR(ctinfo)].bytes += skb->len - skb_network_offset(skb); + spin_unlock_bh(&ct->lock); } } - - spin_unlock_bh(&nf_conntrack_lock); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -853,14 +889,14 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, if (do_acct) { struct nf_conn_counter *acct; - spin_lock_bh(&nf_conntrack_lock); acct = nf_conn_acct_find(ct); if (acct) { + spin_lock_bh(&ct->lock); acct[CTINFO2DIR(ctinfo)].packets++; acct[CTINFO2DIR(ctinfo)].bytes += skb->len - skb_network_offset(skb); + spin_unlock_bh(&ct->lock); } - spin_unlock_bh(&nf_conntrack_lock); } if (del_timer(&ct->timeout)) { @@ -994,11 +1030,13 @@ static int kill_report(struct nf_conn *i, void *data) { struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; - /* get_next_corpse sets the dying bit for us */ - nf_conntrack_event_report(IPCT_DESTROY, - i, - fr->pid, - fr->report); + /* If we fail to deliver the event, death_by_timeout() will retry */ + if (nf_conntrack_event_report(IPCT_DESTROY, i, + fr->pid, fr->report) < 0) + return 1; + + /* Avoid the delivery of the destroy event in death_by_timeout(). */ + set_bit(IPS_DYING_BIT, &i->status); return 1; } @@ -1027,6 +1065,21 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report) } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); +static void nf_ct_release_dying_list(void) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct hlist_nulls_node *n; + + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&nf_conntrack_lock); +} + static void nf_conntrack_cleanup_init_net(void) { nf_conntrack_helper_fini(); @@ -1036,10 +1089,9 @@ static void nf_conntrack_cleanup_init_net(void) static void nf_conntrack_cleanup_net(struct net *net) { - nf_ct_event_cache_flush(net); - nf_conntrack_ecache_fini(net); i_see_dead_people: nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_release_dying_list(); if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; @@ -1050,6 +1102,7 @@ static void nf_conntrack_cleanup_net(struct net *net) nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); + nf_conntrack_ecache_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); free_percpu(net->ct.stat); @@ -1220,14 +1273,12 @@ static int nf_conntrack_init_net(struct net *net) atomic_set(&net->ct.count, 0); INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0); + INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0); net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) { ret = -ENOMEM; goto err_stat; } - ret = nf_conntrack_ecache_init(net); - if (ret < 0) - goto err_ecache; net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, &net->ct.hash_vmalloc, 1); if (!net->ct.hash) { @@ -1241,6 +1292,9 @@ static int nf_conntrack_init_net(struct net *net) ret = nf_conntrack_acct_init(net); if (ret < 0) goto err_acct; + ret = nf_conntrack_ecache_init(net); + if (ret < 0) + goto err_ecache; /* Set up fake conntrack: - to never be deleted, not in any hashes */ @@ -1253,14 +1307,14 @@ static int nf_conntrack_init_net(struct net *net) return 0; +err_ecache: + nf_conntrack_acct_fini(net); err_acct: nf_conntrack_expect_fini(net); err_expect: nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); err_hash: - nf_conntrack_ecache_fini(net); -err_ecache: free_percpu(net->ct.stat); err_stat: return ret; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 5516b3e64b43..aee560b4768d 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -21,6 +21,7 @@ #include #include +#include static DEFINE_MUTEX(nf_ct_ecache_mutex); @@ -32,94 +33,51 @@ EXPORT_SYMBOL_GPL(nf_expect_event_cb); /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ -static inline void -__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) +void nf_ct_deliver_cached_events(struct nf_conn *ct) { + unsigned long events; struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; rcu_read_lock(); notify = rcu_dereference(nf_conntrack_event_cb); if (notify == NULL) goto out_unlock; - if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) - && ecache->events) { + e = nf_ct_ecache_find(ct); + if (e == NULL) + goto out_unlock; + + events = xchg(&e->cache, 0); + + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) { struct nf_ct_event item = { - .ct = ecache->ct, + .ct = ct, .pid = 0, .report = 0 }; + int ret; + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. */ + unsigned long missed = e->missed; - notify->fcn(ecache->events, &item); + ret = notify->fcn(events | missed, &item); + if (unlikely(ret < 0 || missed)) { + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + } } - ecache->events = 0; - nf_ct_put(ecache->ct); - ecache->ct = NULL; - out_unlock: rcu_read_unlock(); } - -/* Deliver all cached events for a particular conntrack. This is called - * by code prior to async packet handling for freeing the skb */ -void nf_ct_deliver_cached_events(const struct nf_conn *ct) -{ - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache; - - local_bh_disable(); - ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id()); - if (ecache->ct == ct) - __nf_ct_deliver_cached_events(ecache); - local_bh_enable(); -} EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); -/* Deliver cached events for old pending events, if current conntrack != old */ -void __nf_ct_event_cache_init(struct nf_conn *ct) -{ - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache; - - /* take care of delivering potentially old events */ - ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id()); - BUG_ON(ecache->ct == ct); - if (ecache->ct) - __nf_ct_deliver_cached_events(ecache); - /* initialize for this conntrack/packet */ - ecache->ct = ct; - nf_conntrack_get(&ct->ct_general); -} -EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); - -/* flush the event cache - touches other CPU's data and must not be called - * while packets are still passing through the code */ -void nf_ct_event_cache_flush(struct net *net) -{ - struct nf_conntrack_ecache *ecache; - int cpu; - - for_each_possible_cpu(cpu) { - ecache = per_cpu_ptr(net->ct.ecache, cpu); - if (ecache->ct) - nf_ct_put(ecache->ct); - } -} - -int nf_conntrack_ecache_init(struct net *net) -{ - net->ct.ecache = alloc_percpu(struct nf_conntrack_ecache); - if (!net->ct.ecache) - return -ENOMEM; - return 0; -} - -void nf_conntrack_ecache_fini(struct net *net) -{ - free_percpu(net->ct.ecache); -} - int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new) { int ret = 0; @@ -185,3 +143,118 @@ void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new) mutex_unlock(&nf_ct_ecache_mutex); } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); + +#define NF_CT_EVENTS_DEFAULT 1 +static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; +static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; + +#ifdef CONFIG_SYSCTL +static struct ctl_table event_sysctl_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nf_conntrack_events", + .data = &init_net.ct.sysctl_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nf_conntrack_events_retry_timeout", + .data = &init_net.ct.sysctl_events_retry_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type event_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_ecache), + .align = __alignof__(struct nf_conntrack_ecache), + .id = NF_CT_EXT_ECACHE, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_events; + table[1].data = &net->ct.sysctl_events_retry_timeout; + + net->ct.event_sysctl_header = + register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.event_sysctl_header) { + printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.event_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.event_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +int nf_conntrack_ecache_init(struct net *net) +{ + int ret; + + net->ct.sysctl_events = nf_ct_events; + net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&event_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_event: Unable to register " + "event extension.\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_event_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&event_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_ecache_fini(struct net *net) +{ + nf_conntrack_event_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&event_extend); +} diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 0fa5a422959f..65c2a7bc3afc 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -136,6 +136,20 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i, return 0; } +void nf_ct_helper_destroy(struct nf_conn *ct) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; + + if (help) { + rcu_read_lock(); + helper = rcu_dereference(help->helper); + if (helper && helper->destroy) + helper->destroy(ct); + rcu_read_unlock(); + } +} + int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { unsigned int h = helper_hash(&me->tuple); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4e503ada5728..49479d194570 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -463,15 +463,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) struct sk_buff *skb; unsigned int type; unsigned int flags = 0, group; + int err; /* ignore our fake conntrack entry */ if (ct == &nf_conntrack_untracked) return 0; - if (events & IPCT_DESTROY) { + if (events & (1 << IPCT_DESTROY)) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; - } else if (events & (IPCT_NEW | IPCT_RELATED)) { + } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { type = IPCTNL_MSG_CT_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_NEW; @@ -519,7 +520,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; - if (events & IPCT_DESTROY) { + if (events & (1 << IPCT_DESTROY)) { if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) goto nla_put_failure; @@ -527,38 +528,41 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (ctnetlink_dump_timeout(skb, ct) < 0) goto nla_put_failure; - if (events & IPCT_PROTOINFO + if (events & (1 << IPCT_PROTOINFO) && ctnetlink_dump_protoinfo(skb, ct) < 0) goto nla_put_failure; - if ((events & IPCT_HELPER || nfct_help(ct)) + if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) && ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_SECMARK - if ((events & IPCT_SECMARK || ct->secmark) + if ((events & (1 << IPCT_SECMARK) || ct->secmark) && ctnetlink_dump_secmark(skb, ct) < 0) goto nla_put_failure; #endif - if (events & IPCT_RELATED && + if (events & (1 << IPCT_RELATED) && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; - if (events & IPCT_NATSEQADJ && + if (events & (1 << IPCT_NATSEQADJ) && ctnetlink_dump_nat_seq_adj(skb, ct) < 0) goto nla_put_failure; } #ifdef CONFIG_NF_CONNTRACK_MARK - if ((events & IPCT_MARK || ct->mark) + if ((events & (1 << IPCT_MARK) || ct->mark) && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; #endif rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC); + err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC); + if (err == -ENOBUFS || err == -EAGAIN) + return -ENOBUFS; + return 0; nla_put_failure: @@ -798,10 +802,15 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, } } - nf_conntrack_event_report(IPCT_DESTROY, - ct, - NETLINK_CB(skb).pid, - nlmsg_report(nlh)); + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)) < 0) { + nf_ct_delete_from_lists(ct); + /* we failed to report the event, try later */ + nf_ct_insert_dying_list(ct); + nf_ct_put(ct); + return 0; + } /* death_by_timeout would report the event again */ set_bit(IPS_DYING_BIT, &ct->status); @@ -1253,6 +1262,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[], } nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, GFP_ATOMIC); #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) @@ -1340,13 +1350,13 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, else events = IPCT_NEW; - nf_conntrack_event_report(IPCT_STATUS | - IPCT_HELPER | - IPCT_PROTOINFO | - IPCT_NATSEQADJ | - IPCT_MARK | events, - ct, NETLINK_CB(skb).pid, - nlmsg_report(nlh)); + nf_conntrack_eventmask_report((1 << IPCT_STATUS) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_NATSEQADJ) | + (1 << IPCT_MARK) | events, + ct, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); nf_ct_put(ct); } else spin_unlock_bh(&nf_conntrack_lock); @@ -1365,13 +1375,13 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err == 0) { nf_conntrack_get(&ct->ct_general); spin_unlock_bh(&nf_conntrack_lock); - nf_conntrack_event_report(IPCT_STATUS | - IPCT_HELPER | - IPCT_PROTOINFO | - IPCT_NATSEQADJ | - IPCT_MARK, - ct, NETLINK_CB(skb).pid, - nlmsg_report(nlh)); + nf_conntrack_eventmask_report((1 << IPCT_STATUS) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_NATSEQADJ) | + (1 << IPCT_MARK), + ct, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); nf_ct_put(ct); } else spin_unlock_bh(&nf_conntrack_lock); @@ -1515,7 +1525,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) unsigned int type; int flags = 0; - if (events & IPEXP_NEW) { + if (events & (1 << IPEXP_NEW)) { type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; } else diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index beb37311e1a5..2fefe147750a 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -248,14 +248,14 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp, rcu_assign_pointer(nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { - rcu_read_lock(); - logger = rcu_dereference(nf_loggers[tindex]); + mutex_lock(&nf_log_mutex); + logger = nf_loggers[tindex]; if (!logger) table->data = "NONE"; else table->data = logger->name; r = proc_dostring(table, write, filp, buffer, lenp, ppos); - rcu_read_unlock(); + mutex_unlock(&nf_log_mutex); } return r; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 46dba5f043d5..025d1a0af78b 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -364,14 +364,14 @@ int xt_check_match(struct xt_mtchk_param *par, * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. */ - printk("%s_tables: %s match: invalid size %Zu != %u\n", + pr_err("%s_tables: %s match: invalid size %Zu != %u\n", xt_prefix[par->family], par->match->name, XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { - printk("%s_tables: %s match: only valid in %s table, not %s\n", + pr_err("%s_tables: %s match: only valid in %s table, not %s\n", xt_prefix[par->family], par->match->name, par->match->table, par->table); return -EINVAL; @@ -379,7 +379,7 @@ int xt_check_match(struct xt_mtchk_param *par, if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { char used[64], allow[64]; - printk("%s_tables: %s match: used from hooks %s, but only " + pr_err("%s_tables: %s match: used from hooks %s, but only " "valid from %s\n", xt_prefix[par->family], par->match->name, textify_hooks(used, sizeof(used), par->hook_mask), @@ -387,7 +387,7 @@ int xt_check_match(struct xt_mtchk_param *par, return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { - printk("%s_tables: %s match: only valid for protocol %u\n", + pr_err("%s_tables: %s match: only valid for protocol %u\n", xt_prefix[par->family], par->match->name, par->match->proto); return -EINVAL; @@ -514,14 +514,14 @@ int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { if (XT_ALIGN(par->target->targetsize) != size) { - printk("%s_tables: %s target: invalid size %Zu != %u\n", + pr_err("%s_tables: %s target: invalid size %Zu != %u\n", xt_prefix[par->family], par->target->name, XT_ALIGN(par->target->targetsize), size); return -EINVAL; } if (par->target->table != NULL && strcmp(par->target->table, par->table) != 0) { - printk("%s_tables: %s target: only valid in %s table, not %s\n", + pr_err("%s_tables: %s target: only valid in %s table, not %s\n", xt_prefix[par->family], par->target->name, par->target->table, par->table); return -EINVAL; @@ -529,7 +529,7 @@ int xt_check_target(struct xt_tgchk_param *par, if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { char used[64], allow[64]; - printk("%s_tables: %s target: used from hooks %s, but only " + pr_err("%s_tables: %s target: used from hooks %s, but only " "usable from %s\n", xt_prefix[par->family], par->target->name, textify_hooks(used, sizeof(used), par->hook_mask), @@ -537,7 +537,7 @@ int xt_check_target(struct xt_tgchk_param *par, return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { - printk("%s_tables: %s target: only valid for protocol %u\n", + pr_err("%s_tables: %s target: only valid for protocol %u\n", xt_prefix[par->family], par->target->name, par->target->proto); return -EINVAL;