From ffb733c65000ee701294f7b80c4eca2a5f335637 Mon Sep 17 00:00:00 2001 From: "paul.moore@hp.com" Date: Wed, 4 Oct 2006 11:46:31 -0400 Subject: [PATCH 01/19] NetLabel: fix a cache race condition Testing revealed a problem with the NetLabel cache where a cached entry could be freed while in use by the LSM layer causing an oops and other problems. This patch fixes that problem by introducing a reference counter to the cache entry so that it is only freed when it is no longer in use. Signed-off-by: Paul Moore Signed-off-by: James Morris --- include/net/netlabel.h | 62 ++++++++++++++++++++++++++-------- net/ipv4/cipso_ipv4.c | 18 +++++----- net/netlabel/netlabel_kapi.c | 2 +- security/selinux/ss/services.c | 37 +++++++++++--------- 4 files changed, 79 insertions(+), 40 deletions(-) diff --git a/include/net/netlabel.h b/include/net/netlabel.h index c63a58058e21..113337c27955 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -34,6 +34,7 @@ #include #include #include +#include /* * NetLabel - A management interface for maintaining network packet label @@ -106,6 +107,7 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); /* LSM security attributes */ struct netlbl_lsm_cache { + atomic_t refcount; void (*free) (const void *data); void *data; }; @@ -117,7 +119,7 @@ struct netlbl_lsm_secattr { unsigned char *mls_cat; size_t mls_cat_len; - struct netlbl_lsm_cache cache; + struct netlbl_lsm_cache *cache; }; /* @@ -125,6 +127,43 @@ struct netlbl_lsm_secattr { */ +/** + * netlbl_secattr_cache_alloc - Allocate and initialize a secattr cache + * @flags: the memory allocation flags + * + * Description: + * Allocate and initialize a netlbl_lsm_cache structure. Returns a pointer + * on success, NULL on failure. + * + */ +static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(int flags) +{ + struct netlbl_lsm_cache *cache; + + cache = kzalloc(sizeof(*cache), flags); + if (cache) + atomic_set(&cache->refcount, 1); + return cache; +} + +/** + * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct + * @cache: the struct to free + * + * Description: + * Frees @secattr including all of the internal buffers. + * + */ +static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache) +{ + if (!atomic_dec_and_test(&cache->refcount)) + return; + + if (cache->free) + cache->free(cache->data); + kfree(cache); +} + /** * netlbl_secattr_init - Initialize a netlbl_lsm_secattr struct * @secattr: the struct to initialize @@ -143,20 +182,16 @@ static inline int netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) /** * netlbl_secattr_destroy - Clears a netlbl_lsm_secattr struct * @secattr: the struct to clear - * @clear_cache: cache clear flag * * Description: * Destroys the @secattr struct, including freeing all of the internal buffers. - * If @clear_cache is true then free the cache fields, otherwise leave them - * intact. The struct must be reset with a call to netlbl_secattr_init() - * before reuse. + * The struct must be reset with a call to netlbl_secattr_init() before reuse. * */ -static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr, - u32 clear_cache) +static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) { - if (clear_cache && secattr->cache.data != NULL && secattr->cache.free) - secattr->cache.free(secattr->cache.data); + if (secattr->cache) + netlbl_secattr_cache_free(secattr->cache); kfree(secattr->domain); kfree(secattr->mls_cat); } @@ -178,17 +213,14 @@ static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(int flags) /** * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct * @secattr: the struct to free - * @clear_cache: cache clear flag * * Description: - * Frees @secattr including all of the internal buffers. If @clear_cache is - * true then free the cache fields, otherwise leave them intact. + * Frees @secattr including all of the internal buffers. * */ -static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr, - u32 clear_cache) +static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr) { - netlbl_secattr_destroy(secattr, clear_cache); + netlbl_secattr_destroy(secattr); kfree(secattr); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index a8e2e879a647..bde8ccaa1531 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -43,6 +43,7 @@ #include #include #include +#include #include struct cipso_v4_domhsh_entry { @@ -79,7 +80,7 @@ struct cipso_v4_map_cache_entry { unsigned char *key; size_t key_len; - struct netlbl_lsm_cache lsm_data; + struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; @@ -188,13 +189,14 @@ static void cipso_v4_doi_domhsh_free(struct rcu_head *entry) * @entry: the entry to free * * Description: - * This function frees the memory associated with a cache entry. + * This function frees the memory associated with a cache entry including the + * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) { - if (entry->lsm_data.free) - entry->lsm_data.free(entry->lsm_data.data); + if (entry->lsm_data) + netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } @@ -315,8 +317,8 @@ static int cipso_v4_cache_check(const unsigned char *key, entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; - secattr->cache.free = entry->lsm_data.free; - secattr->cache.data = entry->lsm_data.data; + atomic_inc(&entry->lsm_data->refcount); + secattr->cache = entry->lsm_data; if (prev_entry == NULL) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; @@ -383,8 +385,8 @@ int cipso_v4_cache_add(const struct sk_buff *skb, memcpy(entry->key, cipso_ptr, cipso_ptr_len); entry->key_len = cipso_ptr_len; entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); - entry->lsm_data.free = secattr->cache.free; - entry->lsm_data.data = secattr->cache.data; + atomic_inc(&secattr->cache->refcount); + entry->lsm_data = secattr->cache; bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETBITS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 54fb7de3c2b1..ff971103fd0c 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -200,7 +200,7 @@ void netlbl_cache_invalidate(void) int netlbl_cache_add(const struct sk_buff *skb, const struct netlbl_lsm_secattr *secattr) { - if (secattr->cache.data == NULL) + if (secattr->cache == NULL) return -ENOMSG; if (CIPSO_V4_OPTEXIST(skb)) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 0c219a1b3243..bb2d2bc869ba 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2172,7 +2172,12 @@ struct netlbl_cache { */ static void selinux_netlbl_cache_free(const void *data) { - struct netlbl_cache *cache = NETLBL_CACHE(data); + struct netlbl_cache *cache; + + if (data == NULL) + return; + + cache = NETLBL_CACHE(data); switch (cache->type) { case NETLBL_CACHE_T_MLS: ebitmap_destroy(&cache->data.mls_label.level[0].cat); @@ -2197,17 +2202,20 @@ static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) struct netlbl_lsm_secattr secattr; netlbl_secattr_init(&secattr); + secattr.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); + if (secattr.cache == NULL) + goto netlbl_cache_add_return; cache = kzalloc(sizeof(*cache), GFP_ATOMIC); if (cache == NULL) - goto netlbl_cache_add_failure; - secattr.cache.free = selinux_netlbl_cache_free; - secattr.cache.data = (void *)cache; + goto netlbl_cache_add_return; + secattr.cache->free = selinux_netlbl_cache_free; + secattr.cache->data = (void *)cache; cache->type = NETLBL_CACHE_T_MLS; if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, &ctx->range.level[0].cat) != 0) - goto netlbl_cache_add_failure; + goto netlbl_cache_add_return; cache->data.mls_label.level[1].cat.highbit = cache->data.mls_label.level[0].cat.highbit; cache->data.mls_label.level[1].cat.node = @@ -2215,13 +2223,10 @@ static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx) cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; - if (netlbl_cache_add(skb, &secattr) != 0) - goto netlbl_cache_add_failure; + netlbl_cache_add(skb, &secattr); - return; - -netlbl_cache_add_failure: - netlbl_secattr_destroy(&secattr, 1); +netlbl_cache_add_return: + netlbl_secattr_destroy(&secattr); } /** @@ -2263,8 +2268,8 @@ static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, POLICY_RDLOCK; - if (secattr->cache.data) { - cache = NETLBL_CACHE(secattr->cache.data); + if (secattr->cache) { + cache = NETLBL_CACHE(secattr->cache->data); switch (cache->type) { case NETLBL_CACHE_T_SID: *sid = cache->data.sid; @@ -2369,7 +2374,7 @@ static int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, &secattr, base_sid, sid); - netlbl_secattr_destroy(&secattr, 0); + netlbl_secattr_destroy(&secattr); return rc; } @@ -2415,7 +2420,7 @@ static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid) if (rc == 0) sksec->nlbl_state = NLBL_LABELED; - netlbl_secattr_destroy(&secattr, 0); + netlbl_secattr_destroy(&secattr); netlbl_socket_setsid_return: POLICY_RDUNLOCK; @@ -2517,7 +2522,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) sksec->sid, &nlbl_peer_sid) == 0) sksec->peer_sid = nlbl_peer_sid; - netlbl_secattr_destroy(&secattr, 0); + netlbl_secattr_destroy(&secattr); sksec->nlbl_state = NLBL_REQUIRE; From 388b24057f90ba109d4bf855006a8809c383eb76 Mon Sep 17 00:00:00 2001 From: "paul.moore@hp.com" Date: Thu, 5 Oct 2006 18:28:24 -0400 Subject: [PATCH 02/19] NetLabel: use SECINITSID_UNLABELED for a base SID This patch changes NetLabel to use SECINITSID_UNLABLELED as it's source of SELinux type information when generating a NetLabel context. Signed-off-by: Paul Moore Signed-off-by: James Morris --- security/selinux/ss/services.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index bb2d2bc869ba..18274b005090 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2336,7 +2336,7 @@ static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb, selinux_netlbl_cache_add(skb, &ctx_new); ebitmap_destroy(&ctx_new.range.level[0].cat); } else { - *sid = SECINITSID_UNLABELED; + *sid = SECSID_NULL; rc = 0; } @@ -2519,7 +2519,7 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) if (netlbl_sock_getattr(sk, &secattr) == 0 && selinux_netlbl_secattr_to_sid(NULL, &secattr, - sksec->sid, + SECINITSID_UNLABELED, &nlbl_peer_sid) == 0) sksec->peer_sid = nlbl_peer_sid; netlbl_secattr_destroy(&secattr); @@ -2552,9 +2552,6 @@ u32 selinux_netlbl_inet_conn_request(struct sk_buff *skb, u32 sock_sid) if (rc != 0) return SECSID_NULL; - if (peer_sid == SECINITSID_UNLABELED) - return SECSID_NULL; - return peer_sid; } @@ -2616,11 +2613,13 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, u32 netlbl_sid; u32 recv_perm; - rc = selinux_netlbl_skbuff_getsid(skb, SECINITSID_NETMSG, &netlbl_sid); + rc = selinux_netlbl_skbuff_getsid(skb, + SECINITSID_UNLABELED, + &netlbl_sid); if (rc != 0) return rc; - if (netlbl_sid == SECINITSID_UNLABELED) + if (netlbl_sid == SECSID_NULL) return 0; switch (sksec->sclass) { @@ -2658,10 +2657,6 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, u32 selinux_netlbl_socket_getpeersec_stream(struct socket *sock) { struct sk_security_struct *sksec = sock->sk->sk_security; - - if (sksec->peer_sid == SECINITSID_UNLABELED) - return SECSID_NULL; - return sksec->peer_sid; } @@ -2677,16 +2672,10 @@ u32 selinux_netlbl_socket_getpeersec_stream(struct socket *sock) u32 selinux_netlbl_socket_getpeersec_dgram(struct sk_buff *skb) { int peer_sid; - struct sock *sk = skb->sk; - struct inode_security_struct *isec; - if (sk == NULL || sk->sk_socket == NULL) - return SECSID_NULL; - - isec = SOCK_INODE(sk->sk_socket)->i_security; - if (selinux_netlbl_skbuff_getsid(skb, isec->sid, &peer_sid) != 0) - return SECSID_NULL; - if (peer_sid == SECINITSID_UNLABELED) + if (selinux_netlbl_skbuff_getsid(skb, + SECINITSID_UNLABELED, + &peer_sid) != 0) return SECSID_NULL; return peer_sid; From 134b0fc544ba062498451611cb6f3e4454221b3d Mon Sep 17 00:00:00 2001 From: James Morris Date: Thu, 5 Oct 2006 15:42:27 -0500 Subject: [PATCH 03/19] IPsec: propagate security module errors up from flow_cache_lookup When a security module is loaded (in this case, SELinux), the security_xfrm_policy_lookup() hook can return an access denied permission (or other error). We were not handling that correctly, and in fact inverting the return logic and propagating a false "ok" back up to xfrm_lookup(), which then allowed packets to pass as if they were not associated with an xfrm policy. The way I was seeing the problem was when connecting via IPsec to a confined service on an SELinux box (vsftpd), which did not have the appropriate SELinux policy permissions to send packets via IPsec. The first SYNACK would be blocked, because of an uncached lookup via flow_cache_lookup(), which would fail to resolve an xfrm policy because the SELinux policy is checked at that point via the resolver. However, retransmitted SYNACKs would then find a cached flow entry when calling into flow_cache_lookup() with a null xfrm policy, which is interpreted by xfrm_lookup() as the packet not having any associated policy and similarly to the first case, allowing it to pass without transformation. The solution presented here is to first ensure that errno values are correctly propagated all the way back up through the various call chains from security_xfrm_policy_lookup(), and handled correctly. Then, flow_cache_lookup() is modified, so that if the policy resolver fails (typically a permission denied via the security module), the flow cache entry is killed rather than having a null policy assigned (which indicates that the packet can pass freely). This also forces any future lookups for the same flow to consult the security module (e.g. SELinux) for current security policy (rather than, say, caching the error on the flow cache entry). Signed-off-by: James Morris --- include/net/flow.h | 2 +- net/core/flow.c | 38 +++++++++++++++-------- net/xfrm/xfrm_policy.c | 68 ++++++++++++++++++++++++++++++++---------- 3 files changed, 80 insertions(+), 28 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index ddf5f3ca1720..3b44d72b27d3 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -97,7 +97,7 @@ struct flowi { #define FLOW_DIR_FWD 2 struct sock; -typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, +typedef int (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, void **objp, atomic_t **obj_refp); extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, diff --git a/net/core/flow.c b/net/core/flow.c index f23e7e386543..b16d31ae5e54 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -85,6 +85,14 @@ static void flow_cache_new_hashrnd(unsigned long arg) add_timer(&flow_hash_rnd_timer); } +static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) +{ + if (fle->object) + atomic_dec(fle->object_ref); + kmem_cache_free(flow_cachep, fle); + flow_count(cpu)--; +} + static void __flow_cache_shrink(int cpu, int shrink_to) { struct flow_cache_entry *fle, **flp; @@ -100,10 +108,7 @@ static void __flow_cache_shrink(int cpu, int shrink_to) } while ((fle = *flp) != NULL) { *flp = fle->next; - if (fle->object) - atomic_dec(fle->object_ref); - kmem_cache_free(flow_cachep, fle); - flow_count(cpu)--; + flow_entry_kill(cpu, fle); } } } @@ -220,24 +225,33 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, nocache: { + int err; void *obj; atomic_t *obj_ref; - resolver(key, family, dir, &obj, &obj_ref); + err = resolver(key, family, dir, &obj, &obj_ref); if (fle) { - fle->genid = atomic_read(&flow_cache_genid); + if (err) { + /* Force security policy check on next lookup */ + *head = fle->next; + flow_entry_kill(cpu, fle); + } else { + fle->genid = atomic_read(&flow_cache_genid); - if (fle->object) - atomic_dec(fle->object_ref); + if (fle->object) + atomic_dec(fle->object_ref); - fle->object = obj; - fle->object_ref = obj_ref; - if (obj) - atomic_inc(fle->object_ref); + fle->object = obj; + fle->object_ref = obj_ref; + if (obj) + atomic_inc(fle->object_ref); + } } local_bh_enable(); + if (err) + obj = ERR_PTR(err); return obj; } } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2a7861661f14..fffdd34f3baf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -883,30 +883,32 @@ out: } EXPORT_SYMBOL(xfrm_policy_walk); -/* Find policy to apply to this flow. */ - +/* + * Find policy to apply to this flow. + * + * Returns 0 if policy found, else an -errno. + */ static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl, u8 type, u16 family, int dir) { struct xfrm_selector *sel = &pol->selector; - int match; + int match, ret = -ESRCH; if (pol->family != family || pol->type != type) - return 0; + return ret; match = xfrm_selector_match(sel, fl, family); - if (match) { - if (!security_xfrm_policy_lookup(pol, fl->secid, dir)) - return 1; - } + if (match) + ret = security_xfrm_policy_lookup(pol, fl->secid, dir); - return 0; + return ret; } static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl, u16 family, u8 dir) { + int err; struct xfrm_policy *pol, *ret; xfrm_address_t *daddr, *saddr; struct hlist_node *entry; @@ -922,7 +924,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl, chain = policy_hash_direct(daddr, saddr, family, dir); ret = NULL; hlist_for_each_entry(pol, entry, chain, bydst) { - if (xfrm_policy_match(pol, fl, type, family, dir)) { + err = xfrm_policy_match(pol, fl, type, family, dir); + if (err) { + if (err == -ESRCH) + continue; + else { + ret = ERR_PTR(err); + goto fail; + } + } else { ret = pol; priority = ret->priority; break; @@ -930,36 +940,53 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl, } chain = &xfrm_policy_inexact[dir]; hlist_for_each_entry(pol, entry, chain, bydst) { - if (xfrm_policy_match(pol, fl, type, family, dir) && - pol->priority < priority) { + err = xfrm_policy_match(pol, fl, type, family, dir); + if (err) { + if (err == -ESRCH) + continue; + else { + ret = ERR_PTR(err); + goto fail; + } + } else if (pol->priority < priority) { ret = pol; break; } } if (ret) xfrm_pol_hold(ret); +fail: read_unlock_bh(&xfrm_policy_lock); return ret; } -static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, +static int xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, void **objp, atomic_t **obj_refp) { struct xfrm_policy *pol; + int err = 0; #ifdef CONFIG_XFRM_SUB_POLICY pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_SUB, fl, family, dir); - if (pol) + if (IS_ERR(pol)) { + err = PTR_ERR(pol); + pol = NULL; + } + if (pol || err) goto end; #endif pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, fl, family, dir); - + if (IS_ERR(pol)) { + err = PTR_ERR(pol); + pol = NULL; + } #ifdef CONFIG_XFRM_SUB_POLICY end: #endif if ((*objp = (void *) pol) != NULL) *obj_refp = &pol->refcnt; + return err; } static inline int policy_to_flow_dir(int dir) @@ -1297,6 +1324,8 @@ restart: policy = flow_cache_lookup(fl, dst_orig->ops->family, dir, xfrm_policy_lookup); + if (IS_ERR(policy)) + return PTR_ERR(policy); } if (!policy) @@ -1343,6 +1372,10 @@ restart: fl, family, XFRM_POLICY_OUT); if (pols[1]) { + if (IS_ERR(pols[1])) { + err = PTR_ERR(pols[1]); + goto error; + } if (pols[1]->action == XFRM_POLICY_BLOCK) { err = -EPERM; goto error; @@ -1581,6 +1614,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = flow_cache_lookup(&fl, family, fl_dir, xfrm_policy_lookup); + if (IS_ERR(pol)) + return 0; + if (!pol) { if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); @@ -1599,6 +1635,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, &fl, family, XFRM_POLICY_IN); if (pols[1]) { + if (IS_ERR(pols[1])) + return 0; pols[1]->curlft.use_time = (unsigned long)xtime.tv_sec; npols ++; } From 5b368e61c2bcb2666bb66e2acf1d6d85ba6f474d Mon Sep 17 00:00:00 2001 From: Venkat Yekkirala Date: Thu, 5 Oct 2006 15:42:18 -0500 Subject: [PATCH 04/19] IPsec: correct semantics for SELinux policy matching Currently when an IPSec policy rule doesn't specify a security context, it is assumed to be "unlabeled" by SELinux, and so the IPSec policy rule fails to match to a flow that it would otherwise match to, unless one has explicitly added an SELinux policy rule allowing the flow to "polmatch" to the "unlabeled" IPSec policy rules. In the absence of such an explicitly added SELinux policy rule, the IPSec policy rule fails to match and so the packet(s) flow in clear text without the otherwise applicable xfrm(s) applied. The above SELinux behavior violates the SELinux security notion of "deny by default" which should actually translate to "encrypt by default" in the above case. This was first reported by Evgeniy Polyakov and the way James Morris was seeing the problem was when connecting via IPsec to a confined service on an SELinux box (vsftpd), which did not have the appropriate SELinux policy permissions to send packets via IPsec. With this patch applied, SELinux "polmatching" of flows Vs. IPSec policy rules will only come into play when there's a explicit context specified for the IPSec policy rule (which also means there's corresponding SELinux policy allowing appropriate domains/flows to polmatch to this context). Secondly, when a security module is loaded (in this case, SELinux), the security_xfrm_policy_lookup() hook can return errors other than access denied, such as -EINVAL. We were not handling that correctly, and in fact inverting the return logic and propagating a false "ok" back up to xfrm_lookup(), which then allowed packets to pass as if they were not associated with an xfrm policy. The solution for this is to first ensure that errno values are correctly propagated all the way back up through the various call chains from security_xfrm_policy_lookup(), and handled correctly. Then, flow_cache_lookup() is modified, so that if the policy resolver fails (typically a permission denied via the security module), the flow cache entry is killed rather than having a null policy assigned (which indicates that the packet can pass freely). This also forces any future lookups for the same flow to consult the security module (e.g. SELinux) for current security policy (rather than, say, caching the error on the flow cache entry). This patch: Fix the selinux side of things. This makes sure SELinux polmatching of flow contexts to IPSec policy rules comes into play only when an explicit context is associated with the IPSec policy rule. Also, this no longer defaults the context of a socket policy to the context of the socket since the "no explicit context" case is now handled properly. Signed-off-by: Venkat Yekkirala Signed-off-by: James Morris --- include/linux/security.h | 24 ++++++--------- include/net/xfrm.h | 3 +- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- net/key/af_key.c | 5 ---- net/xfrm/xfrm_policy.c | 7 +++-- net/xfrm/xfrm_user.c | 9 ------ security/dummy.c | 3 +- security/selinux/include/xfrm.h | 3 +- security/selinux/xfrm.c | 53 +++++++++++++++++++++++++-------- 10 files changed, 62 insertions(+), 49 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 9b5fea81f55e..b200b9856f32 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -882,7 +882,8 @@ struct request_sock; * Check permission when a flow selects a xfrm_policy for processing * XFRMs on a packet. The hook is called when selecting either a * per-socket policy or a generic xfrm policy. - * Return 0 if permission is granted. + * Return 0 if permission is granted, -ESRCH otherwise, or -errno + * on other errors. * @xfrm_state_pol_flow_match: * @x contains the state to match. * @xp contains the policy to check for a match. @@ -891,6 +892,7 @@ struct request_sock; * @xfrm_flow_state_match: * @fl contains the flow key to match. * @xfrm points to the xfrm_state to match. + * @xp points to the xfrm_policy to match. * Return 1 if there is a match. * @xfrm_decode_session: * @skb points to skb to decode. @@ -1388,7 +1390,8 @@ struct security_operations { int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 fl_secid, u8 dir); int (*xfrm_state_pol_flow_match)(struct xfrm_state *x, struct xfrm_policy *xp, struct flowi *fl); - int (*xfrm_flow_state_match)(struct flowi *fl, struct xfrm_state *xfrm); + int (*xfrm_flow_state_match)(struct flowi *fl, struct xfrm_state *xfrm, + struct xfrm_policy *xp); int (*xfrm_decode_session)(struct sk_buff *skb, u32 *secid, int ckall); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ @@ -3120,11 +3123,6 @@ static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm return security_ops->xfrm_policy_alloc_security(xp, sec_ctx, NULL); } -static inline int security_xfrm_sock_policy_alloc(struct xfrm_policy *xp, struct sock *sk) -{ - return security_ops->xfrm_policy_alloc_security(xp, NULL, sk); -} - static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) { return security_ops->xfrm_policy_clone_security(old, new); @@ -3175,9 +3173,10 @@ static inline int security_xfrm_state_pol_flow_match(struct xfrm_state *x, return security_ops->xfrm_state_pol_flow_match(x, xp, fl); } -static inline int security_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm) +static inline int security_xfrm_flow_state_match(struct flowi *fl, + struct xfrm_state *xfrm, struct xfrm_policy *xp) { - return security_ops->xfrm_flow_state_match(fl, xfrm); + return security_ops->xfrm_flow_state_match(fl, xfrm, xp); } static inline int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) @@ -3197,11 +3196,6 @@ static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm return 0; } -static inline int security_xfrm_sock_policy_alloc(struct xfrm_policy *xp, struct sock *sk) -{ - return 0; -} - static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) { return 0; @@ -3249,7 +3243,7 @@ static inline int security_xfrm_state_pol_flow_match(struct xfrm_state *x, } static inline int security_xfrm_flow_state_match(struct flowi *fl, - struct xfrm_state *xfrm) + struct xfrm_state *xfrm, struct xfrm_policy *xp) { return 1; } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1e2a4ddec96e..737fdb2ee8a4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -995,7 +995,8 @@ struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto, int create, unsigned short family); extern void xfrm_policy_flush(u8 type); extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); -extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family, int strict); +extern int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst, + struct flowi *fl, int family, int strict); extern void xfrm_init_pmtu(struct dst_entry *dst); extern wait_queue_head_t km_waitq; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7a7a00147e55..1bed0cdf53e3 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -52,7 +52,7 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) xdst->u.rt.fl.fl4_dst == fl->fl4_dst && xdst->u.rt.fl.fl4_src == fl->fl4_src && xdst->u.rt.fl.fl4_tos == fl->fl4_tos && - xfrm_bundle_ok(xdst, fl, AF_INET, 0)) { + xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { dst_clone(dst); break; } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 6a252e2134d1..73cee2ec07e8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -73,7 +73,7 @@ __xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) xdst->u.rt6.rt6i_src.plen); if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && - xfrm_bundle_ok(xdst, fl, AF_INET6, + xfrm_bundle_ok(policy, xdst, fl, AF_INET6, (xdst->u.rt6.rt6i_dst.plen != 128 || xdst->u.rt6.rt6i_src.plen != 128))) { dst_clone(dst); diff --git a/net/key/af_key.c b/net/key/af_key.c index ff98e70b0931..20ff7cca1d07 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2928,11 +2928,6 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, if (*dir) goto out; } - else { - *dir = security_xfrm_sock_policy_alloc(xp, sk); - if (*dir) - goto out; - } *dir = pol->sadb_x_policy_dir-1; return xp; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index fffdd34f3baf..695761ff1321 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1744,7 +1744,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) static int stale_bundle(struct dst_entry *dst) { - return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC, 0); + return !xfrm_bundle_ok(NULL, (struct xfrm_dst *)dst, NULL, AF_UNSPEC, 0); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) @@ -1866,7 +1866,8 @@ EXPORT_SYMBOL(xfrm_init_pmtu); * still valid. */ -int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family, int strict) +int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, + struct flowi *fl, int family, int strict) { struct dst_entry *dst = &first->u.dst; struct xfrm_dst *last; @@ -1883,7 +1884,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family, int str if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family)) return 0; - if (fl && !security_xfrm_flow_state_match(fl, dst->xfrm)) + if (fl && !security_xfrm_flow_state_match(fl, dst->xfrm, pol)) return 0; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d54b3a70d5df..2b2e59d8ffbc 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1992,15 +1992,6 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, xp->type = XFRM_POLICY_TYPE_MAIN; copy_templates(xp, ut, nr); - if (!xp->security) { - int err = security_xfrm_sock_policy_alloc(xp, sk); - if (err) { - kfree(xp); - *dir = err; - return NULL; - } - } - *dir = p->dir; return xp; diff --git a/security/dummy.c b/security/dummy.c index aeee70565509..43874c1e6e23 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -881,7 +881,8 @@ static int dummy_xfrm_state_pol_flow_match(struct xfrm_state *x, return 1; } -static int dummy_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm) +static int dummy_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm, + struct xfrm_policy *xp) { return 1; } diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 81eb59890162..526b28019aca 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -19,7 +19,8 @@ int selinux_xfrm_state_delete(struct xfrm_state *x); int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir); int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, struct flowi *fl); -int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm); +int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm, + struct xfrm_policy *xp); /* diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 3e742b850af6..675b995a67c3 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -77,8 +77,8 @@ static inline int selinux_authorizable_xfrm(struct xfrm_state *x) */ int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir) { - int rc = 0; - u32 sel_sid = SECINITSID_UNLABELED; + int rc; + u32 sel_sid; struct xfrm_sec_ctx *ctx; /* Context sid is either set to label or ANY_ASSOC */ @@ -88,11 +88,21 @@ int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir) sel_sid = ctx->ctx_sid; } + else + /* + * All flows should be treated as polmatch'ing an + * otherwise applicable "non-labeled" policy. This + * would prevent inadvertent "leaks". + */ + return 0; rc = avc_has_perm(fl_secid, sel_sid, SECCLASS_ASSOCIATION, ASSOCIATION__POLMATCH, NULL); + if (rc == -EACCES) + rc = -ESRCH; + return rc; } @@ -108,15 +118,20 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy * u32 pol_sid; int err; - if (x->security) - state_sid = x->security->ctx_sid; - else - state_sid = SECINITSID_UNLABELED; - - if (xp->security) + if (xp->security) { + if (!x->security) + /* unlabeled SA and labeled policy can't match */ + return 0; + else + state_sid = x->security->ctx_sid; pol_sid = xp->security->ctx_sid; - else - pol_sid = SECINITSID_UNLABELED; + } else + if (x->security) + /* unlabeled policy and labeled SA can't match */ + return 0; + else + /* unlabeled policy and unlabeled SA match all flows */ + return 1; err = avc_has_perm(state_sid, pol_sid, SECCLASS_ASSOCIATION, ASSOCIATION__POLMATCH, @@ -125,7 +140,11 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy * if (err) return 0; - return selinux_xfrm_flow_state_match(fl, x); + err = avc_has_perm(fl->secid, state_sid, SECCLASS_ASSOCIATION, + ASSOCIATION__SENDTO, + NULL)? 0:1; + + return err; } /* @@ -133,12 +152,22 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy * * can use a given security association. */ -int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm) +int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm, + struct xfrm_policy *xp) { int rc = 0; u32 sel_sid = SECINITSID_UNLABELED; struct xfrm_sec_ctx *ctx; + if (!xp->security) + if (!xfrm->security) + return 1; + else + return 0; + else + if (!xfrm->security) + return 0; + /* Context sid is either set to label or ANY_ASSOC */ if ((ctx = xfrm->security)) { if (!selinux_authorizable_ctx(ctx)) From 3bccfbc7a7ba4085817deae6e7c67daf0cbd045a Mon Sep 17 00:00:00 2001 From: Venkat Yekkirala Date: Thu, 5 Oct 2006 15:42:35 -0500 Subject: [PATCH 05/19] IPsec: fix handling of errors for socket policies This treats the security errors encountered in the case of socket policy matching, the same as how these are treated in the case of main/sub policies, which is to return a full lookup failure. Signed-off-by: Venkat Yekkirala Signed-off-by: James Morris --- net/xfrm/xfrm_policy.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 695761ff1321..7736b23c3f03 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1016,12 +1016,16 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struc sk->sk_family); int err = 0; - if (match) - err = security_xfrm_policy_lookup(pol, fl->secid, policy_to_flow_dir(dir)); - - if (match && !err) - xfrm_pol_hold(pol); - else + if (match) { + err = security_xfrm_policy_lookup(pol, fl->secid, + policy_to_flow_dir(dir)); + if (!err) + xfrm_pol_hold(pol); + else if (err == -ESRCH) + pol = NULL; + else + pol = ERR_PTR(err); + } else pol = NULL; } read_unlock_bh(&xfrm_policy_lock); @@ -1313,8 +1317,11 @@ restart: pol_dead = 0; xfrm_nr = 0; - if (sk && sk->sk_policy[1]) + if (sk && sk->sk_policy[1]) { policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + if (IS_ERR(policy)) + return PTR_ERR(policy); + } if (!policy) { /* To accelerate a bit... */ @@ -1607,8 +1614,11 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } pol = NULL; - if (sk && sk->sk_policy[dir]) + if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl); + if (IS_ERR(pol)) + return 0; + } if (!pol) pol = flow_cache_lookup(&fl, family, fl_dir, From 6e8c751e07b34d73069e9333f67fbe5ffe31ec3a Mon Sep 17 00:00:00 2001 From: Chad Sellers Date: Fri, 6 Oct 2006 16:09:52 -0400 Subject: [PATCH 06/19] SELinux: Bug fix in polidydb_destroy This patch fixes two bugs in policydb_destroy. Two list pointers (policydb.ocontexts[i] and policydb.genfs) were not being reset to NULL when the lists they pointed to were being freed. This caused a problem when the initial policy load failed, as the policydb being destroyed was not a temporary new policydb that was thrown away, but rather was the global (active) policydb. Consequently, later functions, particularly sys_bind->selinux_socket_bind->security_node_sid and do_rw_proc->selinux_sysctl->selinux_proc_get_sid->security_genfs_sid tried to dereference memory that had previously been freed. Signed-off-by: Chad Sellers Signed-off-by: James Morris --- security/selinux/ss/policydb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index b18895302555..ba48961f9d05 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -618,6 +618,7 @@ void policydb_destroy(struct policydb *p) c = c->next; ocontext_destroy(ctmp,i); } + p->ocontexts[i] = NULL; } g = p->genfs; @@ -633,6 +634,7 @@ void policydb_destroy(struct policydb *p) g = g->next; kfree(gtmp); } + p->genfs = NULL; cond_policydb_destroy(p); From 331c4ee7faa4ee1e1404c872a139784753100498 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 9 Oct 2006 21:34:04 -0700 Subject: [PATCH 07/19] [SCTP]: Fix receive buffer accounting. When doing receiver buffer accounting, we always used skb->truesize. This is problematic when processing bundled DATA chunks because for every DATA chunk that could be small part of one large skb, we would charge the size of the entire skb. The new approach is to store the size of the DATA chunk we are accounting for in the sctp_ulpevent structure and use that stored value for accounting. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 14 ++++++++++++++ include/net/sctp/ulpevent.h | 1 + net/sctp/socket.c | 22 ++++++++++++++++++---- net/sctp/ulpevent.c | 25 +++++++++++++++---------- net/sctp/ulpqueue.c | 2 +- 5 files changed, 49 insertions(+), 15 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index ee68a3124076..764e3af5be93 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -139,6 +139,7 @@ int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait); +void sctp_sock_rfree(struct sk_buff *skb); /* * sctp/primitive.c @@ -444,6 +445,19 @@ static inline struct list_head *sctp_list_dequeue(struct list_head *list) return result; } +/* SCTP version of skb_set_owner_r. We need this one because + * of the way we have to do receive buffer accounting on bundled + * chunks. + */ +static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + struct sctp_ulpevent *event = sctp_skb2event(skb); + + skb->sk = sk; + skb->destructor = sctp_sock_rfree; + atomic_add(event->rmem_len, &sk->sk_rmem_alloc); +} + /* Tests if the list has one and only one entry. */ static inline int sctp_list_single_entry(struct list_head *head) { diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 6c40cfc4832d..1a4ddc1ec7d2 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -63,6 +63,7 @@ struct sctp_ulpevent { __u32 cumtsn; int msg_flags; int iif; + unsigned int rmem_len; }; /* Retrieve the skb this event sits inside of. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3fe906d65069..9deec4391187 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5362,6 +5362,20 @@ static void sctp_wfree(struct sk_buff *skb) sctp_association_put(asoc); } +/* Do accounting for the receive space on the socket. + * Accounting for the association is done in ulpevent.c + * We set this as a destructor for the cloned data skbs so that + * accounting is done at the correct time. + */ +void sctp_sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct sctp_ulpevent *event = sctp_skb2event(skb); + + atomic_sub(event->rmem_len, &sk->sk_rmem_alloc); +} + + /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, size_t msg_len) @@ -5634,10 +5648,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - sock_rfree(skb); + sctp_sock_rfree(skb); __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); - skb_set_owner_r(skb, newsk); + sctp_skb_set_owner_r(skb, newsk); } } @@ -5665,10 +5679,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - sock_rfree(skb); + sctp_sock_rfree(skb); __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); - skb_set_owner_r(skb, newsk); + sctp_skb_set_owner_r(skb, newsk); } } diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index ee236784a6bb..a015283a9087 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -55,10 +55,13 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ -SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) +SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, + int msg_flags, + unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); event->msg_flags = msg_flags; + event->rmem_len = len; } /* Create a new sctp_ulpevent. */ @@ -73,7 +76,7 @@ SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, goto fail; event = sctp_skb2event(skb); - sctp_ulpevent_init(event, msg_flags); + sctp_ulpevent_init(event, msg_flags, skb->truesize); return event; @@ -101,17 +104,16 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); event->asoc = (struct sctp_association *)asoc; - atomic_add(skb->truesize, &event->asoc->rmem_alloc); - skb_set_owner_r(skb, asoc->base.sk); + atomic_add(event->rmem_len, &event->asoc->rmem_alloc); + sctp_skb_set_owner_r(skb, asoc->base.sk); } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { struct sctp_association *asoc = event->asoc; - struct sk_buff *skb = sctp_event2skb(event); - atomic_sub(skb->truesize, &asoc->rmem_alloc); + atomic_sub(event->rmem_len, &asoc->rmem_alloc); sctp_association_put(asoc); } @@ -372,7 +374,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_remote_error( /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); - sctp_ulpevent_init(event, MSG_NOTIFICATION); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); sre = (struct sctp_remote_error *) skb_push(skb, sizeof(struct sctp_remote_error)); @@ -464,7 +466,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); - sctp_ulpevent_init(event, MSG_NOTIFICATION); + sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); ssf = (struct sctp_send_failed *) skb_push(skb, sizeof(struct sctp_send_failed)); @@ -682,8 +684,11 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); - /* Initialize event with flags 0. */ - sctp_ulpevent_init(event, 0); + /* Initialize event with flags 0 and correct length + * Since this is a clone of the original skb, only account for + * the data of this chunk as other chunks will be accounted separately. + */ + sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff)); sctp_ulpevent_receive_data(event, asoc); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 575e556aeb3e..e1d144275f97 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -309,7 +309,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *qu if (!new) return NULL; /* try again later */ - new->sk = f_frag->sk; + sctp_skb_set_owner_r(new, f_frag->sk); skb_shinfo(new)->frag_list = pos; } else From 6aa2551cf135f1d246d31482adc8c679eeea3a83 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 9 Oct 2006 21:34:26 -0700 Subject: [PATCH 08/19] [SCTP]: Fix the RX queue size shown in /proc/net/sctp/assocs output. Show the true receive buffer usage. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/proc.c b/net/sctp/proc.c index a356d8d310a9..7f49e769080e 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -344,7 +344,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) assoc, sk, sctp_sk(sk)->type, sk->sk_state, assoc->state, hash, assoc->assoc_id, assoc->sndbuf_used, - (sk->sk_rcvbuf - assoc->rwnd), + atomic_read(&assoc->rmem_alloc), sock_i_uid(sk), sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); From effee6a00034a8d83a6dea6d221820d87364ac21 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 9 Oct 2006 21:42:14 -0700 Subject: [PATCH 09/19] [NET]: File descriptor loss while receiving SCM_RIGHTS If more than one file descriptor was sent with an SCM_RIGHTS message, and on the receiving end, after installing a nonzero (but not all) file descritpors the process runs out of fds, then the already installed fds will be lost (userspace will have no way of knowing about them). The following patch makes sure, that at least the already installed fds are sent to userspace. It doesn't solve the issue of losing file descriptors in case of an EFAULT on the userspace buffer. Signed-off-by: Miklos Szeredi Signed-off-by: David S. Miller --- net/compat.c | 3 +-- net/core/scm.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/compat.c b/net/compat.c index d5d69fa15d07..52d32f1bc728 100644 --- a/net/compat.c +++ b/net/compat.c @@ -285,8 +285,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) if (i > 0) { int cmlen = CMSG_COMPAT_LEN(i * sizeof(int)); - if (!err) - err = put_user(SOL_SOCKET, &cm->cmsg_level); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) diff --git a/net/core/scm.c b/net/core/scm.c index 649d01ef35b6..271cf060ef8c 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -245,8 +245,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) if (i > 0) { int cmlen = CMSG_LEN(i*sizeof(int)); - if (!err) - err = put_user(SOL_SOCKET, &cm->cmsg_level); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) From 989e5b96e1af399296e2d1a34ca4a5aea1cf6d63 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 10 Oct 2006 14:47:44 -0700 Subject: [PATCH 10/19] [IPV6]: Seperate sit driver to extra module This patch removes the driver of the IPv6-in-IPv4 tunnel driver (sit) from the IPv6 module. It adds an option to Kconfig which makes it possible to compile it as a seperate module. Signed-off-by: Joerg Roedel Signed-off-by: David S. Miller --- net/ipv6/Kconfig | 13 +++++++++++++ net/ipv6/Makefile | 3 ++- net/ipv6/af_inet6.c | 2 -- net/ipv6/sit.c | 3 +++ 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index a460e8132b4d..ef5eaad44851 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -153,6 +153,19 @@ config INET6_XFRM_MODE_ROUTEOPTIMIZATION ---help--- Support for MIPv6 route optimization mode. +config IPV6_SIT + tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)" + depends on IPV6 + default y + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This driver implements encapsulation of IPv6 + into IPv4 packets. This is useful if you want to connect two IPv6 + networks over an IPv4-only path. + + Saying M here will produce a module called sit.ko. If unsure, say Y. + config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" select INET6_TUNNEL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 87274e47fe32..addcc011bc01 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_IPV6) += ipv6.o -ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ +ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ @@ -29,6 +29,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-y += exthdrs_core.o diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e94eccb99707..858cae29581c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -850,7 +850,6 @@ static int __init inet6_init(void) err = addrconf_init(); if (err) goto addrconf_fail; - sit_init(); /* Init v6 extension headers. */ ipv6_rthdr_init(); @@ -927,7 +926,6 @@ static void __exit inet6_exit(void) mip6_fini(); #endif /* Cleanup code parts. */ - sit_cleanup(); ip6_flowlabel_cleanup(); addrconf_cleanup(); ip6_route_cleanup(); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 836eecd7e62b..dc5765b62b87 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -850,3 +850,6 @@ int __init sit_init(void) inet_del_protocol(&sit_protocol, IPPROTO_IPV6); goto out; } + +module_init(sit_init); +module_exit(sit_cleanup); From 0be669bb37693103c15e64610454f8f431a38feb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 10 Oct 2006 14:49:53 -0700 Subject: [PATCH 11/19] [IPV6]: Seperate sit driver to extra module (addrconf.c changes) This patch contains the changes to net/ipv6/addrconf.c to remove sit specific code if the sit driver is not selected. Signed-off-by: Joerg Roedel Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e03c33b2465b..b312a5f7a759 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -396,8 +396,10 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) ndev->regen_timer.data = (unsigned long) ndev; if ((dev->flags&IFF_LOOPBACK) || dev->type == ARPHRD_TUNNEL || - dev->type == ARPHRD_NONE || - dev->type == ARPHRD_SIT) { +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + dev->type == ARPHRD_SIT || +#endif + dev->type == ARPHRD_NONE) { printk(KERN_INFO "%s: Disabled Privacy Extensions\n", dev->name); @@ -1546,8 +1548,10 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, This thing is done here expecting that the whole class of non-broadcast devices need not cloning. */ +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT)) cfg.fc_flags |= RTF_NONEXTHOP; +#endif ip6_route_add(&cfg); } @@ -1569,6 +1573,7 @@ static void addrconf_add_mroute(struct net_device *dev) ip6_route_add(&cfg); } +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) static void sit_route_add(struct net_device *dev) { struct fib6_config cfg = { @@ -1582,6 +1587,7 @@ static void sit_route_add(struct net_device *dev) /* prefix length - 96 bits "::d.d.d.d" */ ip6_route_add(&cfg); } +#endif static void addrconf_add_lroute(struct net_device *dev) { @@ -1852,6 +1858,7 @@ int addrconf_set_dstaddr(void __user *arg) if (dev == NULL) goto err_exit; +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) if (dev->type == ARPHRD_SIT) { struct ifreq ifr; mm_segment_t oldfs; @@ -1881,6 +1888,7 @@ int addrconf_set_dstaddr(void __user *arg) err = dev_open(dev); } } +#endif err_exit: rtnl_unlock(); @@ -2010,6 +2018,7 @@ int addrconf_del_ifaddr(void __user *arg) return err; } +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) static void sit_add_v4_addrs(struct inet6_dev *idev) { struct inet6_ifaddr * ifp; @@ -2078,6 +2087,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) } } } +#endif static void init_loopback(struct net_device *dev) { @@ -2141,6 +2151,7 @@ static void addrconf_dev_config(struct net_device *dev) addrconf_add_linklocal(idev, &addr); } +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) static void addrconf_sit_config(struct net_device *dev) { struct inet6_dev *idev; @@ -2166,6 +2177,7 @@ static void addrconf_sit_config(struct net_device *dev) } else sit_route_add(dev); } +#endif static inline int ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) @@ -2260,9 +2272,11 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, } switch(dev->type) { +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) case ARPHRD_SIT: addrconf_sit_config(dev); break; +#endif case ARPHRD_TUNNEL6: addrconf_ip6_tnl_config(dev); break; From 4244f8a9f86a6d6e820b4cb53835c15c56d41aff Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 10 Oct 2006 19:40:50 -0700 Subject: [PATCH 12/19] [TCP]: Use TCPOLEN_TSTAMP_ALIGNED macro instead of magic number. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c83938b8fcb1..f745095196f7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -578,7 +578,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, struct tcphdr *th = skb->h.th; struct { struct tcphdr th; - u32 tsopt[3]; + u32 tsopt[TCPOLEN_TSTAMP_ALIGNED >> 2]; } rep; struct ip_reply_arg arg; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3b6575478fcc..34e004bdeab0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -653,7 +653,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 int tot_len = sizeof(struct tcphdr); if (ts) - tot_len += 3*4; + tot_len += TCPOLEN_TSTAMP_ALIGNED; buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); From 496c98dff8e353880299168d36fa082d6fba5237 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 10 Oct 2006 19:41:21 -0700 Subject: [PATCH 13/19] [NET]: Use hton{l,s}() for non-initializers. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/tcp_output.c | 31 ++++++++++++++++++++----------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f5fba051df3d..d5b5dec075b8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -611,8 +611,8 @@ static int ipgre_rcv(struct sk_buff *skb) * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (flags == 0 && - skb->protocol == __constant_htons(ETH_P_WCCP)) { - skb->protocol = __constant_htons(ETH_P_IP); + skb->protocol == htons(ETH_P_WCCP)) { + skb->protocol = htons(ETH_P_IP); if ((*(h + offset) & 0xF0) != 0x40) offset += 4; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a253faefc81..f22536e32cb1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -273,10 +273,10 @@ static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, __u32 tstamp) { if (tp->rx_opt.tstamp_ok) { - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); *ptr++ = htonl(tp->rx_opt.ts_recent); } @@ -325,18 +325,27 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); if (ts) { if(sack) - *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | + (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); else - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); /* TSVAL */ *ptr++ = htonl(ts_recent); /* TSECR */ } else if(sack) - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | + TCPOLEN_SACK_PERM); if (offer_wscale) - *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | + (wscale)); } /* This routine actually transmits TCP packets queued in by From 9469c7b4aa210ce94c6e7208cfadbd0aca0ebe08 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 10 Oct 2006 19:41:46 -0700 Subject: [PATCH 14/19] [NET]: Use typesafe inet_twsk() inline function instead of cast. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 6 +++--- net/dccp/ipv6.c | 6 +++--- net/ipv4/tcp_ipv4.c | 16 +++++++--------- net/ipv6/tcp_ipv6.c | 11 +++++------ 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bf692c1c116f..7e746c4c1688 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -311,7 +311,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) } if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); return; } @@ -614,7 +614,7 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - inet_twsk_put((struct inet_timewait_sock *)nsk); + inet_twsk_put(inet_twsk(nsk)); return NULL; } @@ -980,7 +980,7 @@ discard_and_relse: goto discard_it; do_time_wait: - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); goto no_dccp_socket; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7a47399cf31f..7171a78671aa 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -285,7 +285,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); return; } @@ -663,7 +663,7 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - inet_twsk_put((struct inet_timewait_sock *)nsk); + inet_twsk_put(inet_twsk(nsk)); return NULL; } @@ -1109,7 +1109,7 @@ discard_and_relse: goto discard_it; do_time_wait: - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); goto no_dccp_socket; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f745095196f7..6bbd98575172 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -355,7 +355,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } if (sk->sk_state == TCP_TIME_WAIT) { - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); return; } @@ -960,7 +960,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - inet_twsk_put((struct inet_timewait_sock *)nsk); + inet_twsk_put(inet_twsk(nsk)); return NULL; } @@ -1154,26 +1154,24 @@ discard_and_relse: do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - inet_twsk_put((struct inet_timewait_sock *) sk); + inet_twsk_put(inet_twsk(sk)); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - inet_twsk_put((struct inet_timewait_sock *) sk); + inet_twsk_put(inet_twsk(sk)); goto discard_it; } - switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, - skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, skb->nh.iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule((struct inet_timewait_sock *)sk, - &tcp_death_row); - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 34e004bdeab0..4c2a7c0cafef 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -329,7 +329,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (sk->sk_state == TCP_TIME_WAIT) { - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); return; } @@ -749,7 +749,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - inet_twsk_put((struct inet_timewait_sock *)nsk); + inet_twsk_put(inet_twsk(nsk)); return NULL; } @@ -1283,18 +1283,17 @@ discard_and_relse: do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); goto discard_it; } if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - inet_twsk_put((struct inet_timewait_sock *)sk); + inet_twsk_put(inet_twsk(sk)); goto discard_it; } - switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, - skb, th)) { + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2; From 42b6785eeb40fe3e9dab9981b6e3231a77c7c2f6 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 10 Oct 2006 19:42:09 -0700 Subject: [PATCH 15/19] [NET]: Introduce protocol-specific destructor for time-wait sockets. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + include/net/timewait_sock.h | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 6d14c22a00c5..5f48748fe017 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -196,6 +196,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw) { if (atomic_dec_and_test(&tw->tw_refcnt)) { struct module *owner = tw->tw_prot->owner; + twsk_destructor((struct sock *)tw); #ifdef SOCK_REFCNT_DEBUG printk(KERN_DEBUG "%s timewait_sock %p released\n", tw->tw_prot->name, tw); diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 2544281e1d5e..be293d795e38 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -19,6 +19,7 @@ struct timewait_sock_ops { unsigned int twsk_obj_size; int (*twsk_unique)(struct sock *sk, struct sock *sktw, void *twp); + void (*twsk_destructor)(struct sock *sk); }; static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -28,4 +29,10 @@ static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) return 0; } +static inline void twsk_destructor(struct sock *sk) +{ + if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) + sk->sk_prot->twsk_prot->twsk_destructor(sk); +} + #endif /* _TIMEWAIT_SOCK_H */ From 8238b218ec883adb44d710960a031c76105274cd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 12 Oct 2006 00:49:15 -0700 Subject: [PATCH 16/19] [NET]: Do not memcmp() over pad bytes of struct flowi. They are not necessarily initialized to zero by the compiler, for example when using run-time initializers of automatic on-stack variables. Noticed by Eric Dumazet and Patrick McHardy. Signed-off-by: David S. Miller --- net/decnet/dn_route.c | 11 ++++++++--- net/ipv4/route.c | 12 +++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index dd0761e3d280..a2a43d8d93fe 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -267,9 +267,14 @@ static void dn_dst_link_failure(struct sk_buff *skb) static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return memcmp(&fl1->nl_u.dn_u, &fl2->nl_u.dn_u, sizeof(fl1->nl_u.dn_u)) == 0 && - fl1->oif == fl2->oif && - fl1->iif == fl2->iif; + return ((fl1->nl_u.dn_u.daddr ^ fl2->nl_u.dn_u.daddr) | + (fl1->nl_u.dn_u.saddr ^ fl2->nl_u.dn_u.saddr) | +#ifdef CONFIG_IP_ROUTE_FWMARK + (fl1->nl_u.dn_u.fwmark ^ fl2->nl_u.dn_u.fwmark) | +#endif + (fl1->nl_u.dn_u.scope ^ fl2->nl_u.dn_u.scope) | + (fl1->oif ^ fl2->oif) | + (fl1->iif ^ fl2->iif)) == 0; } static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c41ddba02e9d..925ee4dfc32c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -566,9 +566,15 @@ static inline u32 rt_score(struct rtable *rt) static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && - fl1->oif == fl2->oif && - fl1->iif == fl2->iif; + return ((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | + (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | +#ifdef CONFIG_IP_ROUTE_FWMARK + (fl1->nl_u.ip4_u.fwmark ^ fl2->nl_u.ip4_u.fwmark) | +#endif + (*(u16 *)&fl1->nl_u.ip4_u.tos ^ + *(u16 *)&fl2->nl_u.ip4_u.tos) | + (fl1->oif ^ fl2->oif) | + (fl1->iif ^ fl2->iif)) == 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED From 52c41a3224666d252d34597b580f1b6d4dc440e7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 12 Oct 2006 01:48:20 -0700 Subject: [PATCH 17/19] [DECNET]: Fix sfuzz hanging on 2.6.18 Dave Jones wrote: > sfuzz D 724EF62A 2828 28717 28691 (NOTLB) > cd69fe98 00000082 0000012d 724ef62a 0001971a 00000010 00000007 df6d22b0 > dfd81080 725bbc5e 0001971a 000cc634 00000001 df6d23bc c140e260 00000202 > de1d5ba0 cd69fea0 de1d5ba0 00000000 00000000 de1d5b60 de1d5b8c de1d5ba0 > Call Trace: > [] lock_sock+0x75/0xa6 > [] dn_getname+0x18/0x5f [decnet] > [] sys_getsockname+0x5c/0xb0 > [] sys_socketcall+0xef/0x261 > [] syscall_call+0x7/0xb > DWARF2 unwinder stuck at syscall_call+0x7/0xb > > I wonder if the plethora of lockdep related changes inadvertantly broke something? Looks like unbalanced locking. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/decnet/af_decnet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 70e027375682..3456cd331835 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1178,8 +1178,10 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len if (peer) { if ((sock->state != SS_CONNECTED && sock->state != SS_CONNECTING) && - scp->accept_mode == ACC_IMMED) + scp->accept_mode == ACC_IMMED) { + release_sock(sk); return -ENOTCONN; + } memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn)); } else { From b974179abef7cd680b80bd7c7042802bdd6f0eb6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 12 Oct 2006 01:50:30 -0700 Subject: [PATCH 18/19] [RTNETLINK]: Fix use of wrong skb in do_getlink() skb is the netlink query, nskb is the reply message. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 221e4038216b..02f3c7947898 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -602,7 +602,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) goto errout; } - err = rtnl_unicast(skb, NETLINK_CB(skb).pid); + err = rtnl_unicast(nskb, NETLINK_CB(skb).pid); errout: kfree(iw_buf); dev_put(dev); From 30bdbe397bf58131a91fd836f60972442bed0544 Mon Sep 17 00:00:00 2001 From: Akinbou Mita Date: Thu, 12 Oct 2006 01:52:05 -0700 Subject: [PATCH 19/19] [PKT_SCHED] sch_htb: use rb_first() cleanup Use rb_first() to get first entry in rb tree. Signed-off-by: Akinbou Mita Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index bb3ddd4784b1..9b9c555c713f 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -786,11 +786,10 @@ static long htb_do_events(struct htb_sched *q, int level) for (i = 0; i < 500; i++) { struct htb_class *cl; long diff; - struct rb_node *p = q->wait_pq[level].rb_node; + struct rb_node *p = rb_first(&q->wait_pq[level]); + if (!p) return 0; - while (p->rb_left) - p = p->rb_left; cl = rb_entry(p, struct htb_class, pq_node); if (time_after(cl->pq_key, q->jiffies)) {