diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2761856987b2..eb125a4c30b3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -493,6 +493,9 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG + * while socket was owned by user. + */ #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -518,6 +521,9 @@ enum tsq_flags { TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call + * tcp_v{4|6}_mtu_reduced() + */ }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) diff --git a/include/net/sock.h b/include/net/sock.h index 88de092df50f..e067f8c18f88 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -859,6 +859,7 @@ struct proto { struct sk_buff *skb); void (*release_cb)(struct sock *sk); + void (*mtu_reduced)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ void (*hash)(struct sock *sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 59110caeb074..bc5432e3c778 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -275,12 +275,15 @@ failure: EXPORT_SYMBOL(tcp_v4_connect); /* - * This routine does path mtu discovery as defined in RFC1191. + * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. + * It can be called through tcp_release_cb() if socket was owned by user + * at the time tcp_v4_err() was called to handle ICMP message. */ -static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) +static void tcp_v4_mtu_reduced(struct sock *sk) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); + u32 mtu = tcp_sk(sk)->mtu_info; /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -373,8 +376,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. + * We do take care of PMTU discovery (RFC1191) special case : + * we can receive locally generated ICMP messages while socket is held. */ - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(sk) && + type != ICMP_DEST_UNREACH && + code != ICMP_FRAG_NEEDED) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -409,8 +416,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + tp->mtu_info = info; if (!sock_owned_by_user(sk)) - do_pmtu_discovery(sk, iph, info); + tcp_v4_mtu_reduced(sk); + else + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); goto out; } @@ -2596,6 +2606,7 @@ struct proto tcp_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v4_mtu_reduced, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 950aebfd9967..33cd065cfbd8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -885,7 +885,8 @@ static void tcp_tasklet_func(unsigned long data) #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ (1UL << TCP_WRITE_TIMER_DEFERRED) | \ - (1UL << TCP_DELACK_TIMER_DEFERRED)) + (1UL << TCP_DELACK_TIMER_DEFERRED) | \ + (1UL << TCP_MTU_REDUCED_DEFERRED)) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket @@ -914,6 +915,9 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) tcp_delack_timer_handler(sk); + + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) + sk->sk_prot->mtu_reduced(sk); } EXPORT_SYMBOL(tcp_release_cb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0302ec3fecfc..f49476e2d884 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -315,6 +315,23 @@ failure: return err; } +static void tcp_v6_mtu_reduced(struct sock *sk) +{ + struct dst_entry *dst; + + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + return; + + dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); + if (!dst) + return; + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } +} + static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -342,7 +359,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } bh_lock_sock(sk); - if (sock_owned_by_user(sk)) + if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) @@ -371,21 +388,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); - tcp_simple_retransmit(sk); - } + tp->mtu_info = ntohl(info); + if (!sock_owned_by_user(sk)) + tcp_v6_mtu_reduced(sk); + else + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); goto out; } @@ -1949,6 +1956,7 @@ struct proto tcpv6_prot = { .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, + .mtu_reduced = tcp_v6_mtu_reduced, .hash = tcp_v6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port,