diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5fb1e75a32a9..423438dd6fe9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -262,6 +262,9 @@ extern int sysctl_tcp_slow_start_after_idle;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_recovery;
+#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
+
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern int sysctl_tcp_min_tso_segs;
@@ -1043,6 +1046,7 @@ static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
 
 	tp->do_early_retrans = sysctl_tcp_early_retrans &&
 		sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
+		!(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) &&
 		net->ipv4.sysctl_tcp_reordering == 3;
 }
 
@@ -1859,13 +1863,6 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
-
-/* Flags to enable various loss recovery features. See below */
-extern int sysctl_tcp_recovery;
-
-/* Use TCP RACK to detect (some) tail and retransmit losses */
-#define TCP_RACK_LOST_RETRANS 0x1
-
 extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     const struct skb_mstamp *xmit_time,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c98dc874825..4ad75b8c4fee 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  * F.e. after RTO, when all the queue is considered as lost,
  * lost_out = packets_out and in_flight = retrans_out.
  *
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
  * lost packets.
  *
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal to the DUPACK threshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK: it is the simplest heuristics. As soon as we decided
  * that something is lost, we decide that _all_ not SACKed
  * packets until the most forward SACK are lost. I.e.
  * lost_out = fackets_out - sacked_out and left_out = fackets_out.
@@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  * takes place. We use FACK by default until reordering
  * is suspected on the path to this destination.
  *
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
  * is lost (classic Reno). While we are in Recovery and
  * a partial ACK arrives, we assume that one more packet
  * is lost (NewReno). This heuristics are the same in NewReno
  * and SACK.
  *
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
  * Really tricky (and requiring careful tuning) part of algorithm
  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
  * The first determines the moment _when_ we should reduce CWND and,
@@ -2807,7 +2820,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+	if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk, ack_time);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 1e330a2f913d..4ecb38ae8504 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,7 +1,7 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
-int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;
 
 static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 {
@@ -24,7 +24,9 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
 	       (t1->v64 == t2->v64 && after(seq1, seq2));
 }
 
-/* Marks a packet lost, if some packet sent later has been (s)acked.
+/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+ *
+ * Marks a packet lost, if some packet sent later has been (s)acked.
  * The underlying idea is similar to the traditional dupthresh and FACK
  * but they look at different metrics:
  *
@@ -37,8 +39,10 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1,
  * is being more resilient to reordering by simply allowing some
  * "settling delay", instead of tweaking the dupthresh.
  *
- * The current version is only used after recovery starts but can be
- * easily extended to detect the first loss.
+ * When tcp_rack_detect_loss() detects some packets are lost and we
+ * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
+ * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
+ * make us enter the CA_Recovery state.
  */
 static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
 				 u32 *reo_timeout)
@@ -54,7 +58,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
 	 * to queuing or delayed ACKs.
 	 */
 	reo_wnd = 1000;
-	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
 	tcp_for_write_queue(skb, sk) {
@@ -105,7 +109,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 timeout;
 
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+	if (!tp->rack.advanced)
 		return;
 
 	/* Reset the advanced flag to avoid unnecessary queue scanning */
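
To make the timing rule described in the new tcp_input.c comment concrete, below is a minimal userspace sketch of the RACK loss test from draft-ietf-tcpm-rack-01: a packet is marked lost once some packet sent after it has been (s)acked and more than RTT + reordering window has elapsed since the packet was sent. The type, field and function names (struct rack, sent_after(), rack_skb_lost()) are illustrative assumptions for this sketch, not the kernel's tcp_rack_detect_loss() internals, and the sequence comparison ignores wraparound.

/* Minimal sketch of the RACK loss rule; names and the microsecond clock
 * are assumptions made for illustration, not kernel data structures.
 */
#include <stdbool.h>
#include <stdint.h>

struct rack {
	uint64_t xmit_time;	/* send time of the most recently (s)acked packet, usec */
	uint32_t end_seq;	/* its end sequence, used to break timestamp ties */
	uint64_t rtt_us;	/* RTT measured from that packet */
	uint64_t min_rtt_us;	/* path minimum RTT, sizes the reordering window */
	bool	 reord;		/* reordering has been observed on this connection */
	uint32_t lost_out;	/* packets currently marked lost */
};

/* True if the (s)acked reference packet was sent after the packet under test
 * (the role of tcp_rack_sent_after() in the patch); wraparound is ignored here.
 */
static bool sent_after(uint64_t t1, uint64_t t2, uint32_t seq1, uint32_t seq2)
{
	return t1 > t2 || (t1 == t2 && seq1 > seq2);
}

/* Should the packet identified by (skb_xmit_time, skb_end_seq) be marked lost
 * at time @now? Lost means: a later-sent packet has been (s)acked and more
 * than RTT + reordering window has elapsed since this packet was sent.
 */
static bool rack_skb_lost(const struct rack *r, uint64_t skb_xmit_time,
			  uint32_t skb_end_seq, uint64_t now)
{
	uint64_t reo_wnd = 1000;	/* 1 msec default, as in the patch */

	/* Allow a min_rtt/4 settling delay when reordering was seen, or
	 * (the tp->lost_out change above) before the first loss is marked.
	 */
	if ((r->reord || !r->lost_out) && r->min_rtt_us != UINT64_MAX &&
	    r->min_rtt_us / 4 > reo_wnd)
		reo_wnd = r->min_rtt_us / 4;

	if (!sent_after(r->xmit_time, skb_xmit_time, r->end_seq, skb_end_seq))
		return false;

	return now - skb_xmit_time > r->rtt_us + reo_wnd;
}

For example, with rtt_us = 10000 (10 ms) and the default 1 ms window, a packet sent at t = 0 whose later-sent sibling has already been (s)acked is declared lost once now exceeds roughly 11 ms, regardless of how many DUPACKs have arrived.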