740b0f1841
Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
195 lines
4.9 KiB
C
195 lines
4.9 KiB
C
/*
|
|
* TCP HYBLA
|
|
*
|
|
* TCP-HYBLA Congestion control algorithm, based on:
|
|
* C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
|
|
* for Heterogeneous Networks",
|
|
* International Journal on satellite Communications,
|
|
* September 2004
|
|
* Daniele Lacamera
|
|
* root at danielinux.net
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <net/tcp.h>
|
|
|
|
/* Tcp Hybla structure. */
|
|
struct hybla {
|
|
bool hybla_en;
|
|
u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
|
|
u32 rho; /* Rho parameter, integer part */
|
|
u32 rho2; /* Rho * Rho, integer part */
|
|
u32 rho_3ls; /* Rho parameter, <<3 */
|
|
u32 rho2_7ls; /* Rho^2, <<7 */
|
|
u32 minrtt_us; /* Minimum smoothed round trip time value seen */
|
|
};
|
|
|
|
/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
|
|
static int rtt0 = 25;
|
|
module_param(rtt0, int, 0644);
|
|
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
|
|
|
|
|
|
/* This is called to refresh values for hybla parameters */
|
|
static inline void hybla_recalc_param (struct sock *sk)
|
|
{
|
|
struct hybla *ca = inet_csk_ca(sk);
|
|
|
|
ca->rho_3ls = max_t(u32,
|
|
tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
|
|
8U);
|
|
ca->rho = ca->rho_3ls >> 3;
|
|
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
|
|
ca->rho2 = ca->rho2_7ls >> 7;
|
|
}
|
|
|
|
static void hybla_init(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct hybla *ca = inet_csk_ca(sk);
|
|
|
|
ca->rho = 0;
|
|
ca->rho2 = 0;
|
|
ca->rho_3ls = 0;
|
|
ca->rho2_7ls = 0;
|
|
ca->snd_cwnd_cents = 0;
|
|
ca->hybla_en = true;
|
|
tp->snd_cwnd = 2;
|
|
tp->snd_cwnd_clamp = 65535;
|
|
|
|
/* 1st Rho measurement based on initial srtt */
|
|
hybla_recalc_param(sk);
|
|
|
|
/* set minimum rtt as this is the 1st ever seen */
|
|
ca->minrtt_us = tp->srtt_us;
|
|
tp->snd_cwnd = ca->rho;
|
|
}
|
|
|
|
/*
 * Congestion-control .set_state hook: record whether the connection is
 * in TCP_CA_Open.  hybla_cong_avoid() uses the Reno increase whenever
 * the flag is clear.
 */
static void hybla_state(struct sock *sk, u8 ca_state)
{
	struct hybla *ca = inet_csk_ca(sk);

	if (ca_state == TCP_CA_Open)
		ca->hybla_en = true;
	else
		ca->hybla_en = false;
}
|
|
|
|
/*
 * Return 2^(odds/8) scaled by 128 (<<7) for odds in 0..7.
 * Any other value of odds yields 128, i.e. the same as odds == 0.
 */
static inline u32 hybla_fraction(u32 odds)
{
	static const u32 pow2_eighths_7ls[] = {
		128, 139, 152, 165, 181, 197, 215, 234,
	};

	if (odds >= ARRAY_SIZE(pow2_eighths_7ls))
		return 128;

	return pow2_eighths_7ls[odds];
}
|
|
|
|
/* TCP Hybla main routine.
|
|
* This is the algorithm behavior:
|
|
* o Recalc Hybla parameters if min_rtt has changed
|
|
* o Give cwnd a new value based on the model proposed
|
|
* o remember increments <1
|
|
*/
|
|
static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
|
|
u32 in_flight)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct hybla *ca = inet_csk_ca(sk);
|
|
u32 increment, odd, rho_fractions;
|
|
int is_slowstart = 0;
|
|
|
|
/* Recalculate rho only if this srtt is the lowest */
|
|
if (tp->srtt_us < ca->minrtt_us) {
|
|
hybla_recalc_param(sk);
|
|
ca->minrtt_us = tp->srtt_us;
|
|
}
|
|
|
|
if (!tcp_is_cwnd_limited(sk, in_flight))
|
|
return;
|
|
|
|
if (!ca->hybla_en) {
|
|
tcp_reno_cong_avoid(sk, ack, acked, in_flight);
|
|
return;
|
|
}
|
|
|
|
if (ca->rho == 0)
|
|
hybla_recalc_param(sk);
|
|
|
|
rho_fractions = ca->rho_3ls - (ca->rho << 3);
|
|
|
|
if (tp->snd_cwnd < tp->snd_ssthresh) {
|
|
/*
|
|
* slow start
|
|
* INC = 2^RHO - 1
|
|
* This is done by splitting the rho parameter
|
|
* into 2 parts: an integer part and a fraction part.
|
|
* Inrement<<7 is estimated by doing:
|
|
* [2^(int+fract)]<<7
|
|
* that is equal to:
|
|
* (2^int) * [(2^fract) <<7]
|
|
* 2^int is straightly computed as 1<<int,
|
|
* while we will use hybla_slowstart_fraction_increment() to
|
|
* calculate 2^fract in a <<7 value.
|
|
*/
|
|
is_slowstart = 1;
|
|
increment = ((1 << min(ca->rho, 16U)) *
|
|
hybla_fraction(rho_fractions)) - 128;
|
|
} else {
|
|
/*
|
|
* congestion avoidance
|
|
* INC = RHO^2 / W
|
|
* as long as increment is estimated as (rho<<7)/window
|
|
* it already is <<7 and we can easily count its fractions.
|
|
*/
|
|
increment = ca->rho2_7ls / tp->snd_cwnd;
|
|
if (increment < 128)
|
|
tp->snd_cwnd_cnt++;
|
|
}
|
|
|
|
odd = increment % 128;
|
|
tp->snd_cwnd += increment >> 7;
|
|
ca->snd_cwnd_cents += odd;
|
|
|
|
/* check when fractions goes >=128 and increase cwnd by 1. */
|
|
while (ca->snd_cwnd_cents >= 128) {
|
|
tp->snd_cwnd++;
|
|
ca->snd_cwnd_cents -= 128;
|
|
tp->snd_cwnd_cnt = 0;
|
|
}
|
|
/* check when cwnd has not been incremented for a while */
|
|
if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
|
|
tp->snd_cwnd++;
|
|
tp->snd_cwnd_cnt = 0;
|
|
}
|
|
/* clamp down slowstart cwnd to ssthresh value. */
|
|
if (is_slowstart)
|
|
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
|
|
|
|
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
|
|
}
|
|
|
|
/*
 * Congestion-control operations registered under the name "hybla".
 * The slow start threshold is computed by the Reno helper.
 */
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
	.name		= "hybla",
	.owner		= THIS_MODULE,

	.init		= hybla_init,
	.set_state	= hybla_state,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= hybla_cong_avoid,
};
|
|
|
|
/*
 * Module entry point.  Fails the build if struct hybla is larger than
 * ICSK_CA_PRIV_SIZE, then registers the Hybla operations and returns
 * whatever tcp_register_congestion_control() reports.
 */
static int __init hybla_register(void)
{
	int err;

	BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);

	err = tcp_register_congestion_control(&tcp_hybla);
	return err;
}
|
|
|
|
/* Module exit point: unregister the Hybla congestion-control operations. */
static void __exit hybla_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_hybla);
}
|
|
|
|
module_init(hybla_register);
|
|
module_exit(hybla_unregister);
|
|
|
|
MODULE_AUTHOR("Daniele Lacamera");
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_DESCRIPTION("TCP Hybla");
|