/* Bottleneck Bandwidth and RTT (BBR) congestion control
 *
 * BBR congestion control computes the sending rate based on the delivery
 * rate (throughput) estimated from ACKs. In a nutshell:
 *
 * On each ACK, update our model of the network path:
 *	bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
 *	min_rtt = windowed_min(rtt, 10 seconds)
 *	pacing_rate = pacing_gain * bottleneck_bandwidth
 *	cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
 *
 * The core algorithm does not react directly to packet losses or delays,
 * although BBR may adjust the size of next send per ACK when loss is
 * observed, or adjust the sending rate if it estimates there is a
 * traffic policer, in order to keep the drop rate reasonable.
 *
 * BBR is described in detail in:
 *   "BBR: Congestion-Based Congestion Control",
 *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
 *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
 *
 * There is a public e-mail list for discussing BBR development and testing:
 *   https://groups.google.com/forum/#!forum/bbr-dev
 *
 * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled,
 * since pacing is integral to the BBR design and implementation.
 * BBR without pacing would not function properly, and may incur unnecessarily
 * high packet loss rates.
 */
#include <linux/module.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
#include <linux/inet.h>
#include <linux/random.h>
#include <linux/win_minmax.h>

/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
 * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
 * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
 * Since the minimum window is >=4 packets, the lower bound isn't
 * an issue. The upper bound isn't an issue with existing technologies.
 */
#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
#define BBR_UNIT (1 << BBR_SCALE)
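
/* Concretely, with BW_SCALE = 24 a stored bandwidth of 1 is 2^-24 packets
 * per usec, i.e. ~0.06 packets per second, or ~715 bps assuming 1500-byte
 * packets, while the largest u32 value (2^32 >> 24 = 256 packets per usec)
 * is ~3 Tbps. With BBR_SCALE = 8, gains are fractions of BBR_UNIT = 256,
 * so e.g. a gain of 1.25 is stored as 320 and a gain of 2.0 as 512.
 */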

/* BBR has the following modes for deciding how fast to send: */
enum bbr_mode {
	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
	BBR_DRAIN,	/* drain any queue created during startup */
	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
	BBR_PROBE_RTT,	/* cut cwnd to min to probe min_rtt */
};

/* BBR congestion control block */
struct bbr {
	u32	min_rtt_us;		/* min RTT in min_rtt_win_sec window */
	u32	min_rtt_stamp;		/* timestamp of min_rtt_us */
	u32	probe_rtt_done_stamp;	/* end time for BBR_PROBE_RTT mode */
	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
	u32	rtt_cnt;	/* count of packet-timed rounds elapsed */
	u32	next_rtt_delivered;	/* scb->tx.delivered at end of round */
	struct skb_mstamp cycle_mstamp;	/* time of this cycle phase start */
	u32	mode:3,			/* current bbr_mode in state machine */
		prev_ca_state:3,	/* CA state on previous ACK */
		packet_conservation:1,	/* use packet conservation? */
		restore_cwnd:1,		/* decided to revert cwnd to old value */
		round_start:1,		/* start of packet-timed tx->ack round? */
		tso_segs_goal:7,	/* segments we want in each skb we send */
		idle_restart:1,		/* restarting after idle? */
		probe_rtt_round_done:1,	/* a BBR_PROBE_RTT round at 4 pkts? */
		unused:5,
		lt_is_sampling:1,	/* taking long-term ("LT") samples now? */
		lt_rtt_cnt:7,		/* round trips in long-term interval */
		lt_use_bw:1;		/* use lt_bw as our bw estimate? */
	u32	lt_bw;			/* LT est delivery rate in pkts/uS << 24 */
	u32	lt_last_delivered;	/* LT intvl start: tp->delivered */
	u32	lt_last_stamp;		/* LT intvl start: tp->delivered_mstamp */
	u32	lt_last_lost;		/* LT intvl start: tp->lost */
	u32	pacing_gain:10,	/* current gain for setting pacing rate */
		cwnd_gain:10,	/* current gain for setting cwnd */
		full_bw_cnt:3,	/* number of rounds without large bw gains */
		cycle_idx:3,	/* current index in pacing_gain cycle array */
		unused_b:6;
	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
	u32	full_bw;	/* recent bw, to estimate if pipe is full */
};
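
/* Note that struct bbr must fit in the per-connection congestion control
 * scratch space (icsk_ca_priv); bbr_register() below enforces this with a
 * BUILD_BUG_ON() against ICSK_CA_PRIV_SIZE. The 10-bit pacing_gain and
 * cwnd_gain fields hold BBR_SCALE fixed-point values, so the largest
 * representable gain is 1023/256, i.e. just under 4.0x.
 */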

#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */

/* Window length of bw filter (in rounds): */
static const int bbr_bw_rtts = CYCLE_LEN + 2;
/* Window length of min_rtt filter (in sec): */
static const u32 bbr_min_rtt_win_sec = 10;
/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;

/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
 * that will allow a smoothly increasing pacing rate that will double each RTT
 * and send the same number of packets per RTT that an un-paced, slow-starting
 * Reno or CUBIC flow would:
 */
static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
 * the queue created in BBR_STARTUP in a single round:
 */
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
static const int bbr_cwnd_gain  = BBR_UNIT * 2;
/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
static const int bbr_pacing_gain[] = {
	BBR_UNIT * 5 / 4,	/* probe for more available bw */
	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
};
/* Randomize the starting gain cycling phase over N phases: */
static const u32 bbr_cycle_rand = 7;
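
/* Concrete values of the gains above, with BBR_UNIT = 256: bbr_high_gain is
 * 256 * 2885 / 1000 + 1 = 739 (~2.89x), bbr_drain_gain is
 * 256 * 1000 / 2885 = 88 (~0.34x, roughly 1/2.89), and bbr_cwnd_gain is 512
 * (2.0x). The PROBE_BW cycle is {320, 192, 256, 256, 256, 256, 256, 256},
 * i.e. {1.25, 0.75, 1.0 x 6}; its average over the 8 phases is exactly 1.0,
 * so a full cycle paces at the estimated bandwidth on average.
 */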

/* Try to keep at least this many packets in flight, if things go smoothly. For
 * smooth functioning, a sliding window protocol ACKing every other packet
 * needs at least 4 packets in flight:
 */
static const u32 bbr_cwnd_min_target = 4;

/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
/* If bw has increased significantly (1.25x), there may be more bw available: */
static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
static const u32 bbr_full_bw_cnt = 3;

/* "long-term" ("LT") bandwidth estimator parameters... */
/* The minimum number of rounds in an LT bw sampling interval: */
static const u32 bbr_lt_intvl_min_rtts = 4;
/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
static const u32 bbr_lt_loss_thresh = 50;
/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;
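
/* In concrete terms: bbr_lt_loss_thresh = 50 is a BBR_SCALE fraction, so the
 * "lossy" threshold is 50/256 ~= 19.5% (~20%) losses per packet delivered;
 * bbr_lt_bw_ratio = 32 is 32/256 = 1/8; and bbr_lt_bw_diff = 500 is in
 * bytes/sec (i.e. 4000 bits/sec), matching the comparison made in
 * bbr_lt_bw_interval_done() via bbr_rate_bytes_per_sec().
 */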

/* Do we estimate that STARTUP filled the pipe? */
static bool bbr_full_bw_reached(const struct sock *sk)
{
	const struct bbr *bbr = inet_csk_ca(sk);

	return bbr->full_bw_cnt >= bbr_full_bw_cnt;
}

/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
static u32 bbr_max_bw(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return minmax_get(&bbr->bw);
}

/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
static u32 bbr_bw(const struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
}

/* Return rate in bytes per second, optionally with a gain.
 * The order here is chosen carefully to avoid overflow of u64. This should
 * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
 */
static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
	rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
	rate *= gain;
	rate >>= BBR_SCALE;
	rate *= USEC_PER_SEC;
	return rate >> BW_SCALE;
}
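
/* To see the unit conversion above with concrete numbers: a scaled rate of
 * 10 << BW_SCALE (10 packets per usec), MTU-sized packets of 1500 bytes and
 * a gain of BBR_UNIT (1.0) yield 10 * 1500 * 1e6 = 1.5e10 bytes/sec, i.e.
 * 120 Gbit/sec; the gain multiply plus the two shifts cancel the BBR_SCALE
 * and BW_SCALE factors carried by the inputs.
 */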

/* Pace using current bw estimate and a gain factor. In order to help drive the
 * network toward lower queues while maintaining high utilization and low
 * latency, the average pacing rate aims to be slightly (~1%) lower than the
 * estimated bandwidth. This is an important aspect of the design. In this
 * implementation this slightly lower pacing rate is achieved implicitly by not
 * including link-layer headers in the packet size used for the pacing rate.
 */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u64 rate = bw;

	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
	if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate)
		sk->sk_pacing_rate = rate;
}

/* Return count of segments we want in the skbs we send, or 0 for default. */
static u32 bbr_tso_segs_goal(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	return bbr->tso_segs_goal;
}

static void bbr_set_tso_segs_goal(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 min_segs;

	min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
	bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
				 0x7FU);
}
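
/* Note on the threshold above: sk_pacing_rate is in bytes/sec while
 * bbr_min_tso_rate is in bits/sec, so the ">> 3" converts the constant to
 * 1200000 / 8 = 150000 bytes/sec. Below that pacing rate min_segs is 1
 * rather than 2, i.e. single-MSS skbs are acceptable, to keep pacing smooth
 * at low rates.
 */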

/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
static void bbr_save_cwnd(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
}

static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (event == CA_EVENT_TX_START && tp->app_limited) {
		bbr->idle_restart = 1;
		/* Avoid pointless buffer overflows: pace at est. bw if we don't
		 * need more speed (we're restarting from idle and app-limited).
		 */
		if (bbr->mode == BBR_PROBE_BW)
			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
	}
}

/* Find target cwnd. Right-size the cwnd based on min RTT and the
 * estimated bottleneck bandwidth:
 *
 * cwnd = bw * min_rtt * gain = BDP * gain
 *
 * The key factor, gain, controls the amount of queue. While a small gain
 * builds a smaller queue, it becomes more vulnerable to noise in RTT
 * measurements (e.g., delayed ACKs or other ACK compression effects). This
 * noise may cause BBR to under-estimate the rate.
 *
 * To achieve full performance in high-speed paths, we budget enough cwnd to
 * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
 *   - one skb in sending host Qdisc,
 *   - one skb in sending host TSO/GSO engine
 *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
 * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
 * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
 * which allows 2 outstanding 2-packet sequences, to try to keep pipe
 * full even with ACK-every-other-packet delayed ACKs.
 */
static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 cwnd;
	u64 w;

	/* If we've never had a valid RTT sample, cap cwnd at the initial
	 * default. This should only happen when the connection is not using TCP
	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
	 */
	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd */

	w = (u64)bw * bbr->min_rtt_us;

	/* Apply a gain to the given value, then remove the BW_SCALE shift. */
	cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;

	/* Allow enough full-sized skbs in flight to utilize end systems. */
	cwnd += 3 * bbr->tso_segs_goal;

	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
	cwnd = (cwnd + 1) & ~1U;

	return cwnd;
}
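
/* Rough worked example of the computation above: with a bandwidth estimate
 * of 0.1 packets per usec (~1.2 Gbit/sec at 1500 bytes/packet, stored as
 * ~0.1 * 2^24), min_rtt_us = 10000 (10 ms) and gain = 2 * BBR_UNIT, the BDP
 * is 1000 packets and the gain-scaled cwnd comes out to 2000 packets,
 * before adding 3 * tso_segs_goal and rounding up to an even value.
 */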

/* An optimization in BBR to reduce losses: On the first round of recovery, we
 * follow the packet conservation principle: send P packets per P packets acked.
 * After that, we slow-start and send at most 2*P packets per P packets acked.
 * After recovery finishes, or upon undo, we restore the cwnd we had when
 * recovery started (capped by the target cwnd based on estimated BDP).
 *
 * TODO(ycheng/ncardwell): implement a rate-based approach.
 */
static bool bbr_set_cwnd_to_recover_or_restore(
	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
	u32 cwnd = tp->snd_cwnd;

	/* An ACK for P pkts should release at most 2*P packets. We do this
	 * in two steps. First, here we deduct the number of lost packets.
	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
	 */
	if (rs->losses > 0)
		cwnd = max_t(s32, cwnd - rs->losses, 1);

	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
		/* Starting 1st round of Recovery, so do packet conservation. */
		bbr->packet_conservation = 1;
		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
		cwnd = tcp_packets_in_flight(tp) + acked;
	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
		/* Exiting loss recovery; restore cwnd saved before recovery. */
		bbr->restore_cwnd = 1;
		bbr->packet_conservation = 0;
	}
	bbr->prev_ca_state = state;

	if (bbr->restore_cwnd) {
		/* Restore cwnd after exiting loss recovery or PROBE_RTT. */
		cwnd = max(cwnd, bbr->prior_cwnd);
		bbr->restore_cwnd = 0;
	}

	if (bbr->packet_conservation) {
		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
		return true;	/* yes, using packet conservation */
	}
	*new_cwnd = cwnd;
	return false;
}

/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
 * has drawn us down below target), or snap down to target if we're above it.
 */
static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
			 u32 acked, u32 bw, int gain)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 cwnd = 0, target_cwnd = 0;

	if (!acked)
		return;

	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
		goto done;

	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
	target_cwnd = bbr_target_cwnd(sk, bw, gain);
	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
		cwnd = min(cwnd + acked, target_cwnd);
	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
		cwnd = cwnd + acked;
	cwnd = max(cwnd, bbr_cwnd_min_target);

done:
	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
		tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
}
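
/* Growth behavior of bbr_set_cwnd() in concrete terms: once the pipe is
 * estimated to be full, an ACK for N packets can raise cwnd by at most N,
 * capped at target_cwnd (e.g. cwnd 80 with target 100 and acked 10 becomes
 * 90); before then, cwnd grows by the packets acked whenever it is below
 * target_cwnd, which can double the window each round, as in slow start.
 */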

/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
static bool bbr_is_next_cycle_phase(struct sock *sk,
				    const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool is_full_length =
		skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) >
		bbr->min_rtt_us;
	u32 inflight, bw;

	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
	 * use the pipe without increasing the queue.
	 */
	if (bbr->pacing_gain == BBR_UNIT)
		return is_full_length;		/* just use wall clock time */

	inflight = rs->prior_in_flight;  /* what was in-flight before ACK? */
	bw = bbr_max_bw(sk);

	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
	 * small (e.g. on a LAN). We do not persist if packets are lost, since
	 * a path with small buffers may not hold that much.
	 */
	if (bbr->pacing_gain > BBR_UNIT)
		return is_full_length &&
			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
			 inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));

	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
	 * probing didn't find more bw. If inflight falls to match BDP then we
	 * estimate queue is drained; persisting would underutilize the pipe.
	 */
	return is_full_length ||
		inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
}

static void bbr_advance_cycle_phase(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
	bbr->cycle_mstamp = tp->delivered_mstamp;
	bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
}

/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
static void bbr_update_cycle_phase(struct sock *sk,
				   const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
	    bbr_is_next_cycle_phase(sk, rs))
		bbr_advance_cycle_phase(sk);
}

static void bbr_reset_startup_mode(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->mode = BBR_STARTUP;
	bbr->pacing_gain = bbr_high_gain;
	bbr->cwnd_gain = bbr_high_gain;
}

static void bbr_reset_probe_bw_mode(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->mode = BBR_PROBE_BW;
	bbr->pacing_gain = BBR_UNIT;
	bbr->cwnd_gain = bbr_cwnd_gain;
	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
}
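
/* A note on the randomization above: prandom_u32_max(bbr_cycle_rand) returns
 * a value in [0, 6], so cycle_idx is first set to a value in [1, 7] and the
 * immediate bbr_advance_cycle_phase() call moves it to {2..7, 0}. The net
 * effect is that PROBE_BW starts in a randomly chosen phase, but never in
 * the 3/4 (queue-draining) phase at index 1.
 */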

static void bbr_reset_mode(struct sock *sk)
{
	if (!bbr_full_bw_reached(sk))
		bbr_reset_startup_mode(sk);
	else
		bbr_reset_probe_bw_mode(sk);
}

/* Start a new long-term sampling interval. */
static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies;
	bbr->lt_last_delivered = tp->delivered;
	bbr->lt_last_lost = tp->lost;
	bbr->lt_rtt_cnt = 0;
}

/* Completely reset long-term bandwidth sampling. */
static void bbr_reset_lt_bw_sampling(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->lt_bw = 0;
	bbr->lt_use_bw = 0;
	bbr->lt_is_sampling = false;
	bbr_reset_lt_bw_sampling_interval(sk);
}

/* Long-term bw sampling interval is done. Estimate whether we're policed. */
static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 diff;

	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
		/* Is new bw close to the lt_bw from the previous interval? */
		diff = abs(bw - bbr->lt_bw);
		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
		     bbr_lt_bw_diff)) {
			/* All criteria are met; estimate we're policed. */
			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
			bbr->lt_use_bw = 1;
			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
			bbr->lt_rtt_cnt = 0;
			return;
		}
	}
	bbr->lt_bw = bw;
	bbr_reset_lt_bw_sampling_interval(sk);
}
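
/* Example of the "consistent" check above: with lt_bw = 1000 (scaled units)
 * from the previous interval and a new interval bw of 1100, diff = 100 and
 * 100 * BBR_UNIT = 25600 <= bbr_lt_bw_ratio * 1000 = 32000, so the two
 * intervals are deemed consistent and lt_bw becomes their average, 1050.
 */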

/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
 * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
 * explicitly models their policed rate, to reduce unnecessary losses. We
 * estimate that we're policed if we see 2 consecutive sampling intervals with
 * consistent throughput and high packet loss. If we think we're being policed,
 * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
 */
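/* For instance, an interval that delivered 100 packets and lost 30 counts as
 * lossy below, since (30 << BBR_SCALE) = 7680 is not less than
 * bbr_lt_loss_thresh * delivered = 50 * 100 = 5000, while one with 10 losses
 * per 100 delivered (2560 < 5000) does not.
 */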
static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u32 lost, delivered;
	u64 bw;
	s32 t;

	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
			bbr_reset_lt_bw_sampling(sk);	/* stop using lt_bw */
			bbr_reset_probe_bw_mode(sk);	/* restart gain cycling */
		}
		return;
	}

	/* Wait for the first loss before sampling, to let the policer exhaust
	 * its tokens and estimate the steady-state rate allowed by the policer.
	 * Starting samples earlier includes bursts that over-estimate the bw.
	 */
	if (!bbr->lt_is_sampling) {
		if (!rs->losses)
			return;
		bbr_reset_lt_bw_sampling_interval(sk);
		bbr->lt_is_sampling = true;
	}

	/* To avoid underestimates, reset sampling if we run out of data. */
	if (rs->is_app_limited) {
		bbr_reset_lt_bw_sampling(sk);
		return;
	}

	if (bbr->round_start)
		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
		return;		/* sampling interval needs to be longer */
	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
		bbr_reset_lt_bw_sampling(sk);	/* interval is too long */
		return;
	}

	/* End sampling interval when a packet is lost, so we estimate the
	 * policer tokens were exhausted. Stopping the sampling before the
	 * tokens are exhausted under-estimates the policed rate.
	 */
	if (!rs->losses)
		return;

	/* Calculate packets lost and delivered in sampling interval. */
	lost = tp->lost - bbr->lt_last_lost;
	delivered = tp->delivered - bbr->lt_last_delivered;
	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
		return;

	/* Find average delivery rate in this sampling interval. */
	t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp);
	if (t < 1)
		return;		/* interval is less than one jiffy, so wait */
	t = jiffies_to_usecs(t);
	/* Interval long enough for jiffies_to_usecs() to return a bogus 0? */
	if (t < 1) {
		bbr_reset_lt_bw_sampling(sk);	/* interval too long; reset */
		return;
	}
	bw = (u64)delivered * BW_UNIT;
	do_div(bw, t);
	bbr_lt_bw_interval_done(sk, bw);
}
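
/* Putting numbers on the final bw computation above: an interval that
 * delivered 500 packets in 250 ms (250000 usec) yields
 * bw = 500 * BW_UNIT / 250000, i.e. 0.002 packets per usec in scaled form,
 * or roughly 24 Mbit/sec assuming 1500-byte packets.
 */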

/* Estimate the bandwidth based on how fast packets are delivered */
static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;

	bbr->round_start = 0;
	if (rs->delivered < 0 || rs->interval_us <= 0)
		return; /* Not a valid observation */

	/* See if we've reached the next RTT */
	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
		bbr->next_rtt_delivered = tp->delivered;
		bbr->rtt_cnt++;
		bbr->round_start = 1;
		bbr->packet_conservation = 0;
	}

	bbr_lt_bw_sampling(sk, rs);

	/* Divide delivered by the interval to find a (lower bound) bottleneck
	 * bandwidth sample. Delivered is in packets and interval_us in uS and
	 * ratio will be <<1 for most connections. So delivered is first scaled.
	 */
	bw = (u64)rs->delivered * BW_UNIT;
	do_div(bw, rs->interval_us);

	/* If this sample is application-limited, it is likely to have a very
	 * low delivered count that represents application behavior rather than
	 * the available network rate. Such a sample could drag down estimated
	 * bw, causing needless slow-down. Thus, to continue to send at the
	 * last measured network rate, we filter out app-limited samples unless
	 * they describe the path bw at least as well as our bw model.
	 *
	 * So the goal during app-limited phase is to proceed with the best
	 * network rate no matter how long. We automatically leave this
	 * phase when app writes faster than the network can deliver :)
	 */
	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
		/* Incorporate new sample into our max bw filter. */
		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
	}
}

/* Estimate when the pipe is full, using the change in delivery rate: BBR
 * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
 * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
 * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
 * higher rwin, 3: we get higher delivery rate samples. Or transient
 * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
 * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
 */
static void bbr_check_full_bw_reached(struct sock *sk,
				      const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw_thresh;

	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
		return;

	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
	if (bbr_max_bw(sk) >= bw_thresh) {
		bbr->full_bw = bbr_max_bw(sk);
		bbr->full_bw_cnt = 0;
		return;
	}
	++bbr->full_bw_cnt;
}
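
/* Numerically, the "grew by >= 25%" test above works as follows: if full_bw
 * was 1000 (scaled units), bw_thresh = 1000 * 320 >> 8 = 1250, so a new
 * windowed max bw of at least 1250 resets the count, and three consecutive
 * non-app-limited rounds below that threshold make bbr_full_bw_reached()
 * return true.
 */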

/* If pipe is probably full, drain the queue and then enter steady-state. */
static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
		bbr->mode = BBR_DRAIN;	/* drain queue we created */
		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
	}	/* fall through to check if in-flight is already small: */
	if (bbr->mode == BBR_DRAIN &&
	    tcp_packets_in_flight(tcp_sk(sk)) <=
	    bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
}

/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
 * periodically drain the bottleneck queue, to converge to measure the true
 * min_rtt (unloaded propagation delay). This allows the flows to keep queues
 * small (reducing queuing delay and packet loss) and achieve fairness among
 * BBR flows.
 *
 * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
 * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
 * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
 * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
 * re-enter the previous mode. BBR uses 200ms to approximately bound the
 * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
 *
 * Note that flows need only pay 2% if they are busy sending over the last 10
 * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
 * natural silences or low-rate periods within 10 seconds where the rate is low
 * enough for long enough to drain its queue in the bottleneck. We pick up
 * these min RTT measurements opportunistically with our min_rtt filter. :-)
 */
static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	bool filter_expired;

	/* Track min RTT seen in the min_rtt_win_sec filter window: */
	filter_expired = after(tcp_time_stamp,
			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
	if (rs->rtt_us >= 0 &&
	    (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) {
		bbr->min_rtt_us = rs->rtt_us;
		bbr->min_rtt_stamp = tcp_time_stamp;
	}

	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
		bbr->pacing_gain = BBR_UNIT;
		bbr->cwnd_gain = BBR_UNIT;
		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
		bbr->probe_rtt_done_stamp = 0;
	}

	if (bbr->mode == BBR_PROBE_RTT) {
		/* Ignore low rate samples during this mode. */
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
		/* Maintain min packets in flight for max(200 ms, 1 round). */
		if (!bbr->probe_rtt_done_stamp &&
		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
			bbr->probe_rtt_done_stamp = tcp_time_stamp +
				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
			bbr->probe_rtt_round_done = 0;
			bbr->next_rtt_delivered = tp->delivered;
		} else if (bbr->probe_rtt_done_stamp) {
			if (bbr->round_start)
				bbr->probe_rtt_round_done = 1;
			if (bbr->probe_rtt_round_done &&
			    after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) {
				bbr->min_rtt_stamp = tcp_time_stamp;
				bbr->restore_cwnd = 1;  /* snap to prior_cwnd */
				bbr_reset_mode(sk);
			}
		}
	}
	bbr->idle_restart = 0;
}

static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
}

static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw;

	bbr_update_model(sk, rs);

	bw = bbr_bw(sk);
	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
	bbr_set_tso_segs_goal(sk);
	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}
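
/* bbr_main() is the per-ACK control loop sketched in the header comment:
 * first refresh the path model (max bw filter, gain cycle, full-pipe check,
 * drain check, min_rtt filter), then apply it as
 * pacing_rate = pacing_gain * bw and cwnd ~= cwnd_gain * bw * min_rtt.
 */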

static void bbr_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);
	u64 bw;

	bbr->prior_cwnd = 0;
	bbr->tso_segs_goal = 0;	 /* default segs per skb until first ACK */
	bbr->rtt_cnt = 0;
	bbr->next_rtt_delivered = 0;
	bbr->prev_ca_state = TCP_CA_Open;
	bbr->packet_conservation = 0;

	bbr->probe_rtt_done_stamp = 0;
	bbr->probe_rtt_round_done = 0;
	bbr->min_rtt_us = tcp_min_rtt(tp);
	bbr->min_rtt_stamp = tcp_time_stamp;

	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */

	/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
	bw = (u64)tp->snd_cwnd * BW_UNIT;
	do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC);
	sk->sk_pacing_rate = 0;		/* force an update of sk_pacing_rate */
	bbr_set_pacing_rate(sk, bw, bbr_high_gain);

	bbr->restore_cwnd = 0;
	bbr->round_start = 0;
	bbr->idle_restart = 0;
	bbr->full_bw = 0;
	bbr->full_bw_cnt = 0;
	bbr->cycle_mstamp.v64 = 0;
	bbr->cycle_idx = 0;
	bbr_reset_lt_bw_sampling(sk);
	bbr_reset_startup_mode(sk);
}
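
/* Example of the initial pacing rate computed above: with snd_cwnd = 10 and
 * a smoothed RTT of 50 ms (srtt_us >> 3 = 50000), bw is 10 / 50000 packets
 * per usec in scaled form, so assuming ~1500-byte MTU-sized packets the
 * pacing rate is set to roughly 2.89 * 10 * 1500 / 0.05 sec ~= 867 Kbytes/sec
 * (~6.9 Mbit/sec). If there is no RTT sample yet, a 1 ms RTT (USEC_PER_MSEC)
 * is assumed.
 */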

static u32 bbr_sndbuf_expand(struct sock *sk)
{
	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
	return 3;
}

/* In theory BBR does not need to undo the cwnd since it does not
 * always reduce cwnd on losses (see bbr_main()). Keep it for now.
 */
static u32 bbr_undo_cwnd(struct sock *sk)
{
	return tcp_sk(sk)->snd_cwnd;
}

/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
static u32 bbr_ssthresh(struct sock *sk)
{
	bbr_save_cwnd(sk);
	return TCP_INFINITE_SSTHRESH;	 /* BBR does not use ssthresh */
}

static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
			   union tcp_cc_info *info)
{
	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct bbr *bbr = inet_csk_ca(sk);
		u64 bw = bbr_bw(sk);

		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
		memset(&info->bbr, 0, sizeof(info->bbr));
		info->bbr.bbr_bw_lo		= (u32)bw;
		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
		*attr = INET_DIAG_BBRINFO;
		return sizeof(info->bbr);
	}
	return 0;
}

static void bbr_set_state(struct sock *sk, u8 new_state)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (new_state == TCP_CA_Loss) {
		struct rate_sample rs = { .losses = 1 };

		bbr->prev_ca_state = TCP_CA_Loss;
		bbr->full_bw = 0;
		bbr->round_start = 1;	/* treat RTO like end of a round */
		bbr_lt_bw_sampling(sk, &rs);
	}
}

static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "bbr",
	.owner		= THIS_MODULE,
	.init		= bbr_init,
	.cong_control	= bbr_main,
	.sndbuf_expand	= bbr_sndbuf_expand,
	.undo_cwnd	= bbr_undo_cwnd,
	.cwnd_event	= bbr_cwnd_event,
	.ssthresh	= bbr_ssthresh,
	.tso_segs_goal	= bbr_tso_segs_goal,
	.get_info	= bbr_get_info,
	.set_state	= bbr_set_state,
};

static int __init bbr_register(void)
{
	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}

static void __exit bbr_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
}

module_init(bbr_register);
module_exit(bbr_unregister);

MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");