1 /* 2 * CAIA Delay-Gradient (CDG) congestion control 3 * 4 * This implementation is based on the paper: 5 * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using 6 * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. 7 * 8 * Scavenger traffic (Less-than-Best-Effort) should disable coexistence 9 * heuristics using parameters use_shadow=0 and use_ineff=0. 10 * 11 * Parameters window, backoff_beta, and backoff_factor are crucial for 12 * throughput and delay. Future work is needed to determine better defaults, 13 * and to provide guidelines for use in different environments/contexts. 14 * 15 * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. 16 * Parameter window is only configurable when loading tcp_cdg as a module. 17 * 18 * Notable differences from paper/FreeBSD: 19 * o Using Hybrid Slow start and Proportional Rate Reduction. 20 * o Add toggle for shadow window mechanism. Suggested by David Hayes. 21 * o Add toggle for non-congestion loss tolerance. 22 * o Scaling parameter G is changed to a backoff factor; 23 * conversion is given by: backoff_factor = 1000/(G * window). 24 * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. 25 * o More accurate e^-x. 26 */ 27 #include <linux/kernel.h> 28 #include <linux/random.h> 29 #include <linux/module.h> 30 #include <net/tcp.h> 31 32 #define HYSTART_ACK_TRAIN 1 33 #define HYSTART_DELAY 2 34 35 static int window __read_mostly = 8; 36 static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ 37 static unsigned int backoff_factor __read_mostly = 42; 38 static unsigned int hystart_detect __read_mostly = 3; 39 static unsigned int use_ineff __read_mostly = 5; 40 static bool use_shadow __read_mostly = true; 41 static bool use_tolerance __read_mostly; 42 43 module_param(window, int, 0444); 44 MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); 45 module_param(backoff_beta, uint, 0644); 46 MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); 47 module_param(backoff_factor, uint, 0644); 48 MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); 49 module_param(hystart_detect, uint, 0644); 50 MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " 51 "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); 52 module_param(use_ineff, uint, 0644); 53 MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); 54 module_param(use_shadow, bool, 0644); 55 MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); 56 module_param(use_tolerance, bool, 0644); 57 MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); 58 59 struct minmax { 60 union { 61 struct { 62 s32 min; 63 s32 max; 64 }; 65 u64 v64; 66 }; 67 }; 68 69 enum cdg_state { 70 CDG_UNKNOWN = 0, 71 CDG_NONFULL = 1, 72 CDG_FULL = 2, 73 CDG_BACKOFF = 3, 74 }; 75 76 struct cdg { 77 struct minmax rtt; 78 struct minmax rtt_prev; 79 struct minmax *gradients; 80 struct minmax gsum; 81 bool gfilled; 82 u8 tail; 83 u8 state; 84 u8 delack; 85 u32 rtt_seq; 86 u32 undo_cwnd; 87 u32 shadow_wnd; 88 u16 backoff_cnt; 89 u16 sample_cnt; 90 s32 delay_min; 91 u32 last_ack; 92 u32 round_start; 93 }; 94 95 /** 96 * nexp_u32 - negative base-e exponential 97 * @ux: x in units of micro 98 * 99 * Returns exp(ux * -1e-6) * U32_MAX. 100 */ 101 static u32 __pure nexp_u32(u32 ux) 102 { 103 static const u16 v[] = { 104 /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ 105 65535, 106 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, 107 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, 108 }; 109 u32 msb = ux >> 8; 110 u32 res; 111 int i; 112 113 /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ 114 if (msb > U16_MAX) 115 return 0; 116 117 /* Scale first eight bits linearly: */ 118 res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); 119 120 /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ 121 for (i = 1; msb; i++, msb >>= 1) { 122 u32 y = v[i & -(msb & 1)] + U32_C(1); 123 124 res = ((u64)res * y) >> 16; 125 } 126 127 return res; 128 } 129 130 /* Based on the HyStart algorithm (by Ha et al.) that is implemented in 131 * tcp_cubic. Differences/experimental changes: 132 * o Using Hayes' delayed ACK filter. 133 * o Using a usec clock for the ACK train. 134 * o Reset ACK train when application limited. 135 * o Invoked at any cwnd (i.e. also when cwnd < 16). 136 * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). 137 */ 138 static void tcp_cdg_hystart_update(struct sock *sk) 139 { 140 struct cdg *ca = inet_csk_ca(sk); 141 struct tcp_sock *tp = tcp_sk(sk); 142 143 ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); 144 if (ca->delay_min == 0) 145 return; 146 147 if (hystart_detect & HYSTART_ACK_TRAIN) { 148 u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); 149 150 if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { 151 ca->last_ack = now_us; 152 ca->round_start = now_us; 153 } else if (before(now_us, ca->last_ack + 3000)) { 154 u32 base_owd = max(ca->delay_min / 2U, 125U); 155 156 ca->last_ack = now_us; 157 if (after(now_us, ca->round_start + base_owd)) { 158 NET_INC_STATS(sock_net(sk), 159 LINUX_MIB_TCPHYSTARTTRAINDETECT); 160 NET_ADD_STATS(sock_net(sk), 161 LINUX_MIB_TCPHYSTARTTRAINCWND, 162 tp->snd_cwnd); 163 tp->snd_ssthresh = tp->snd_cwnd; 164 return; 165 } 166 } 167 } 168 169 if (hystart_detect & HYSTART_DELAY) { 170 if (ca->sample_cnt < 8) { 171 ca->sample_cnt++; 172 } else { 173 s32 thresh = max(ca->delay_min + ca->delay_min / 8U, 174 125U); 175 176 if (ca->rtt.min > thresh) { 177 NET_INC_STATS(sock_net(sk), 178 LINUX_MIB_TCPHYSTARTDELAYDETECT); 179 NET_ADD_STATS(sock_net(sk), 180 LINUX_MIB_TCPHYSTARTDELAYCWND, 181 tp->snd_cwnd); 182 tp->snd_ssthresh = tp->snd_cwnd; 183 } 184 } 185 } 186 } 187 188 static s32 tcp_cdg_grad(struct cdg *ca) 189 { 190 s32 gmin = ca->rtt.min - ca->rtt_prev.min; 191 s32 gmax = ca->rtt.max - ca->rtt_prev.max; 192 s32 grad; 193 194 if (ca->gradients) { 195 ca->gsum.min += gmin - ca->gradients[ca->tail].min; 196 ca->gsum.max += gmax - ca->gradients[ca->tail].max; 197 ca->gradients[ca->tail].min = gmin; 198 ca->gradients[ca->tail].max = gmax; 199 ca->tail = (ca->tail + 1) & (window - 1); 200 gmin = ca->gsum.min; 201 gmax = ca->gsum.max; 202 } 203 204 /* We keep sums to ignore gradients during cwnd reductions; 205 * the paper's smoothed gradients otherwise simplify to: 206 * (rtt_latest - rtt_oldest) / window. 207 * 208 * We also drop division by window here. 209 */ 210 grad = gmin > 0 ? gmin : gmax; 211 212 /* Extrapolate missing values in gradient window: */ 213 if (!ca->gfilled) { 214 if (!ca->gradients && window > 1) 215 grad *= window; /* Memory allocation failed. */ 216 else if (ca->tail == 0) 217 ca->gfilled = true; 218 else 219 grad = (grad * window) / (int)ca->tail; 220 } 221 222 /* Backoff was effectual: */ 223 if (gmin <= -32 || gmax <= -32) 224 ca->backoff_cnt = 0; 225 226 if (use_tolerance) { 227 /* Reduce small variations to zero: */ 228 gmin = DIV_ROUND_CLOSEST(gmin, 64); 229 gmax = DIV_ROUND_CLOSEST(gmax, 64); 230 231 if (gmin > 0 && gmax <= 0) 232 ca->state = CDG_FULL; 233 else if ((gmin > 0 && gmax > 0) || gmax < 0) 234 ca->state = CDG_NONFULL; 235 } 236 return grad; 237 } 238 239 static bool tcp_cdg_backoff(struct sock *sk, u32 grad) 240 { 241 struct cdg *ca = inet_csk_ca(sk); 242 struct tcp_sock *tp = tcp_sk(sk); 243 244 if (prandom_u32() <= nexp_u32(grad * backoff_factor)) 245 return false; 246 247 if (use_ineff) { 248 ca->backoff_cnt++; 249 if (ca->backoff_cnt > use_ineff) 250 return false; 251 } 252 253 ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); 254 ca->state = CDG_BACKOFF; 255 tcp_enter_cwr(sk); 256 return true; 257 } 258 259 /* Not called in CWR or Recovery state. */ 260 static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) 261 { 262 struct cdg *ca = inet_csk_ca(sk); 263 struct tcp_sock *tp = tcp_sk(sk); 264 u32 prior_snd_cwnd; 265 u32 incr; 266 267 if (tcp_in_slow_start(tp) && hystart_detect) 268 tcp_cdg_hystart_update(sk); 269 270 if (after(ack, ca->rtt_seq) && ca->rtt.v64) { 271 s32 grad = 0; 272 273 if (ca->rtt_prev.v64) 274 grad = tcp_cdg_grad(ca); 275 ca->rtt_seq = tp->snd_nxt; 276 ca->rtt_prev = ca->rtt; 277 ca->rtt.v64 = 0; 278 ca->last_ack = 0; 279 ca->sample_cnt = 0; 280 281 if (grad > 0 && tcp_cdg_backoff(sk, grad)) 282 return; 283 } 284 285 if (!tcp_is_cwnd_limited(sk)) { 286 ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); 287 return; 288 } 289 290 prior_snd_cwnd = tp->snd_cwnd; 291 tcp_reno_cong_avoid(sk, ack, acked); 292 293 incr = tp->snd_cwnd - prior_snd_cwnd; 294 ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); 295 } 296 297 static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample) 298 { 299 struct cdg *ca = inet_csk_ca(sk); 300 struct tcp_sock *tp = tcp_sk(sk); 301 302 if (sample->rtt_us <= 0) 303 return; 304 305 /* A heuristic for filtering delayed ACKs, adapted from: 306 * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support 307 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. 308 */ 309 if (tp->sacked_out == 0) { 310 if (sample->pkts_acked == 1 && ca->delack) { 311 /* A delayed ACK is only used for the minimum if it is 312 * provenly lower than an existing non-zero minimum. 313 */ 314 ca->rtt.min = min(ca->rtt.min, sample->rtt_us); 315 ca->delack--; 316 return; 317 } else if (sample->pkts_acked > 1 && ca->delack < 5) { 318 ca->delack++; 319 } 320 } 321 322 ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us); 323 ca->rtt.max = max(ca->rtt.max, sample->rtt_us); 324 } 325 326 static u32 tcp_cdg_ssthresh(struct sock *sk) 327 { 328 struct cdg *ca = inet_csk_ca(sk); 329 struct tcp_sock *tp = tcp_sk(sk); 330 331 ca->undo_cwnd = tp->snd_cwnd; 332 333 if (ca->state == CDG_BACKOFF) 334 return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); 335 336 if (ca->state == CDG_NONFULL && use_tolerance) 337 return tp->snd_cwnd; 338 339 ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); 340 if (use_shadow) 341 return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); 342 return max(2U, tp->snd_cwnd >> 1); 343 } 344 345 static u32 tcp_cdg_undo_cwnd(struct sock *sk) 346 { 347 struct cdg *ca = inet_csk_ca(sk); 348 349 return max(tcp_sk(sk)->snd_cwnd, ca->undo_cwnd); 350 } 351 352 static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) 353 { 354 struct cdg *ca = inet_csk_ca(sk); 355 struct tcp_sock *tp = tcp_sk(sk); 356 struct minmax *gradients; 357 358 switch (ev) { 359 case CA_EVENT_CWND_RESTART: 360 gradients = ca->gradients; 361 if (gradients) 362 memset(gradients, 0, window * sizeof(gradients[0])); 363 memset(ca, 0, sizeof(*ca)); 364 365 ca->gradients = gradients; 366 ca->rtt_seq = tp->snd_nxt; 367 ca->shadow_wnd = tp->snd_cwnd; 368 break; 369 case CA_EVENT_COMPLETE_CWR: 370 ca->state = CDG_UNKNOWN; 371 ca->rtt_seq = tp->snd_nxt; 372 ca->rtt_prev = ca->rtt; 373 ca->rtt.v64 = 0; 374 break; 375 default: 376 break; 377 } 378 } 379 380 static void tcp_cdg_init(struct sock *sk) 381 { 382 struct cdg *ca = inet_csk_ca(sk); 383 struct tcp_sock *tp = tcp_sk(sk); 384 385 /* We silently fall back to window = 1 if allocation fails. */ 386 if (window > 1) 387 ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), 388 GFP_NOWAIT | __GFP_NOWARN); 389 ca->rtt_seq = tp->snd_nxt; 390 ca->shadow_wnd = tp->snd_cwnd; 391 } 392 393 static void tcp_cdg_release(struct sock *sk) 394 { 395 struct cdg *ca = inet_csk_ca(sk); 396 397 kfree(ca->gradients); 398 } 399 400 struct tcp_congestion_ops tcp_cdg __read_mostly = { 401 .cong_avoid = tcp_cdg_cong_avoid, 402 .cwnd_event = tcp_cdg_cwnd_event, 403 .pkts_acked = tcp_cdg_acked, 404 .undo_cwnd = tcp_cdg_undo_cwnd, 405 .ssthresh = tcp_cdg_ssthresh, 406 .release = tcp_cdg_release, 407 .init = tcp_cdg_init, 408 .owner = THIS_MODULE, 409 .name = "cdg", 410 }; 411 412 static int __init tcp_cdg_register(void) 413 { 414 if (backoff_beta > 1024 || window < 1 || window > 256) 415 return -ERANGE; 416 if (!is_power_of_2(window)) 417 return -EINVAL; 418 419 BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); 420 tcp_register_congestion_control(&tcp_cdg); 421 return 0; 422 } 423 424 static void __exit tcp_cdg_unregister(void) 425 { 426 tcp_unregister_congestion_control(&tcp_cdg); 427 } 428 429 module_init(tcp_cdg_register); 430 module_exit(tcp_cdg_unregister); 431 MODULE_AUTHOR("Kenneth Klette Jonassen"); 432 MODULE_LICENSE("GPL"); 433 MODULE_DESCRIPTION("TCP CDG"); 434