1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright (c) 2019 Facebook 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of version 2 of the GNU General Public 6 * License as published by the Free Software Foundation. 7 * 8 * Example program for Host Bandwidth Managment 9 * 10 * This program loads a cgroup skb BPF program to enforce cgroup output 11 * (egress) or input (ingress) bandwidth limits. 12 * 13 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] 14 * Where: 15 * -d Print BPF trace debug buffer 16 * -l Also limit flows doing loopback 17 * -n <#> To create cgroup \"/hbm#\" and attach prog 18 * Default is /hbm1 19 * --no_cn Do not return cn notifications 20 * -r <rate> Rate limit in Mbps 21 * -s Get HBM stats (marked, dropped, etc.) 22 * -t <time> Exit after specified seconds (default is 0) 23 * -w Work conserving flag. cgroup can increase its bandwidth 24 * beyond the rate limit specified while there is available 25 * bandwidth. Current implementation assumes there is only 26 * NIC (eth0), but can be extended to support multiple NICs. 27 * Currrently only supported for egress. 28 * -h Print this info 29 * prog BPF program file name. Name defaults to hbm_out_kern.o 30 */ 31 32 #define _GNU_SOURCE 33 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <assert.h> 37 #include <sys/resource.h> 38 #include <sys/time.h> 39 #include <unistd.h> 40 #include <errno.h> 41 #include <fcntl.h> 42 #include <linux/unistd.h> 43 44 #include <linux/bpf.h> 45 #include <bpf/bpf.h> 46 #include <getopt.h> 47 48 #include "bpf_load.h" 49 #include "bpf_rlimit.h" 50 #include "cgroup_helpers.h" 51 #include "hbm.h" 52 #include "bpf_util.h" 53 #include <bpf/bpf.h> 54 #include <bpf/libbpf.h> 55 56 bool outFlag = true; 57 int minRate = 1000; /* cgroup rate limit in Mbps */ 58 int rate = 1000; /* can grow if rate conserving is enabled */ 59 int dur = 1; 60 bool stats_flag; 61 bool loopback_flag; 62 bool debugFlag; 63 bool work_conserving_flag; 64 bool no_cn_flag; 65 bool edt_flag; 66 67 static void Usage(void); 68 static void read_trace_pipe2(void); 69 static void do_error(char *msg, bool errno_flag); 70 71 #define DEBUGFS "/sys/kernel/debug/tracing/" 72 73 struct bpf_object *obj; 74 int bpfprog_fd; 75 int cgroup_storage_fd; 76 77 static void read_trace_pipe2(void) 78 { 79 int trace_fd; 80 FILE *outf; 81 char *outFname = "hbm_out.log"; 82 83 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); 84 if (trace_fd < 0) { 85 printf("Error opening trace_pipe\n"); 86 return; 87 } 88 89 // Future support of ingress 90 // if (!outFlag) 91 // outFname = "hbm_in.log"; 92 outf = fopen(outFname, "w"); 93 94 if (outf == NULL) 95 printf("Error creating %s\n", outFname); 96 97 while (1) { 98 static char buf[4097]; 99 ssize_t sz; 100 101 sz = read(trace_fd, buf, sizeof(buf) - 1); 102 if (sz > 0) { 103 buf[sz] = 0; 104 puts(buf); 105 if (outf != NULL) { 106 fprintf(outf, "%s\n", buf); 107 fflush(outf); 108 } 109 } 110 } 111 } 112 113 static void do_error(char *msg, bool errno_flag) 114 { 115 if (errno_flag) 116 printf("ERROR: %s, errno: %d\n", msg, errno); 117 else 118 printf("ERROR: %s\n", msg); 119 exit(1); 120 } 121 122 static int prog_load(char *prog) 123 { 124 struct bpf_prog_load_attr prog_load_attr = { 125 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 126 .file = prog, 127 .expected_attach_type = BPF_CGROUP_INET_EGRESS, 128 }; 129 int map_fd; 130 struct bpf_map *map; 131 132 int ret = 0; 133 134 if (access(prog, O_RDONLY) < 0) { 135 printf("Error accessing file %s: %s\n", prog, strerror(errno)); 136 return 1; 137 } 138 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) 139 ret = 1; 140 if (!ret) { 141 map = bpf_object__find_map_by_name(obj, "queue_stats"); 142 map_fd = bpf_map__fd(map); 143 if (map_fd < 0) { 144 printf("Map not found: %s\n", strerror(map_fd)); 145 ret = 1; 146 } 147 } 148 149 if (ret) { 150 printf("ERROR: bpf_prog_load_xattr failed for: %s\n", prog); 151 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); 152 ret = -1; 153 } else { 154 ret = map_fd; 155 } 156 157 return ret; 158 } 159 160 static int run_bpf_prog(char *prog, int cg_id) 161 { 162 int map_fd; 163 int rc = 0; 164 int key = 0; 165 int cg1 = 0; 166 int type = BPF_CGROUP_INET_EGRESS; 167 char cg_dir[100]; 168 struct hbm_queue_stats qstats = {0}; 169 170 sprintf(cg_dir, "/hbm%d", cg_id); 171 map_fd = prog_load(prog); 172 if (map_fd == -1) 173 return 1; 174 175 if (setup_cgroup_environment()) { 176 printf("ERROR: setting cgroup environment\n"); 177 goto err; 178 } 179 cg1 = create_and_get_cgroup(cg_dir); 180 if (!cg1) { 181 printf("ERROR: create_and_get_cgroup\n"); 182 goto err; 183 } 184 if (join_cgroup(cg_dir)) { 185 printf("ERROR: join_cgroup\n"); 186 goto err; 187 } 188 189 qstats.rate = rate; 190 qstats.stats = stats_flag ? 1 : 0; 191 qstats.loopback = loopback_flag ? 1 : 0; 192 qstats.no_cn = no_cn_flag ? 1 : 0; 193 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { 194 printf("ERROR: Could not update map element\n"); 195 goto err; 196 } 197 198 if (!outFlag) 199 type = BPF_CGROUP_INET_INGRESS; 200 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { 201 printf("ERROR: bpf_prog_attach fails!\n"); 202 log_err("Attaching prog"); 203 goto err; 204 } 205 206 if (work_conserving_flag) { 207 struct timeval t0, t_last, t_new; 208 FILE *fin; 209 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; 210 signed long long last_cg_tx_bytes, new_cg_tx_bytes; 211 signed long long delta_time, delta_bytes, delta_rate; 212 int delta_ms; 213 #define DELTA_RATE_CHECK 10000 /* in us */ 214 #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ 215 216 bpf_map_lookup_elem(map_fd, &key, &qstats); 217 if (gettimeofday(&t0, NULL) < 0) 218 do_error("gettimeofday failed", true); 219 t_last = t0; 220 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); 221 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) 222 do_error("fscanf fails", false); 223 fclose(fin); 224 last_cg_tx_bytes = qstats.bytes_total; 225 while (true) { 226 usleep(DELTA_RATE_CHECK); 227 if (gettimeofday(&t_new, NULL) < 0) 228 do_error("gettimeofday failed", true); 229 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + 230 (t_new.tv_usec - t0.tv_usec)/1000; 231 if (delta_ms > dur * 1000) 232 break; 233 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + 234 (t_new.tv_usec - t_last.tv_usec); 235 if (delta_time == 0) 236 continue; 237 t_last = t_new; 238 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", 239 "r"); 240 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) 241 do_error("fscanf fails", false); 242 fclose(fin); 243 printf(" new_eth_tx_bytes:%llu\n", 244 new_eth_tx_bytes); 245 bpf_map_lookup_elem(map_fd, &key, &qstats); 246 new_cg_tx_bytes = qstats.bytes_total; 247 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; 248 last_eth_tx_bytes = new_eth_tx_bytes; 249 delta_rate = (delta_bytes * 8000000) / delta_time; 250 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", 251 delta_ms, delta_rate/1000000000.0, 252 rate/1000.0); 253 if (delta_rate < RATE_THRESHOLD) { 254 /* can increase cgroup rate limit, but first 255 * check if we are using the current limit. 256 * Currently increasing by 6.25%, unknown 257 * if that is the optimal rate. 258 */ 259 int rate_diff100; 260 261 delta_bytes = new_cg_tx_bytes - 262 last_cg_tx_bytes; 263 last_cg_tx_bytes = new_cg_tx_bytes; 264 delta_rate = (delta_bytes * 8000000) / 265 delta_time; 266 printf(" rate:%.3fGbps", 267 delta_rate/1000000000.0); 268 rate_diff100 = (((long long)rate)*1000000 - 269 delta_rate) * 100 / 270 (((long long) rate) * 1000000); 271 printf(" rdiff:%d", rate_diff100); 272 if (rate_diff100 <= 3) { 273 rate += (rate >> 4); 274 if (rate > RATE_THRESHOLD / 1000000) 275 rate = RATE_THRESHOLD / 1000000; 276 qstats.rate = rate; 277 printf(" INC\n"); 278 } else { 279 printf("\n"); 280 } 281 } else { 282 /* Need to decrease cgroup rate limit. 283 * Currently decreasing by 12.5%, unknown 284 * if that is optimal 285 */ 286 printf(" DEC\n"); 287 rate -= (rate >> 3); 288 if (rate < minRate) 289 rate = minRate; 290 qstats.rate = rate; 291 } 292 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) 293 do_error("update map element fails", false); 294 } 295 } else { 296 sleep(dur); 297 } 298 // Get stats! 299 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { 300 char fname[100]; 301 FILE *fout; 302 303 if (!outFlag) 304 sprintf(fname, "hbm.%d.in", cg_id); 305 else 306 sprintf(fname, "hbm.%d.out", cg_id); 307 fout = fopen(fname, "w"); 308 fprintf(fout, "id:%d\n", cg_id); 309 fprintf(fout, "ERROR: Could not lookup queue_stats\n"); 310 } else if (stats_flag && qstats.lastPacketTime > 311 qstats.firstPacketTime) { 312 long long delta_us = (qstats.lastPacketTime - 313 qstats.firstPacketTime)/1000; 314 unsigned int rate_mbps = ((qstats.bytes_total - 315 qstats.bytes_dropped) * 8 / 316 delta_us); 317 double percent_pkts, percent_bytes; 318 char fname[100]; 319 FILE *fout; 320 int k; 321 static const char *returnValNames[] = { 322 "DROP_PKT", 323 "ALLOW_PKT", 324 "DROP_PKT_CWR", 325 "ALLOW_PKT_CWR" 326 }; 327 #define RET_VAL_COUNT 4 328 329 // Future support of ingress 330 // if (!outFlag) 331 // sprintf(fname, "hbm.%d.in", cg_id); 332 // else 333 sprintf(fname, "hbm.%d.out", cg_id); 334 fout = fopen(fname, "w"); 335 fprintf(fout, "id:%d\n", cg_id); 336 fprintf(fout, "rate_mbps:%d\n", rate_mbps); 337 fprintf(fout, "duration:%.1f secs\n", 338 (qstats.lastPacketTime - qstats.firstPacketTime) / 339 1000000000.0); 340 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); 341 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / 342 1000000)); 343 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); 344 fprintf(fout, "bytes_dropped_MB:%d\n", 345 (int)(qstats.bytes_dropped / 346 1000000)); 347 // Marked Pkts and Bytes 348 percent_pkts = (qstats.pkts_marked * 100.0) / 349 (qstats.pkts_total + 1); 350 percent_bytes = (qstats.bytes_marked * 100.0) / 351 (qstats.bytes_total + 1); 352 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); 353 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); 354 355 // Dropped Pkts and Bytes 356 percent_pkts = (qstats.pkts_dropped * 100.0) / 357 (qstats.pkts_total + 1); 358 percent_bytes = (qstats.bytes_dropped * 100.0) / 359 (qstats.bytes_total + 1); 360 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); 361 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); 362 363 // ECN CE markings 364 percent_pkts = (qstats.pkts_ecn_ce * 100.0) / 365 (qstats.pkts_total + 1); 366 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, 367 (int)qstats.pkts_ecn_ce); 368 369 // Average cwnd 370 fprintf(fout, "avg cwnd:%d\n", 371 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); 372 // Average rtt 373 fprintf(fout, "avg rtt:%d\n", 374 (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); 375 // Average credit 376 if (edt_flag) 377 fprintf(fout, "avg credit_ms:%.03f\n", 378 (qstats.sum_credit / 379 (qstats.pkts_total + 1.0)) / 1000000.0); 380 else 381 fprintf(fout, "avg credit:%d\n", 382 (int)(qstats.sum_credit / 383 (1500 * ((int)qstats.pkts_total ) + 1))); 384 385 // Return values stats 386 for (k = 0; k < RET_VAL_COUNT; k++) { 387 percent_pkts = (qstats.returnValCount[k] * 100.0) / 388 (qstats.pkts_total + 1); 389 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], 390 percent_pkts, (int)qstats.returnValCount[k]); 391 } 392 fclose(fout); 393 } 394 395 if (debugFlag) 396 read_trace_pipe2(); 397 return rc; 398 err: 399 rc = 1; 400 401 if (cg1) 402 close(cg1); 403 cleanup_cgroup_environment(); 404 405 return rc; 406 } 407 408 static void Usage(void) 409 { 410 printf("This program loads a cgroup skb BPF program to enforce\n" 411 "cgroup output (egress) bandwidth limits.\n\n" 412 "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" 413 " [-s] [-t <secs>] [-w] [-h] [prog]\n" 414 " Where:\n" 415 " -o indicates egress direction (default)\n" 416 " -d print BPF trace debug buffer\n" 417 " --edt use fq's Earliest Departure Time\n" 418 " -l also limit flows using loopback\n" 419 " -n <#> to create cgroup \"/hbm#\" and attach prog\n" 420 " Default is /hbm1\n" 421 " --no_cn disable CN notifications\n" 422 " -r <rate> Rate in Mbps\n" 423 " -s Update HBM stats\n" 424 " -t <time> Exit after specified seconds (default is 0)\n" 425 " -w Work conserving flag. cgroup can increase\n" 426 " bandwidth beyond the rate limit specified\n" 427 " while there is available bandwidth. Current\n" 428 " implementation assumes there is only eth0\n" 429 " but can be extended to support multiple NICs\n" 430 " -h print this info\n" 431 " prog BPF program file name. Name defaults to\n" 432 " hbm_out_kern.o\n"); 433 } 434 435 int main(int argc, char **argv) 436 { 437 char *prog = "hbm_out_kern.o"; 438 int k; 439 int cg_id = 1; 440 char *optstring = "iodln:r:st:wh"; 441 struct option loptions[] = { 442 {"no_cn", 0, NULL, 1}, 443 {"edt", 0, NULL, 2}, 444 {NULL, 0, NULL, 0} 445 }; 446 447 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { 448 switch (k) { 449 case 1: 450 no_cn_flag = true; 451 break; 452 case 2: 453 prog = "hbm_edt_kern.o"; 454 edt_flag = true; 455 break; 456 case'o': 457 break; 458 case 'd': 459 debugFlag = true; 460 break; 461 case 'l': 462 loopback_flag = true; 463 break; 464 case 'n': 465 cg_id = atoi(optarg); 466 break; 467 case 'r': 468 minRate = atoi(optarg) * 1.024; 469 rate = minRate; 470 break; 471 case 's': 472 stats_flag = true; 473 break; 474 case 't': 475 dur = atoi(optarg); 476 break; 477 case 'w': 478 work_conserving_flag = true; 479 break; 480 case '?': 481 if (optopt == 'n' || optopt == 'r' || optopt == 't') 482 fprintf(stderr, 483 "Option -%c requires an argument.\n\n", 484 optopt); 485 case 'h': 486 fallthrough; 487 default: 488 Usage(); 489 return 0; 490 } 491 } 492 493 if (optind < argc) 494 prog = argv[optind]; 495 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); 496 497 return run_bpf_prog(prog, cg_id); 498 } 499