// SPDX-License-Identifier: GPL-2.0
/* -*- linux-c -*-
 * sysctl_net_core.c: sysctl interface to net core subsystem.
 *
 * Begun April 1, 1996, Mike Shaver.
 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
 */

#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>

/* Shared bounds referenced via ->extra1/->extra2 in the tables below. */
static int zero = 0;
static int one = 1;
static int two __maybe_unused = 2;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;

static int net_msg_warn;	/* Unused, but still a sysctl */

int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);

#ifdef CONFIG_RPS
/*
 * Handler for /proc/sys/net/core/rps_sock_flow_entries.
 *
 * Read: reports the current RFS sock flow table size (mask + 1, or 0 if
 * no table is installed).
 *
 * Write: resizes the global rps_sock_flow_table.  The requested entry
 * count is rounded up to a power of two, a new table is vmalloc'ed and
 * published with rcu_assign_pointer(), and the old table is freed only
 * after synchronize_rcu().  The rps/rfs static keys are incremented while
 * a table is installed and decremented when it is removed, so the
 * fast-path hooks are only enabled when needed.  Writing 0 removes the
 * table entirely.
 */
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
{
	unsigned int orig_size, size;
	int ret, i;
	/* Proxy table so proc_dointvec() reads/writes the local 'size',
	 * never the live table, until the new value is validated. */
	struct ctl_table tmp = {
		.data = &size,
		.maxlen = sizeof(size),
		.mode = table->mode
	};
	struct rps_sock_flow_table *orig_sock_table, *sock_table;
	/* Serializes writers and protects rps_sock_flow_table updates. */
	static DEFINE_MUTEX(sock_flow_mutex);

	mutex_lock(&sock_flow_mutex);

	orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
						    lockdep_is_held(&sock_flow_mutex));
	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;

	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);

	if (write) {
		if (size) {
			if (size > 1<<29) {
				/* Enforce limit to prevent overflow */
				mutex_unlock(&sock_flow_mutex);
				return -EINVAL;
			}
			size = roundup_pow_of_two(size);
			if (size != orig_size) {
				sock_table =
				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
				if (!sock_table) {
					mutex_unlock(&sock_flow_mutex);
					return -ENOMEM;
				}
				rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
				sock_table->mask = size - 1;
			} else
				sock_table = orig_sock_table;

			/* (Re)initialize every flow entry to "no CPU
			 * assigned" — done even when the size is unchanged,
			 * so a rewrite of the same value flushes the table. */
			for (i = 0; i < size; i++)
				sock_table->ents[i] = RPS_NO_CPU;
		} else
			sock_table = NULL;

		if (sock_table != orig_sock_table) {
			/* Publish the new table before adjusting the static
			 * keys; readers see either the old or new table. */
			rcu_assign_pointer(rps_sock_flow_table, sock_table);
			if (sock_table) {
				static_key_slow_inc(&rps_needed);
				static_key_slow_inc(&rfs_needed);
			}
			if (orig_sock_table) {
				static_key_slow_dec(&rps_needed);
				static_key_slow_dec(&rfs_needed);
				/* Wait out readers of the old table before
				 * freeing it. */
				synchronize_rcu();
				vfree(orig_sock_table);
			}
		}
	}

	mutex_unlock(&sock_flow_mutex);

	return ret;
}
#endif /* CONFIG_RPS */

#ifdef CONFIG_NET_FLOW_LIMIT
/* Serializes updates to the per-cpu sd->flow_limit state below. */
static DEFINE_MUTEX(flow_limit_update_mutex);

/*
 * Handler for /proc/sys/net/core/flow_limit_cpu_bitmap.
 *
 * Write: parses a cpumask from userspace and makes the set of CPUs with
 * an allocated sd->flow_limit table match it exactly — freeing tables on
 * CPUs that left the mask (after an RCU grace period) and allocating
 * node-local tables on CPUs that joined it.
 *
 * Read: reports the mask of CPUs that currently have a flow_limit table.
 */
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
				 void __user *buffer, size_t *lenp,
				 loff_t *ppos)
{
	struct sd_flow_limit *cur;
	struct softnet_data *sd;
	cpumask_var_t mask;
	int i, len, ret = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	if (write) {
		ret = cpumask_parse_user(buffer, *lenp, mask);
		if (ret)
			goto done;

		mutex_lock(&flow_limit_update_mutex);
		/* Table length is read once under the mutex so all CPUs
		 * allocated in this pass get the same size. */
		len = sizeof(*cur) + netdev_flow_limit_table_len;
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			cur = rcu_dereference_protected(sd->flow_limit,
							lockdep_is_held(&flow_limit_update_mutex));
			if (cur && !cpumask_test_cpu(i, mask)) {
				/* CPU dropped from the mask: unpublish, then
				 * free after readers are done. */
				RCU_INIT_POINTER(sd->flow_limit, NULL);
				synchronize_rcu();
				kfree(cur);
			} else if (!cur && cpumask_test_cpu(i, mask)) {
				cur = kzalloc_node(len, GFP_KERNEL,
						   cpu_to_node(i));
				if (!cur) {
					/* not unwinding previous changes */
					ret = -ENOMEM;
					goto write_unlock;
				}
				cur->num_buckets = netdev_flow_limit_table_len;
				rcu_assign_pointer(sd->flow_limit, cur);
			}
		}
write_unlock:
		mutex_unlock(&flow_limit_update_mutex);
	} else {
		char kbuf[128];

		/* Whole mask is emitted in one read; subsequent reads at a
		 * non-zero offset return EOF. */
		if (*ppos || !*lenp) {
			*lenp = 0;
			goto done;
		}

		cpumask_clear(mask);
		rcu_read_lock();
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			if (rcu_dereference(sd->flow_limit))
				cpumask_set_cpu(i, mask);
		}
		rcu_read_unlock();

		len = min(sizeof(kbuf) - 1, *lenp);
		len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
		if (!len) {
			*lenp = 0;
			goto done;
		}
		if (len < *lenp)
			kbuf[len++] = '\n';
		if (copy_to_user(buffer, kbuf, len)) {
			ret = -EFAULT;
			goto done;
		}
		*lenp = len;
		*ppos += len;
	}

done:
	free_cpumask_var(mask);
	return ret;
}

/*
 * Handler for /proc/sys/net/core/flow_limit_table_len.  Only power-of-two
 * lengths are accepted; anything else restores the previous value and
 * returns -EINVAL.  Taken under flow_limit_update_mutex so the length
 * cannot change while flow_limit_cpu_sysctl() is sizing allocations.
 */
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
				       void __user *buffer, size_t *lenp,
				       loff_t *ppos)
{
	unsigned int old, *ptr;
	int ret;

	mutex_lock(&flow_limit_update_mutex);

	ptr = table->data;
	old = *ptr;
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (!ret && write && !is_power_of_2(*ptr)) {
		*ptr = old;
		ret = -EINVAL;
	}

	mutex_unlock(&flow_limit_update_mutex);
	return ret;
}
#endif /* CONFIG_NET_FLOW_LIMIT */

#ifdef CONFIG_NET_SCHED
/*
 * Handler for /proc/sys/net/core/default_qdisc: reads return the name of
 * the current default qdisc; writes install the named qdisc as the new
 * default via qdisc_set_default().
 */
static int set_default_qdisc(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char id[IFNAMSIZ];
	struct ctl_table tbl = {
		.data = id,
		.maxlen = IFNAMSIZ,
	};
	int ret;

	/* Prefill with the current default so a read reports it. */
	qdisc_get_default(id, IFNAMSIZ);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = qdisc_set_default(id);
	return ret;
}
#endif

/*
 * Shared handler for dev_weight, dev_weight_rx_bias and dev_weight_tx_bias:
 * after any of the three knobs changes, recompute the derived rx/tx
 * weights so they stay consistent with the new values.
 */
static int proc_do_dev_weight(struct ctl_table *table, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret != 0)
		return ret;

	dev_rx_weight = weight_p * dev_weight_rx_bias;
	dev_tx_weight = weight_p * dev_weight_tx_bias;

	return ret;
}

/*
 * Handler for /proc/sys/net/core/netdev_rss_key: formats netdev_rss_key
 * as a hex byte string ("%*phC" prints colon-separated hex bytes, hence
 * 3 chars of buffer per key byte).  The sysctl is registered 0444 and the
 * handler operates on a stack copy, so the key itself is never written.
 */
static int proc_do_rss_key(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table fake_table;
	char buf[NETDEV_RSS_KEY_LEN * 3];

	snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
	fake_table.data = buf;
	fake_table.maxlen = sizeof(buf);
	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
}

#ifdef CONFIG_BPF_JIT
/*
 * Handler for /proc/sys/net/core/bpf_jit_enable.  Writes require
 * CAP_SYS_ADMIN.  Values below 2 are stored directly; the debugging
 * value 2 is accepted only when bpf_dump_raw_ok() permits it.  The
 * update is staged in a local copy so a rejected value never becomes
 * visible in table->data.
 */
static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
					   void __user *buffer, size_t *lenp,
					   loff_t *ppos)
{
	int ret, jit_enable = *(int *)table->data;
	struct ctl_table tmp = *table;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	tmp.data = &jit_enable;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret) {
		if (jit_enable < 2 ||
		    (jit_enable == 2 && bpf_dump_raw_ok())) {
			*(int *)table->data = jit_enable;
			if (jit_enable == 2)
				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
		} else {
			ret = -EPERM;
		}
	}
	return ret;
}

/*
 * proc_dointvec_minmax() restricted to CAP_SYS_ADMIN for both reads and
 * writes — used by the sensitive BPF JIT knobs below.
 */
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
				    void __user *buffer, size_t *lenp,
				    loff_t *ppos)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

/* Global (init_net-wide) net.core sysctls, registered once at boot. */
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
	{
		.procname	= "wmem_max",
		.data		= &sysctl_wmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_max",
		.data		= &sysctl_rmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "wmem_default",
		.data		= &sysctl_wmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_default",
		.data		= &sysctl_rmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "dev_weight",
		.data		= &weight_p,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "dev_weight_rx_bias",
		.data		= &dev_weight_rx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "dev_weight_tx_bias",
		.data		= &dev_weight_tx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "netdev_max_backlog",
		.data		= &netdev_max_backlog,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "netdev_rss_key",
		.data		= &netdev_rss_key,
		.maxlen		= sizeof(int),
		.mode		= 0444,	/* read-only; see proc_do_rss_key() */
		.proc_handler	= proc_do_rss_key,
	},
#ifdef CONFIG_BPF_JIT
	{
		.procname	= "bpf_jit_enable",
		.data		= &bpf_jit_enable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax_bpf_enable,
# ifdef CONFIG_BPF_JIT_ALWAYS_ON
		/* JIT is mandatory: pin the knob to 1. */
		.extra1		= &one,
		.extra2		= &one,
# else
		.extra1		= &zero,
		.extra2		= &two,
# endif
	},
# ifdef CONFIG_HAVE_EBPF_JIT
	{
		.procname	= "bpf_jit_harden",
		.data		= &bpf_jit_harden,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= &zero,
		.extra2		= &two,
	},
	{
		.procname	= "bpf_jit_kallsyms",
		.data		= &bpf_jit_kallsyms,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= &zero,
		.extra2		= &one,
	},
# endif
	{
		.procname	= "bpf_jit_limit",
		.data		= &bpf_jit_limit,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= &one,
	},
#endif
	{
		.procname	= "netdev_tstamp_prequeue",
		.data		= &netdev_tstamp_prequeue,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "message_cost",
		.data		= &net_ratelimit_state.interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* interval is stored in jiffies but exposed in seconds */
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "message_burst",
		.data		= &net_ratelimit_state.burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "optmem_max",
		.data		= &sysctl_optmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "tstamp_allow_data",
		.data		= &sysctl_tstamp_allow_data,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one
	},
#ifdef CONFIG_RPS
	{
		/* no .data: the handler manages the table itself */
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
	{
		/* no .data: the handler walks per-cpu state directly */
		.procname	= "flow_limit_cpu_bitmap",
		.mode		= 0644,
		.proc_handler	= flow_limit_cpu_sysctl
	},
	{
		.procname	= "flow_limit_table_len",
		.data		= &netdev_flow_limit_table_len,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= flow_limit_table_len_sysctl
	},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
	{
		.procname	= "busy_poll",
		.data		= &sysctl_net_busy_poll,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
	{
		.procname	= "busy_read",
		.data		= &sysctl_net_busy_read,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
#endif
#ifdef CONFIG_NET_SCHED
	{
		.procname	= "default_qdisc",
		.mode		= 0644,
		.maxlen		= IFNAMSIZ,
		.proc_handler	= set_default_qdisc
	},
#endif
#endif /* CONFIG_NET */
	{
		.procname	= "netdev_budget",
		.data		= &netdev_budget,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "warnings",
		.data		= &net_msg_warn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "max_skb_frags",
		.data		= &sysctl_max_skb_frags,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &one,
		.extra2		= &max_skb_frags,
	},
	{
		.procname	= "netdev_budget_usecs",
		.data		= &netdev_budget_usecs,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
	{
		.procname	= "fb_tunnels_only_for_init_net",
		.data		= &sysctl_fb_tunnels_only_for_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }	/* sentinel */
};

/* Per-network-namespace net.core sysctls (currently just somaxconn). */
static struct ctl_table netns_core_table[] = {
	{
		.procname	= "somaxconn",
		.data		= &init_net.core.sysctl_somaxconn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= &zero,
		.proc_handler	= proc_dointvec_minmax
	},
	{ }	/* sentinel */
};

/*
 * Pernet init: register "net/core" for a namespace.  init_net uses the
 * static netns_core_table directly; other namespaces get a kmemdup'ed
 * copy retargeted at their own sysctl_somaxconn.  Namespaces owned by a
 * non-initial user namespace get no entries exported (procname NULLed).
 * Returns 0 on success, -ENOMEM on allocation or registration failure.
 */
static __net_init int sysctl_core_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = netns_core_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;

		tbl[0].data = &net->core.sysctl_somaxconn;

		/* Don't export any sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			tbl[0].procname = NULL;
		}
	}

	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
	if (net->core.sysctl_hdr == NULL)
		goto err_reg;

	return 0;

err_reg:
	if (tbl != netns_core_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

/*
 * Pernet exit: unregister the namespace's table and free the duplicate.
 * Only non-init_net namespaces reach here with a kmemdup'ed table; the
 * BUG_ON guards against freeing the static init_net copy.
 */
static __net_exit void sysctl_core_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->core.sysctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->core.sysctl_hdr);
	BUG_ON(tbl == netns_core_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_core_ops = {
	.init = sysctl_core_net_init,
	.exit = sysctl_core_net_exit,
};

/* Boot-time setup: register the global table, then the pernet ops. */
static __init int sysctl_core_init(void)
{
	register_net_sysctl(&init_net, "net/core", net_core_table);
	return register_pernet_subsys(&sysctl_core_ops);
}

fs_initcall(sysctl_core_init);