// SPDX-License-Identifier: GPL-2.0
/* -*- linux-c -*-
 * sysctl_net_core.c: sysctl interface to net core subsystem.
 *
 * Begun April 1, 1996, Mike Shaver.
 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
 */

#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>

#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>

/* Boundary values handed to proc_dointvec_minmax() via .extra1/.extra2. */
static int zero = 0;
static int one = 1;
static int two __maybe_unused = 2;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;

static int net_msg_warn;	/* Unused, but still a sysctl */

int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);

#ifdef CONFIG_RPS
/*
 * Handler for /proc/sys/net/core/rps_sock_flow_entries.
 *
 * Reads report the current size of the global RFS socket flow table;
 * writes resize it (rounded up to a power of two), or free it when 0 is
 * written.  The new table is published with rcu_assign_pointer() and the
 * old one is only vfree()d after synchronize_rcu(), so concurrent RCU
 * readers never dereference freed memory.  The rps_needed/rfs_needed
 * static keys are incremented while a table is installed and decremented
 * when it is torn down.  All of this is serialized by sock_flow_mutex.
 */
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
{
	unsigned int orig_size, size;
	int ret, i;
	/* Redirect the proc read/write into the local 'size' variable. */
	struct ctl_table tmp = {
		.data = &size,
		.maxlen = sizeof(size),
		.mode = table->mode
	};
	struct rps_sock_flow_table *orig_sock_table, *sock_table;
	static DEFINE_MUTEX(sock_flow_mutex);

	mutex_lock(&sock_flow_mutex);

	orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
					lockdep_is_held(&sock_flow_mutex));
	/* Current table size (mask is size - 1); 0 means no table. */
	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;

	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);

	if (write) {
		if (size) {
			if (size > 1<<29) {
				/* Enforce limit to prevent overflow */
				mutex_unlock(&sock_flow_mutex);
				return -EINVAL;
			}
			size = roundup_pow_of_two(size);
			if (size != orig_size) {
				sock_table =
				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
				if (!sock_table) {
					mutex_unlock(&sock_flow_mutex);
					return -ENOMEM;
				}
				rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
				sock_table->mask = size - 1;
			} else
				sock_table = orig_sock_table;

			/* (Re)initialize every flow entry to "no CPU". */
			for (i = 0; i < size; i++)
				sock_table->ents[i] = RPS_NO_CPU;
		} else
			sock_table = NULL;

		if (sock_table != orig_sock_table) {
			rcu_assign_pointer(rps_sock_flow_table, sock_table);
			if (sock_table) {
				static_key_slow_inc(&rps_needed);
				static_key_slow_inc(&rfs_needed);
			}
			if (orig_sock_table) {
				static_key_slow_dec(&rps_needed);
				static_key_slow_dec(&rfs_needed);
				/* Wait for readers before freeing old table. */
				synchronize_rcu();
				vfree(orig_sock_table);
			}
		}
	}

	mutex_unlock(&sock_flow_mutex);

	return ret;
}
#endif /* CONFIG_RPS */

#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);

/*
 * Handler for /proc/sys/net/core/flow_limit_cpu_bitmap.
 *
 * A write parses a cpumask and, for each possible CPU, allocates a
 * per-cpu sd_flow_limit table if the CPU was added to the mask, or
 * RCU-frees the existing one if the CPU was removed.  A read rebuilds
 * the mask from which CPUs currently have a flow_limit installed and
 * prints it in cpumask format.  Updates are serialized by
 * flow_limit_update_mutex.
 */
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
				 void __user *buffer, size_t *lenp,
				 loff_t *ppos)
{
	struct sd_flow_limit *cur;
	struct softnet_data *sd;
	cpumask_var_t mask;
	int i, len, ret = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	if (write) {
		ret = cpumask_parse_user(buffer, *lenp, mask);
		if (ret)
			goto done;

		mutex_lock(&flow_limit_update_mutex);
		len = sizeof(*cur) + netdev_flow_limit_table_len;
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			cur = rcu_dereference_protected(sd->flow_limit,
				     lockdep_is_held(&flow_limit_update_mutex));
			if (cur && !cpumask_test_cpu(i, mask)) {
				/* CPU dropped from mask: tear down its table. */
				RCU_INIT_POINTER(sd->flow_limit, NULL);
				synchronize_rcu();
				kfree(cur);
			} else if (!cur && cpumask_test_cpu(i, mask)) {
				/* CPU added to mask: install a fresh table. */
				cur = kzalloc_node(len, GFP_KERNEL,
						   cpu_to_node(i));
				if (!cur) {
					/* not unwinding previous changes */
					ret = -ENOMEM;
					goto write_unlock;
				}
				cur->num_buckets = netdev_flow_limit_table_len;
				rcu_assign_pointer(sd->flow_limit, cur);
			}
		}
write_unlock:
		mutex_unlock(&flow_limit_update_mutex);
	} else {
		char kbuf[128];

		/* Single-shot read: anything past offset 0 reports EOF. */
		if (*ppos || !*lenp) {
			*lenp = 0;
			goto done;
		}

		cpumask_clear(mask);
		rcu_read_lock();
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			if (rcu_dereference(sd->flow_limit))
				cpumask_set_cpu(i, mask);
		}
		rcu_read_unlock();

		len = min(sizeof(kbuf) - 1, *lenp);
		len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
		if (!len) {
			*lenp = 0;
			goto done;
		}
		if (len < *lenp)
			kbuf[len++] = '\n';
		if (copy_to_user(buffer, kbuf, len)) {
			ret = -EFAULT;
			goto done;
		}
		*lenp = len;
		*ppos += len;
	}

done:
	free_cpumask_var(mask);
	return ret;
}

/*
 * Handler for /proc/sys/net/core/flow_limit_table_len.  Only accepts
 * power-of-two values; otherwise the previous value is restored and
 * -EINVAL is returned.  Serialized against flow_limit_cpu_sysctl() by
 * flow_limit_update_mutex, since that handler reads the length when
 * sizing new tables.
 */
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
				       void __user *buffer, size_t *lenp,
				       loff_t *ppos)
{
	unsigned int old, *ptr;
	int ret;

	mutex_lock(&flow_limit_update_mutex);

	ptr = table->data;
	old = *ptr;
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (!ret && write && !is_power_of_2(*ptr)) {
		*ptr = old;
		ret = -EINVAL;
	}

	mutex_unlock(&flow_limit_update_mutex);
	return ret;
}
#endif /* CONFIG_NET_FLOW_LIMIT */

#ifdef CONFIG_NET_SCHED
/*
 * Handler for /proc/sys/net/core/default_qdisc: reads report the
 * current default qdisc name, writes install a new one via
 * qdisc_set_default().
 */
static int set_default_qdisc(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char id[IFNAMSIZ];
	struct ctl_table tbl = {
		.data = id,
		.maxlen = IFNAMSIZ,
	};
	int ret;

	qdisc_get_default(id, IFNAMSIZ);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = qdisc_set_default(id);
	return ret;
}
#endif

/*
 * Shared handler for dev_weight, dev_weight_rx_bias and
 * dev_weight_tx_bias: after any of the three inputs changes, recompute
 * the derived dev_rx_weight/dev_tx_weight products.
 */
static int proc_do_dev_weight(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret != 0)
		return ret;

	dev_rx_weight = weight_p * dev_weight_rx_bias;
	dev_tx_weight = weight_p * dev_weight_tx_bias;

	return ret;
}

/*
 * Read-only dump of netdev_rss_key as a hex string (the table entry is
 * mode 0444, so 'write' is never honored with a change here).
 */
static int proc_do_rss_key(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table fake_table;
	/* 3 chars per byte: two hex digits plus separator/terminator. */
	char buf[NETDEV_RSS_KEY_LEN * 3];

	snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
	fake_table.data = buf;
	fake_table.maxlen = sizeof(buf);
	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
}

#ifdef CONFIG_BPF_JIT
/*
 * Handler for /proc/sys/net/core/bpf_jit_enable.  Writes require
 * CAP_SYS_ADMIN.  The value is validated in a local copy first: value 2
 * (JIT debugging mode) is additionally gated on bpf_dump_raw_ok() and
 * triggers a loud warning; anything else rejected with -EPERM never
 * touches the live setting.
 */
static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
					   void __user *buffer, size_t *lenp,
					   loff_t *ppos)
{
	int ret, jit_enable = *(int *)table->data;
	struct ctl_table tmp = *table;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	tmp.data = &jit_enable;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret) {
		if (jit_enable < 2 ||
		    (jit_enable == 2 && bpf_dump_raw_ok())) {
			*(int *)table->data = jit_enable;
			if (jit_enable == 2)
				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
		} else {
			ret = -EPERM;
		}
	}
	return ret;
}

# ifdef CONFIG_HAVE_EBPF_JIT
/*
 * Like proc_dointvec_minmax(), but both reads and writes require
 * CAP_SYS_ADMIN (used for bpf_jit_harden / bpf_jit_kallsyms).
 */
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
				    void __user *buffer, size_t *lenp,
				    loff_t *ppos)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
# endif
#endif

/* Global net.core sysctls, registered once for init_net below. */
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
	{
		.procname	= "wmem_max",
		.data		= &sysctl_wmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_max",
		.data		= &sysctl_rmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "wmem_default",
		.data		= &sysctl_wmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_default",
		.data		= &sysctl_rmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "dev_weight",
		.data		= &weight_p,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "dev_weight_rx_bias",
		.data		= &dev_weight_rx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "dev_weight_tx_bias",
		.data		= &dev_weight_tx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
	},
	{
		.procname	= "netdev_max_backlog",
		.data		= &netdev_max_backlog,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "netdev_rss_key",
		.data		= &netdev_rss_key,
		.maxlen		= sizeof(int),
		.mode		= 0444,
		.proc_handler	= proc_do_rss_key,
	},
#ifdef CONFIG_BPF_JIT
	{
		.procname	= "bpf_jit_enable",
		.data		= &bpf_jit_enable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax_bpf_enable,
# ifdef CONFIG_BPF_JIT_ALWAYS_ON
		/* With an always-on JIT the only legal value is 1. */
		.extra1		= &one,
		.extra2		= &one,
# else
		.extra1		= &zero,
		.extra2		= &two,
# endif
	},
# ifdef CONFIG_HAVE_EBPF_JIT
	{
		.procname	= "bpf_jit_harden",
		.data		= &bpf_jit_harden,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= &zero,
		.extra2		= &two,
	},
	{
		.procname	= "bpf_jit_kallsyms",
		.data		= &bpf_jit_kallsyms,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= &zero,
		.extra2		= &one,
	},
# endif
#endif
	{
		.procname	= "netdev_tstamp_prequeue",
		.data		= &netdev_tstamp_prequeue,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "message_cost",
		.data		= &net_ratelimit_state.interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "message_burst",
		.data		= &net_ratelimit_state.burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "optmem_max",
		.data		= &sysctl_optmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "tstamp_allow_data",
		.data		= &sysctl_tstamp_allow_data,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one
	},
#ifdef CONFIG_RPS
	{
		/* No .data: rps_sock_flow_sysctl() manages its own value. */
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
	{
		.procname	= "flow_limit_cpu_bitmap",
		.mode		= 0644,
		.proc_handler	= flow_limit_cpu_sysctl
	},
	{
		.procname	= "flow_limit_table_len",
		.data		= &netdev_flow_limit_table_len,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= flow_limit_table_len_sysctl
	},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
	{
		.procname	= "busy_poll",
		.data		= &sysctl_net_busy_poll,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
	{
		.procname	= "busy_read",
		.data		= &sysctl_net_busy_read,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
#endif
#ifdef CONFIG_NET_SCHED
	{
		.procname	= "default_qdisc",
		.mode		= 0644,
		.maxlen		= IFNAMSIZ,
		.proc_handler	= set_default_qdisc
	},
#endif
#endif /* CONFIG_NET */
	{
		.procname	= "netdev_budget",
		.data		= &netdev_budget,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "warnings",
		.data		= &net_msg_warn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "max_skb_frags",
		.data		= &sysctl_max_skb_frags,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &one,
		.extra2		= &max_skb_frags,
	},
	{
		.procname	= "netdev_budget_usecs",
		.data		= &netdev_budget_usecs,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
	},
	{
		.procname	= "fb_tunnels_only_for_init_net",
		.data		= &sysctl_fb_tunnels_only_for_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};

/* Per-network-namespace net.core sysctls (duplicated per netns below). */
static struct ctl_table netns_core_table[] = {
	{
		.procname	= "somaxconn",
		.data		= &init_net.core.sysctl_somaxconn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= &zero,
		.proc_handler	= proc_dointvec_minmax
	},
	{ }
};

/*
 * Per-netns setup: register the net/core sysctl directory.  Non-init
 * namespaces get a kmemdup'd copy of netns_core_table with .data
 * repointed at their own somaxconn; the entry is hidden (procname set
 * to NULL) for namespaces owned by a non-init user namespace.
 */
static __net_init int sysctl_core_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = netns_core_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;

		tbl[0].data = &net->core.sysctl_somaxconn;

		/* Don't export any sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			tbl[0].procname = NULL;
		}
	}

	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
	if (net->core.sysctl_hdr == NULL)
		goto err_reg;

	return 0;

err_reg:
	/* Only free the table if it was our per-netns copy. */
	if (tbl != netns_core_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

/*
 * Per-netns teardown: unregister and free the duplicated table.  The
 * BUG_ON guards against freeing the static init_net template.
 */
static __net_exit void sysctl_core_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->core.sysctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->core.sysctl_hdr);
	BUG_ON(tbl == netns_core_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_core_ops = {
	.init = sysctl_core_net_init,
	.exit = sysctl_core_net_exit,
};

/* Register the global table for init_net, then the per-netns ops. */
static __init int sysctl_core_init(void)
{
	register_net_sysctl(&init_net, "net/core", net_core_table);
	return register_pernet_subsys(&sysctl_core_ops);
}

fs_initcall(sysctl_core_init);