1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <asm/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/init.h> 111 #include <linux/highmem.h> 112 #include <linux/user_namespace.h> 113 #include <linux/static_key.h> 114 #include <linux/memcontrol.h> 115 #include <linux/prefetch.h> 116 #include <linux/compat.h> 117 #include <linux/mroute.h> 118 #include <linux/mroute6.h> 119 #include <linux/icmpv6.h> 120 121 #include <linux/uaccess.h> 122 123 #include <linux/netdevice.h> 124 #include <net/protocol.h> 125 #include <linux/skbuff.h> 126 #include <net/net_namespace.h> 127 #include <net/request_sock.h> 128 #include <net/sock.h> 129 #include <linux/net_tstamp.h> 130 #include <net/xfrm.h> 131 #include <linux/ipsec.h> 132 #include <net/cls_cgroup.h> 133 #include <net/netprio_cgroup.h> 134 #include <linux/sock_diag.h> 135 136 #include <linux/filter.h> 137 #include <net/sock_reuseport.h> 138 #include <net/bpf_sk_storage.h> 139 140 #include <trace/events/sock.h> 141 142 #include <net/tcp.h> 143 #include <net/busy_poll.h> 144 #include <net/phonet/phonet.h> 145 146 #include <linux/ethtool.h> 147 148 #include "dev.h" 149 150 static DEFINE_MUTEX(proto_list_mutex); 151 static LIST_HEAD(proto_list); 152 153 static void sock_def_write_space_wfree(struct sock *sk); 154 static void sock_def_write_space(struct sock *sk); 155 156 /** 157 * sk_ns_capable - General socket capability test 158 * @sk: Socket to use a capability on or through 159 * @user_ns: The user namespace of the capability to use 160 * @cap: The capability to use 161 * 162 * Test to see if the opener of the socket had when the socket was 163 * created and the current process has the capability @cap in the user 164 * namespace @user_ns. 165 */ 166 bool sk_ns_capable(const struct sock *sk, 167 struct user_namespace *user_ns, int cap) 168 { 169 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 170 ns_capable(user_ns, cap); 171 } 172 EXPORT_SYMBOL(sk_ns_capable); 173 174 /** 175 * sk_capable - Socket global capability test 176 * @sk: Socket to use a capability on or through 177 * @cap: The global capability to use 178 * 179 * Test to see if the opener of the socket had when the socket was 180 * created and the current process has the capability @cap in all user 181 * namespaces. 182 */ 183 bool sk_capable(const struct sock *sk, int cap) 184 { 185 return sk_ns_capable(sk, &init_user_ns, cap); 186 } 187 EXPORT_SYMBOL(sk_capable); 188 189 /** 190 * sk_net_capable - Network namespace socket capability test 191 * @sk: Socket to use a capability on or through 192 * @cap: The capability to use 193 * 194 * Test to see if the opener of the socket had when the socket was created 195 * and the current process has the capability @cap over the network namespace 196 * the socket is a member of. 197 */ 198 bool sk_net_capable(const struct sock *sk, int cap) 199 { 200 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 201 } 202 EXPORT_SYMBOL(sk_net_capable); 203 204 /* 205 * Each address family might have different locking rules, so we have 206 * one slock key per address family and separate keys for internal and 207 * userspace sockets. 208 */ 209 static struct lock_class_key af_family_keys[AF_MAX]; 210 static struct lock_class_key af_family_kern_keys[AF_MAX]; 211 static struct lock_class_key af_family_slock_keys[AF_MAX]; 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 213 214 /* 215 * Make lock validator output more readable. (we pre-construct these 216 * strings build-time, so that runtime initialization of socket 217 * locks is fast): 218 */ 219 220 #define _sock_locks(x) \ 221 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 222 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 223 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 224 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 225 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 226 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 227 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 228 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 229 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 230 x "27" , x "28" , x "AF_CAN" , \ 231 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 232 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 233 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 234 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 235 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 236 x "AF_MCTP" , \ 237 x "AF_MAX" 238 239 static const char *const af_family_key_strings[AF_MAX+1] = { 240 _sock_locks("sk_lock-") 241 }; 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 243 _sock_locks("slock-") 244 }; 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 246 _sock_locks("clock-") 247 }; 248 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 250 _sock_locks("k-sk_lock-") 251 }; 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 253 _sock_locks("k-slock-") 254 }; 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 256 _sock_locks("k-clock-") 257 }; 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 259 _sock_locks("rlock-") 260 }; 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 262 _sock_locks("wlock-") 263 }; 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 265 _sock_locks("elock-") 266 }; 267 268 /* 269 * sk_callback_lock and sk queues locking rules are per-address-family, 270 * so split the lock classes by using a per-AF key: 271 */ 272 static struct lock_class_key af_callback_keys[AF_MAX]; 273 static struct lock_class_key af_rlock_keys[AF_MAX]; 274 static struct lock_class_key af_wlock_keys[AF_MAX]; 275 static struct lock_class_key af_elock_keys[AF_MAX]; 276 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 277 278 /* Run time adjustable parameters. */ 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 280 EXPORT_SYMBOL(sysctl_wmem_max); 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 282 EXPORT_SYMBOL(sysctl_rmem_max); 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 285 286 /* Maximal space eaten by iovec or ancillary data plus some space */ 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 288 EXPORT_SYMBOL(sysctl_optmem_max); 289 290 int sysctl_tstamp_allow_data __read_mostly = 1; 291 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 293 EXPORT_SYMBOL_GPL(memalloc_socks_key); 294 295 /** 296 * sk_set_memalloc - sets %SOCK_MEMALLOC 297 * @sk: socket to set it on 298 * 299 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 300 * It's the responsibility of the admin to adjust min_free_kbytes 301 * to meet the requirements 302 */ 303 void sk_set_memalloc(struct sock *sk) 304 { 305 sock_set_flag(sk, SOCK_MEMALLOC); 306 sk->sk_allocation |= __GFP_MEMALLOC; 307 static_branch_inc(&memalloc_socks_key); 308 } 309 EXPORT_SYMBOL_GPL(sk_set_memalloc); 310 311 void sk_clear_memalloc(struct sock *sk) 312 { 313 sock_reset_flag(sk, SOCK_MEMALLOC); 314 sk->sk_allocation &= ~__GFP_MEMALLOC; 315 static_branch_dec(&memalloc_socks_key); 316 317 /* 318 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 319 * progress of swapping. SOCK_MEMALLOC may be cleared while 320 * it has rmem allocations due to the last swapfile being deactivated 321 * but there is a risk that the socket is unusable due to exceeding 322 * the rmem limits. Reclaim the reserves and obey rmem limits again. 323 */ 324 sk_mem_reclaim(sk); 325 } 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 327 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 329 { 330 int ret; 331 unsigned int noreclaim_flag; 332 333 /* these should have been dropped before queueing */ 334 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 335 336 noreclaim_flag = memalloc_noreclaim_save(); 337 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 338 tcp_v6_do_rcv, 339 tcp_v4_do_rcv, 340 sk, skb); 341 memalloc_noreclaim_restore(noreclaim_flag); 342 343 return ret; 344 } 345 EXPORT_SYMBOL(__sk_backlog_rcv); 346 347 void sk_error_report(struct sock *sk) 348 { 349 sk->sk_error_report(sk); 350 351 switch (sk->sk_family) { 352 case AF_INET: 353 fallthrough; 354 case AF_INET6: 355 trace_inet_sk_error_report(sk); 356 break; 357 default: 358 break; 359 } 360 } 361 EXPORT_SYMBOL(sk_error_report); 362 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 364 { 365 struct __kernel_sock_timeval tv; 366 367 if (timeo == MAX_SCHEDULE_TIMEOUT) { 368 tv.tv_sec = 0; 369 tv.tv_usec = 0; 370 } else { 371 tv.tv_sec = timeo / HZ; 372 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 373 } 374 375 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 376 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 377 *(struct old_timeval32 *)optval = tv32; 378 return sizeof(tv32); 379 } 380 381 if (old_timeval) { 382 struct __kernel_old_timeval old_tv; 383 old_tv.tv_sec = tv.tv_sec; 384 old_tv.tv_usec = tv.tv_usec; 385 *(struct __kernel_old_timeval *)optval = old_tv; 386 return sizeof(old_tv); 387 } 388 389 *(struct __kernel_sock_timeval *)optval = tv; 390 return sizeof(tv); 391 } 392 EXPORT_SYMBOL(sock_get_timeout); 393 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 395 sockptr_t optval, int optlen, bool old_timeval) 396 { 397 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 398 struct old_timeval32 tv32; 399 400 if (optlen < sizeof(tv32)) 401 return -EINVAL; 402 403 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 404 return -EFAULT; 405 tv->tv_sec = tv32.tv_sec; 406 tv->tv_usec = tv32.tv_usec; 407 } else if (old_timeval) { 408 struct __kernel_old_timeval old_tv; 409 410 if (optlen < sizeof(old_tv)) 411 return -EINVAL; 412 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 413 return -EFAULT; 414 tv->tv_sec = old_tv.tv_sec; 415 tv->tv_usec = old_tv.tv_usec; 416 } else { 417 if (optlen < sizeof(*tv)) 418 return -EINVAL; 419 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 420 return -EFAULT; 421 } 422 423 return 0; 424 } 425 EXPORT_SYMBOL(sock_copy_user_timeval); 426 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 428 bool old_timeval) 429 { 430 struct __kernel_sock_timeval tv; 431 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 432 long val; 433 434 if (err) 435 return err; 436 437 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 438 return -EDOM; 439 440 if (tv.tv_sec < 0) { 441 static int warned __read_mostly; 442 443 WRITE_ONCE(*timeo_p, 0); 444 if (warned < 10 && net_ratelimit()) { 445 warned++; 446 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 447 __func__, current->comm, task_pid_nr(current)); 448 } 449 return 0; 450 } 451 val = MAX_SCHEDULE_TIMEOUT; 452 if ((tv.tv_sec || tv.tv_usec) && 453 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 454 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 455 USEC_PER_SEC / HZ); 456 WRITE_ONCE(*timeo_p, val); 457 return 0; 458 } 459 460 static bool sock_needs_netstamp(const struct sock *sk) 461 { 462 switch (sk->sk_family) { 463 case AF_UNSPEC: 464 case AF_UNIX: 465 return false; 466 default: 467 return true; 468 } 469 } 470 471 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 472 { 473 if (sk->sk_flags & flags) { 474 sk->sk_flags &= ~flags; 475 if (sock_needs_netstamp(sk) && 476 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 477 net_disable_timestamp(); 478 } 479 } 480 481 482 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 483 { 484 unsigned long flags; 485 struct sk_buff_head *list = &sk->sk_receive_queue; 486 487 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 488 atomic_inc(&sk->sk_drops); 489 trace_sock_rcvqueue_full(sk, skb); 490 return -ENOMEM; 491 } 492 493 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 494 atomic_inc(&sk->sk_drops); 495 return -ENOBUFS; 496 } 497 498 skb->dev = NULL; 499 skb_set_owner_r(skb, sk); 500 501 /* we escape from rcu protected region, make sure we dont leak 502 * a norefcounted dst 503 */ 504 skb_dst_force(skb); 505 506 spin_lock_irqsave(&list->lock, flags); 507 sock_skb_set_dropcount(sk, skb); 508 __skb_queue_tail(list, skb); 509 spin_unlock_irqrestore(&list->lock, flags); 510 511 if (!sock_flag(sk, SOCK_DEAD)) 512 sk->sk_data_ready(sk); 513 return 0; 514 } 515 EXPORT_SYMBOL(__sock_queue_rcv_skb); 516 517 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 518 enum skb_drop_reason *reason) 519 { 520 enum skb_drop_reason drop_reason; 521 int err; 522 523 err = sk_filter(sk, skb); 524 if (err) { 525 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 526 goto out; 527 } 528 err = __sock_queue_rcv_skb(sk, skb); 529 switch (err) { 530 case -ENOMEM: 531 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 532 break; 533 case -ENOBUFS: 534 drop_reason = SKB_DROP_REASON_PROTO_MEM; 535 break; 536 default: 537 drop_reason = SKB_NOT_DROPPED_YET; 538 break; 539 } 540 out: 541 if (reason) 542 *reason = drop_reason; 543 return err; 544 } 545 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 546 547 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 548 const int nested, unsigned int trim_cap, bool refcounted) 549 { 550 int rc = NET_RX_SUCCESS; 551 552 if (sk_filter_trim_cap(sk, skb, trim_cap)) 553 goto discard_and_relse; 554 555 skb->dev = NULL; 556 557 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 558 atomic_inc(&sk->sk_drops); 559 goto discard_and_relse; 560 } 561 if (nested) 562 bh_lock_sock_nested(sk); 563 else 564 bh_lock_sock(sk); 565 if (!sock_owned_by_user(sk)) { 566 /* 567 * trylock + unlock semantics: 568 */ 569 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 570 571 rc = sk_backlog_rcv(sk, skb); 572 573 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 574 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 575 bh_unlock_sock(sk); 576 atomic_inc(&sk->sk_drops); 577 goto discard_and_relse; 578 } 579 580 bh_unlock_sock(sk); 581 out: 582 if (refcounted) 583 sock_put(sk); 584 return rc; 585 discard_and_relse: 586 kfree_skb(skb); 587 goto out; 588 } 589 EXPORT_SYMBOL(__sk_receive_skb); 590 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 592 u32)); 593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 594 u32)); 595 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 596 { 597 struct dst_entry *dst = __sk_dst_get(sk); 598 599 if (dst && dst->obsolete && 600 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 601 dst, cookie) == NULL) { 602 sk_tx_queue_clear(sk); 603 sk->sk_dst_pending_confirm = 0; 604 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 605 dst_release(dst); 606 return NULL; 607 } 608 609 return dst; 610 } 611 EXPORT_SYMBOL(__sk_dst_check); 612 613 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 614 { 615 struct dst_entry *dst = sk_dst_get(sk); 616 617 if (dst && dst->obsolete && 618 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 619 dst, cookie) == NULL) { 620 sk_dst_reset(sk); 621 dst_release(dst); 622 return NULL; 623 } 624 625 return dst; 626 } 627 EXPORT_SYMBOL(sk_dst_check); 628 629 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 630 { 631 int ret = -ENOPROTOOPT; 632 #ifdef CONFIG_NETDEVICES 633 struct net *net = sock_net(sk); 634 635 /* Sorry... */ 636 ret = -EPERM; 637 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 638 goto out; 639 640 ret = -EINVAL; 641 if (ifindex < 0) 642 goto out; 643 644 /* Paired with all READ_ONCE() done locklessly. */ 645 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 646 647 if (sk->sk_prot->rehash) 648 sk->sk_prot->rehash(sk); 649 sk_dst_reset(sk); 650 651 ret = 0; 652 653 out: 654 #endif 655 656 return ret; 657 } 658 659 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 660 { 661 int ret; 662 663 if (lock_sk) 664 lock_sock(sk); 665 ret = sock_bindtoindex_locked(sk, ifindex); 666 if (lock_sk) 667 release_sock(sk); 668 669 return ret; 670 } 671 EXPORT_SYMBOL(sock_bindtoindex); 672 673 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 674 { 675 int ret = -ENOPROTOOPT; 676 #ifdef CONFIG_NETDEVICES 677 struct net *net = sock_net(sk); 678 char devname[IFNAMSIZ]; 679 int index; 680 681 ret = -EINVAL; 682 if (optlen < 0) 683 goto out; 684 685 /* Bind this socket to a particular device like "eth0", 686 * as specified in the passed interface name. If the 687 * name is "" or the option length is zero the socket 688 * is not bound. 689 */ 690 if (optlen > IFNAMSIZ - 1) 691 optlen = IFNAMSIZ - 1; 692 memset(devname, 0, sizeof(devname)); 693 694 ret = -EFAULT; 695 if (copy_from_sockptr(devname, optval, optlen)) 696 goto out; 697 698 index = 0; 699 if (devname[0] != '\0') { 700 struct net_device *dev; 701 702 rcu_read_lock(); 703 dev = dev_get_by_name_rcu(net, devname); 704 if (dev) 705 index = dev->ifindex; 706 rcu_read_unlock(); 707 ret = -ENODEV; 708 if (!dev) 709 goto out; 710 } 711 712 sockopt_lock_sock(sk); 713 ret = sock_bindtoindex_locked(sk, index); 714 sockopt_release_sock(sk); 715 out: 716 #endif 717 718 return ret; 719 } 720 721 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 722 sockptr_t optlen, int len) 723 { 724 int ret = -ENOPROTOOPT; 725 #ifdef CONFIG_NETDEVICES 726 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 727 struct net *net = sock_net(sk); 728 char devname[IFNAMSIZ]; 729 730 if (bound_dev_if == 0) { 731 len = 0; 732 goto zero; 733 } 734 735 ret = -EINVAL; 736 if (len < IFNAMSIZ) 737 goto out; 738 739 ret = netdev_get_name(net, devname, bound_dev_if); 740 if (ret) 741 goto out; 742 743 len = strlen(devname) + 1; 744 745 ret = -EFAULT; 746 if (copy_to_sockptr(optval, devname, len)) 747 goto out; 748 749 zero: 750 ret = -EFAULT; 751 if (copy_to_sockptr(optlen, &len, sizeof(int))) 752 goto out; 753 754 ret = 0; 755 756 out: 757 #endif 758 759 return ret; 760 } 761 762 bool sk_mc_loop(struct sock *sk) 763 { 764 if (dev_recursion_level()) 765 return false; 766 if (!sk) 767 return true; 768 switch (sk->sk_family) { 769 case AF_INET: 770 return inet_sk(sk)->mc_loop; 771 #if IS_ENABLED(CONFIG_IPV6) 772 case AF_INET6: 773 return inet6_sk(sk)->mc_loop; 774 #endif 775 } 776 WARN_ON_ONCE(1); 777 return true; 778 } 779 EXPORT_SYMBOL(sk_mc_loop); 780 781 void sock_set_reuseaddr(struct sock *sk) 782 { 783 lock_sock(sk); 784 sk->sk_reuse = SK_CAN_REUSE; 785 release_sock(sk); 786 } 787 EXPORT_SYMBOL(sock_set_reuseaddr); 788 789 void sock_set_reuseport(struct sock *sk) 790 { 791 lock_sock(sk); 792 sk->sk_reuseport = true; 793 release_sock(sk); 794 } 795 EXPORT_SYMBOL(sock_set_reuseport); 796 797 void sock_no_linger(struct sock *sk) 798 { 799 lock_sock(sk); 800 sk->sk_lingertime = 0; 801 sock_set_flag(sk, SOCK_LINGER); 802 release_sock(sk); 803 } 804 EXPORT_SYMBOL(sock_no_linger); 805 806 void sock_set_priority(struct sock *sk, u32 priority) 807 { 808 lock_sock(sk); 809 WRITE_ONCE(sk->sk_priority, priority); 810 release_sock(sk); 811 } 812 EXPORT_SYMBOL(sock_set_priority); 813 814 void sock_set_sndtimeo(struct sock *sk, s64 secs) 815 { 816 lock_sock(sk); 817 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 818 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 819 else 820 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 821 release_sock(sk); 822 } 823 EXPORT_SYMBOL(sock_set_sndtimeo); 824 825 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 826 { 827 if (val) { 828 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 829 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 830 sock_set_flag(sk, SOCK_RCVTSTAMP); 831 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 832 } else { 833 sock_reset_flag(sk, SOCK_RCVTSTAMP); 834 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 835 } 836 } 837 838 void sock_enable_timestamps(struct sock *sk) 839 { 840 lock_sock(sk); 841 __sock_set_timestamps(sk, true, false, true); 842 release_sock(sk); 843 } 844 EXPORT_SYMBOL(sock_enable_timestamps); 845 846 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 847 { 848 switch (optname) { 849 case SO_TIMESTAMP_OLD: 850 __sock_set_timestamps(sk, valbool, false, false); 851 break; 852 case SO_TIMESTAMP_NEW: 853 __sock_set_timestamps(sk, valbool, true, false); 854 break; 855 case SO_TIMESTAMPNS_OLD: 856 __sock_set_timestamps(sk, valbool, false, true); 857 break; 858 case SO_TIMESTAMPNS_NEW: 859 __sock_set_timestamps(sk, valbool, true, true); 860 break; 861 } 862 } 863 864 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 865 { 866 struct net *net = sock_net(sk); 867 struct net_device *dev = NULL; 868 bool match = false; 869 int *vclock_index; 870 int i, num; 871 872 if (sk->sk_bound_dev_if) 873 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 874 875 if (!dev) { 876 pr_err("%s: sock not bind to device\n", __func__); 877 return -EOPNOTSUPP; 878 } 879 880 num = ethtool_get_phc_vclocks(dev, &vclock_index); 881 dev_put(dev); 882 883 for (i = 0; i < num; i++) { 884 if (*(vclock_index + i) == phc_index) { 885 match = true; 886 break; 887 } 888 } 889 890 if (num > 0) 891 kfree(vclock_index); 892 893 if (!match) 894 return -EINVAL; 895 896 sk->sk_bind_phc = phc_index; 897 898 return 0; 899 } 900 901 int sock_set_timestamping(struct sock *sk, int optname, 902 struct so_timestamping timestamping) 903 { 904 int val = timestamping.flags; 905 int ret; 906 907 if (val & ~SOF_TIMESTAMPING_MASK) 908 return -EINVAL; 909 910 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 911 !(val & SOF_TIMESTAMPING_OPT_ID)) 912 return -EINVAL; 913 914 if (val & SOF_TIMESTAMPING_OPT_ID && 915 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 916 if (sk_is_tcp(sk)) { 917 if ((1 << sk->sk_state) & 918 (TCPF_CLOSE | TCPF_LISTEN)) 919 return -EINVAL; 920 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 922 else 923 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 924 } else { 925 atomic_set(&sk->sk_tskey, 0); 926 } 927 } 928 929 if (val & SOF_TIMESTAMPING_OPT_STATS && 930 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 931 return -EINVAL; 932 933 if (val & SOF_TIMESTAMPING_BIND_PHC) { 934 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 935 if (ret) 936 return ret; 937 } 938 939 sk->sk_tsflags = val; 940 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 941 942 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 943 sock_enable_timestamp(sk, 944 SOCK_TIMESTAMPING_RX_SOFTWARE); 945 else 946 sock_disable_timestamp(sk, 947 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 948 return 0; 949 } 950 951 void sock_set_keepalive(struct sock *sk) 952 { 953 lock_sock(sk); 954 if (sk->sk_prot->keepalive) 955 sk->sk_prot->keepalive(sk, true); 956 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 957 release_sock(sk); 958 } 959 EXPORT_SYMBOL(sock_set_keepalive); 960 961 static void __sock_set_rcvbuf(struct sock *sk, int val) 962 { 963 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 964 * as a negative value. 965 */ 966 val = min_t(int, val, INT_MAX / 2); 967 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 968 969 /* We double it on the way in to account for "struct sk_buff" etc. 970 * overhead. Applications assume that the SO_RCVBUF setting they make 971 * will allow that much actual data to be received on that socket. 972 * 973 * Applications are unaware that "struct sk_buff" and other overheads 974 * allocate from the receive buffer during socket buffer allocation. 975 * 976 * And after considering the possible alternatives, returning the value 977 * we actually used in getsockopt is the most desirable behavior. 978 */ 979 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 980 } 981 982 void sock_set_rcvbuf(struct sock *sk, int val) 983 { 984 lock_sock(sk); 985 __sock_set_rcvbuf(sk, val); 986 release_sock(sk); 987 } 988 EXPORT_SYMBOL(sock_set_rcvbuf); 989 990 static void __sock_set_mark(struct sock *sk, u32 val) 991 { 992 if (val != sk->sk_mark) { 993 WRITE_ONCE(sk->sk_mark, val); 994 sk_dst_reset(sk); 995 } 996 } 997 998 void sock_set_mark(struct sock *sk, u32 val) 999 { 1000 lock_sock(sk); 1001 __sock_set_mark(sk, val); 1002 release_sock(sk); 1003 } 1004 EXPORT_SYMBOL(sock_set_mark); 1005 1006 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1007 { 1008 /* Round down bytes to multiple of pages */ 1009 bytes = round_down(bytes, PAGE_SIZE); 1010 1011 WARN_ON(bytes > sk->sk_reserved_mem); 1012 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1013 sk_mem_reclaim(sk); 1014 } 1015 1016 static int sock_reserve_memory(struct sock *sk, int bytes) 1017 { 1018 long allocated; 1019 bool charged; 1020 int pages; 1021 1022 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) 1023 return -EOPNOTSUPP; 1024 1025 if (!bytes) 1026 return 0; 1027 1028 pages = sk_mem_pages(bytes); 1029 1030 /* pre-charge to memcg */ 1031 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 1032 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1033 if (!charged) 1034 return -ENOMEM; 1035 1036 /* pre-charge to forward_alloc */ 1037 sk_memory_allocated_add(sk, pages); 1038 allocated = sk_memory_allocated(sk); 1039 /* If the system goes into memory pressure with this 1040 * precharge, give up and return error. 1041 */ 1042 if (allocated > sk_prot_mem_limits(sk, 1)) { 1043 sk_memory_allocated_sub(sk, pages); 1044 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 1045 return -ENOMEM; 1046 } 1047 sk->sk_forward_alloc += pages << PAGE_SHIFT; 1048 1049 WRITE_ONCE(sk->sk_reserved_mem, 1050 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1051 1052 return 0; 1053 } 1054 1055 void sockopt_lock_sock(struct sock *sk) 1056 { 1057 /* When current->bpf_ctx is set, the setsockopt is called from 1058 * a bpf prog. bpf has ensured the sk lock has been 1059 * acquired before calling setsockopt(). 1060 */ 1061 if (has_current_bpf_ctx()) 1062 return; 1063 1064 lock_sock(sk); 1065 } 1066 EXPORT_SYMBOL(sockopt_lock_sock); 1067 1068 void sockopt_release_sock(struct sock *sk) 1069 { 1070 if (has_current_bpf_ctx()) 1071 return; 1072 1073 release_sock(sk); 1074 } 1075 EXPORT_SYMBOL(sockopt_release_sock); 1076 1077 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1078 { 1079 return has_current_bpf_ctx() || ns_capable(ns, cap); 1080 } 1081 EXPORT_SYMBOL(sockopt_ns_capable); 1082 1083 bool sockopt_capable(int cap) 1084 { 1085 return has_current_bpf_ctx() || capable(cap); 1086 } 1087 EXPORT_SYMBOL(sockopt_capable); 1088 1089 /* 1090 * This is meant for all protocols to use and covers goings on 1091 * at the socket level. Everything here is generic. 1092 */ 1093 1094 int sk_setsockopt(struct sock *sk, int level, int optname, 1095 sockptr_t optval, unsigned int optlen) 1096 { 1097 struct so_timestamping timestamping; 1098 struct socket *sock = sk->sk_socket; 1099 struct sock_txtime sk_txtime; 1100 int val; 1101 int valbool; 1102 struct linger ling; 1103 int ret = 0; 1104 1105 /* 1106 * Options without arguments 1107 */ 1108 1109 if (optname == SO_BINDTODEVICE) 1110 return sock_setbindtodevice(sk, optval, optlen); 1111 1112 if (optlen < sizeof(int)) 1113 return -EINVAL; 1114 1115 if (copy_from_sockptr(&val, optval, sizeof(val))) 1116 return -EFAULT; 1117 1118 valbool = val ? 1 : 0; 1119 1120 sockopt_lock_sock(sk); 1121 1122 switch (optname) { 1123 case SO_DEBUG: 1124 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1125 ret = -EACCES; 1126 else 1127 sock_valbool_flag(sk, SOCK_DBG, valbool); 1128 break; 1129 case SO_REUSEADDR: 1130 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1131 break; 1132 case SO_REUSEPORT: 1133 sk->sk_reuseport = valbool; 1134 break; 1135 case SO_TYPE: 1136 case SO_PROTOCOL: 1137 case SO_DOMAIN: 1138 case SO_ERROR: 1139 ret = -ENOPROTOOPT; 1140 break; 1141 case SO_DONTROUTE: 1142 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1143 sk_dst_reset(sk); 1144 break; 1145 case SO_BROADCAST: 1146 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1147 break; 1148 case SO_SNDBUF: 1149 /* Don't error on this BSD doesn't and if you think 1150 * about it this is right. Otherwise apps have to 1151 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1152 * are treated in BSD as hints 1153 */ 1154 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1155 set_sndbuf: 1156 /* Ensure val * 2 fits into an int, to prevent max_t() 1157 * from treating it as a negative value. 1158 */ 1159 val = min_t(int, val, INT_MAX / 2); 1160 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1161 WRITE_ONCE(sk->sk_sndbuf, 1162 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1163 /* Wake up sending tasks if we upped the value. */ 1164 sk->sk_write_space(sk); 1165 break; 1166 1167 case SO_SNDBUFFORCE: 1168 if (!sockopt_capable(CAP_NET_ADMIN)) { 1169 ret = -EPERM; 1170 break; 1171 } 1172 1173 /* No negative values (to prevent underflow, as val will be 1174 * multiplied by 2). 1175 */ 1176 if (val < 0) 1177 val = 0; 1178 goto set_sndbuf; 1179 1180 case SO_RCVBUF: 1181 /* Don't error on this BSD doesn't and if you think 1182 * about it this is right. Otherwise apps have to 1183 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1184 * are treated in BSD as hints 1185 */ 1186 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1187 break; 1188 1189 case SO_RCVBUFFORCE: 1190 if (!sockopt_capable(CAP_NET_ADMIN)) { 1191 ret = -EPERM; 1192 break; 1193 } 1194 1195 /* No negative values (to prevent underflow, as val will be 1196 * multiplied by 2). 1197 */ 1198 __sock_set_rcvbuf(sk, max(val, 0)); 1199 break; 1200 1201 case SO_KEEPALIVE: 1202 if (sk->sk_prot->keepalive) 1203 sk->sk_prot->keepalive(sk, valbool); 1204 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1205 break; 1206 1207 case SO_OOBINLINE: 1208 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1209 break; 1210 1211 case SO_NO_CHECK: 1212 sk->sk_no_check_tx = valbool; 1213 break; 1214 1215 case SO_PRIORITY: 1216 if ((val >= 0 && val <= 6) || 1217 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 1218 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1219 WRITE_ONCE(sk->sk_priority, val); 1220 else 1221 ret = -EPERM; 1222 break; 1223 1224 case SO_LINGER: 1225 if (optlen < sizeof(ling)) { 1226 ret = -EINVAL; /* 1003.1g */ 1227 break; 1228 } 1229 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1230 ret = -EFAULT; 1231 break; 1232 } 1233 if (!ling.l_onoff) 1234 sock_reset_flag(sk, SOCK_LINGER); 1235 else { 1236 #if (BITS_PER_LONG == 32) 1237 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 1238 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 1239 else 1240 #endif 1241 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 1242 sock_set_flag(sk, SOCK_LINGER); 1243 } 1244 break; 1245 1246 case SO_BSDCOMPAT: 1247 break; 1248 1249 case SO_PASSCRED: 1250 assign_bit(SOCK_PASSCRED, &sock->flags, valbool); 1251 break; 1252 1253 case SO_PASSPIDFD: 1254 assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); 1255 break; 1256 1257 case SO_TIMESTAMP_OLD: 1258 case SO_TIMESTAMP_NEW: 1259 case SO_TIMESTAMPNS_OLD: 1260 case SO_TIMESTAMPNS_NEW: 1261 sock_set_timestamp(sk, optname, valbool); 1262 break; 1263 1264 case SO_TIMESTAMPING_NEW: 1265 case SO_TIMESTAMPING_OLD: 1266 if (optlen == sizeof(timestamping)) { 1267 if (copy_from_sockptr(×tamping, optval, 1268 sizeof(timestamping))) { 1269 ret = -EFAULT; 1270 break; 1271 } 1272 } else { 1273 memset(×tamping, 0, sizeof(timestamping)); 1274 timestamping.flags = val; 1275 } 1276 ret = sock_set_timestamping(sk, optname, timestamping); 1277 break; 1278 1279 case SO_RCVLOWAT: 1280 { 1281 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1282 1283 if (val < 0) 1284 val = INT_MAX; 1285 if (sock) 1286 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1287 if (set_rcvlowat) 1288 ret = set_rcvlowat(sk, val); 1289 else 1290 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); 1291 break; 1292 } 1293 case SO_RCVTIMEO_OLD: 1294 case SO_RCVTIMEO_NEW: 1295 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1296 optlen, optname == SO_RCVTIMEO_OLD); 1297 break; 1298 1299 case SO_SNDTIMEO_OLD: 1300 case SO_SNDTIMEO_NEW: 1301 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1302 optlen, optname == SO_SNDTIMEO_OLD); 1303 break; 1304 1305 case SO_ATTACH_FILTER: { 1306 struct sock_fprog fprog; 1307 1308 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1309 if (!ret) 1310 ret = sk_attach_filter(&fprog, sk); 1311 break; 1312 } 1313 case SO_ATTACH_BPF: 1314 ret = -EINVAL; 1315 if (optlen == sizeof(u32)) { 1316 u32 ufd; 1317 1318 ret = -EFAULT; 1319 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1320 break; 1321 1322 ret = sk_attach_bpf(ufd, sk); 1323 } 1324 break; 1325 1326 case SO_ATTACH_REUSEPORT_CBPF: { 1327 struct sock_fprog fprog; 1328 1329 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1330 if (!ret) 1331 ret = sk_reuseport_attach_filter(&fprog, sk); 1332 break; 1333 } 1334 case SO_ATTACH_REUSEPORT_EBPF: 1335 ret = -EINVAL; 1336 if (optlen == sizeof(u32)) { 1337 u32 ufd; 1338 1339 ret = -EFAULT; 1340 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1341 break; 1342 1343 ret = sk_reuseport_attach_bpf(ufd, sk); 1344 } 1345 break; 1346 1347 case SO_DETACH_REUSEPORT_BPF: 1348 ret = reuseport_detach_prog(sk); 1349 break; 1350 1351 case SO_DETACH_FILTER: 1352 ret = sk_detach_filter(sk); 1353 break; 1354 1355 case SO_LOCK_FILTER: 1356 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1357 ret = -EPERM; 1358 else 1359 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1360 break; 1361 1362 case SO_PASSSEC: 1363 assign_bit(SOCK_PASSSEC, &sock->flags, valbool); 1364 break; 1365 case SO_MARK: 1366 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1367 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1368 ret = -EPERM; 1369 break; 1370 } 1371 1372 __sock_set_mark(sk, val); 1373 break; 1374 case SO_RCVMARK: 1375 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1376 break; 1377 1378 case SO_RXQ_OVFL: 1379 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1380 break; 1381 1382 case SO_WIFI_STATUS: 1383 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1384 break; 1385 1386 case SO_PEEK_OFF: 1387 { 1388 int (*set_peek_off)(struct sock *sk, int val); 1389 1390 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1391 if (set_peek_off) 1392 ret = set_peek_off(sk, val); 1393 else 1394 ret = -EOPNOTSUPP; 1395 break; 1396 } 1397 1398 case SO_NOFCS: 1399 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1400 break; 1401 1402 case SO_SELECT_ERR_QUEUE: 1403 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1404 break; 1405 1406 #ifdef CONFIG_NET_RX_BUSY_POLL 1407 case SO_BUSY_POLL: 1408 if (val < 0) 1409 ret = -EINVAL; 1410 else 1411 WRITE_ONCE(sk->sk_ll_usec, val); 1412 break; 1413 case SO_PREFER_BUSY_POLL: 1414 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1415 ret = -EPERM; 1416 else 1417 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1418 break; 1419 case SO_BUSY_POLL_BUDGET: 1420 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { 1421 ret = -EPERM; 1422 } else { 1423 if (val < 0 || val > U16_MAX) 1424 ret = -EINVAL; 1425 else 1426 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1427 } 1428 break; 1429 #endif 1430 1431 case SO_MAX_PACING_RATE: 1432 { 1433 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1434 1435 if (sizeof(ulval) != sizeof(val) && 1436 optlen >= sizeof(ulval) && 1437 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1438 ret = -EFAULT; 1439 break; 1440 } 1441 if (ulval != ~0UL) 1442 cmpxchg(&sk->sk_pacing_status, 1443 SK_PACING_NONE, 1444 SK_PACING_NEEDED); 1445 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1446 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1447 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1448 break; 1449 } 1450 case SO_INCOMING_CPU: 1451 reuseport_update_incoming_cpu(sk, val); 1452 break; 1453 1454 case SO_CNX_ADVICE: 1455 if (val == 1) 1456 dst_negative_advice(sk); 1457 break; 1458 1459 case SO_ZEROCOPY: 1460 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1461 if (!(sk_is_tcp(sk) || 1462 (sk->sk_type == SOCK_DGRAM && 1463 sk->sk_protocol == IPPROTO_UDP))) 1464 ret = -EOPNOTSUPP; 1465 } else if (sk->sk_family != PF_RDS) { 1466 ret = -EOPNOTSUPP; 1467 } 1468 if (!ret) { 1469 if (val < 0 || val > 1) 1470 ret = -EINVAL; 1471 else 1472 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1473 } 1474 break; 1475 1476 case SO_TXTIME: 1477 if (optlen != sizeof(struct sock_txtime)) { 1478 ret = -EINVAL; 1479 break; 1480 } else if (copy_from_sockptr(&sk_txtime, optval, 1481 sizeof(struct sock_txtime))) { 1482 ret = -EFAULT; 1483 break; 1484 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1485 ret = -EINVAL; 1486 break; 1487 } 1488 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1489 * scheduler has enough safe guards. 1490 */ 1491 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1492 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1493 ret = -EPERM; 1494 break; 1495 } 1496 sock_valbool_flag(sk, SOCK_TXTIME, true); 1497 sk->sk_clockid = sk_txtime.clockid; 1498 sk->sk_txtime_deadline_mode = 1499 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1500 sk->sk_txtime_report_errors = 1501 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1502 break; 1503 1504 case SO_BINDTOIFINDEX: 1505 ret = sock_bindtoindex_locked(sk, val); 1506 break; 1507 1508 case SO_BUF_LOCK: 1509 if (val & ~SOCK_BUF_LOCK_MASK) { 1510 ret = -EINVAL; 1511 break; 1512 } 1513 sk->sk_userlocks = val | (sk->sk_userlocks & 1514 ~SOCK_BUF_LOCK_MASK); 1515 break; 1516 1517 case SO_RESERVE_MEM: 1518 { 1519 int delta; 1520 1521 if (val < 0) { 1522 ret = -EINVAL; 1523 break; 1524 } 1525 1526 delta = val - sk->sk_reserved_mem; 1527 if (delta < 0) 1528 sock_release_reserved_memory(sk, -delta); 1529 else 1530 ret = sock_reserve_memory(sk, delta); 1531 break; 1532 } 1533 1534 case SO_TXREHASH: 1535 if (val < -1 || val > 1) { 1536 ret = -EINVAL; 1537 break; 1538 } 1539 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1540 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1541 /* Paired with READ_ONCE() in tcp_rtx_synack() 1542 * and sk_getsockopt(). 1543 */ 1544 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1545 break; 1546 1547 default: 1548 ret = -ENOPROTOOPT; 1549 break; 1550 } 1551 sockopt_release_sock(sk); 1552 return ret; 1553 } 1554 1555 int sock_setsockopt(struct socket *sock, int level, int optname, 1556 sockptr_t optval, unsigned int optlen) 1557 { 1558 return sk_setsockopt(sock->sk, level, optname, 1559 optval, optlen); 1560 } 1561 EXPORT_SYMBOL(sock_setsockopt); 1562 1563 static const struct cred *sk_get_peer_cred(struct sock *sk) 1564 { 1565 const struct cred *cred; 1566 1567 spin_lock(&sk->sk_peer_lock); 1568 cred = get_cred(sk->sk_peer_cred); 1569 spin_unlock(&sk->sk_peer_lock); 1570 1571 return cred; 1572 } 1573 1574 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1575 struct ucred *ucred) 1576 { 1577 ucred->pid = pid_vnr(pid); 1578 ucred->uid = ucred->gid = -1; 1579 if (cred) { 1580 struct user_namespace *current_ns = current_user_ns(); 1581 1582 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1583 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1584 } 1585 } 1586 1587 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1588 { 1589 struct user_namespace *user_ns = current_user_ns(); 1590 int i; 1591 1592 for (i = 0; i < src->ngroups; i++) { 1593 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1594 1595 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1596 return -EFAULT; 1597 } 1598 1599 return 0; 1600 } 1601 1602 int sk_getsockopt(struct sock *sk, int level, int optname, 1603 sockptr_t optval, sockptr_t optlen) 1604 { 1605 struct socket *sock = sk->sk_socket; 1606 1607 union { 1608 int val; 1609 u64 val64; 1610 unsigned long ulval; 1611 struct linger ling; 1612 struct old_timeval32 tm32; 1613 struct __kernel_old_timeval tm; 1614 struct __kernel_sock_timeval stm; 1615 struct sock_txtime txtime; 1616 struct so_timestamping timestamping; 1617 } v; 1618 1619 int lv = sizeof(int); 1620 int len; 1621 1622 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1623 return -EFAULT; 1624 if (len < 0) 1625 return -EINVAL; 1626 1627 memset(&v, 0, sizeof(v)); 1628 1629 switch (optname) { 1630 case SO_DEBUG: 1631 v.val = sock_flag(sk, SOCK_DBG); 1632 break; 1633 1634 case SO_DONTROUTE: 1635 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1636 break; 1637 1638 case SO_BROADCAST: 1639 v.val = sock_flag(sk, SOCK_BROADCAST); 1640 break; 1641 1642 case SO_SNDBUF: 1643 v.val = READ_ONCE(sk->sk_sndbuf); 1644 break; 1645 1646 case SO_RCVBUF: 1647 v.val = READ_ONCE(sk->sk_rcvbuf); 1648 break; 1649 1650 case SO_REUSEADDR: 1651 v.val = sk->sk_reuse; 1652 break; 1653 1654 case SO_REUSEPORT: 1655 v.val = sk->sk_reuseport; 1656 break; 1657 1658 case SO_KEEPALIVE: 1659 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1660 break; 1661 1662 case SO_TYPE: 1663 v.val = sk->sk_type; 1664 break; 1665 1666 case SO_PROTOCOL: 1667 v.val = sk->sk_protocol; 1668 break; 1669 1670 case SO_DOMAIN: 1671 v.val = sk->sk_family; 1672 break; 1673 1674 case SO_ERROR: 1675 v.val = -sock_error(sk); 1676 if (v.val == 0) 1677 v.val = xchg(&sk->sk_err_soft, 0); 1678 break; 1679 1680 case SO_OOBINLINE: 1681 v.val = sock_flag(sk, SOCK_URGINLINE); 1682 break; 1683 1684 case SO_NO_CHECK: 1685 v.val = sk->sk_no_check_tx; 1686 break; 1687 1688 case SO_PRIORITY: 1689 v.val = READ_ONCE(sk->sk_priority); 1690 break; 1691 1692 case SO_LINGER: 1693 lv = sizeof(v.ling); 1694 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1695 v.ling.l_linger = sk->sk_lingertime / HZ; 1696 break; 1697 1698 case SO_BSDCOMPAT: 1699 break; 1700 1701 case SO_TIMESTAMP_OLD: 1702 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1703 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1704 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1705 break; 1706 1707 case SO_TIMESTAMPNS_OLD: 1708 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1709 break; 1710 1711 case SO_TIMESTAMP_NEW: 1712 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1713 break; 1714 1715 case SO_TIMESTAMPNS_NEW: 1716 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1717 break; 1718 1719 case SO_TIMESTAMPING_OLD: 1720 lv = sizeof(v.timestamping); 1721 v.timestamping.flags = sk->sk_tsflags; 1722 v.timestamping.bind_phc = sk->sk_bind_phc; 1723 break; 1724 1725 case SO_RCVTIMEO_OLD: 1726 case SO_RCVTIMEO_NEW: 1727 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1728 SO_RCVTIMEO_OLD == optname); 1729 break; 1730 1731 case SO_SNDTIMEO_OLD: 1732 case SO_SNDTIMEO_NEW: 1733 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1734 SO_SNDTIMEO_OLD == optname); 1735 break; 1736 1737 case SO_RCVLOWAT: 1738 v.val = READ_ONCE(sk->sk_rcvlowat); 1739 break; 1740 1741 case SO_SNDLOWAT: 1742 v.val = 1; 1743 break; 1744 1745 case SO_PASSCRED: 1746 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1747 break; 1748 1749 case SO_PASSPIDFD: 1750 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); 1751 break; 1752 1753 case SO_PEERCRED: 1754 { 1755 struct ucred peercred; 1756 if (len > sizeof(peercred)) 1757 len = sizeof(peercred); 1758 1759 spin_lock(&sk->sk_peer_lock); 1760 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1761 spin_unlock(&sk->sk_peer_lock); 1762 1763 if (copy_to_sockptr(optval, &peercred, len)) 1764 return -EFAULT; 1765 goto lenout; 1766 } 1767 1768 case SO_PEERPIDFD: 1769 { 1770 struct pid *peer_pid; 1771 struct file *pidfd_file = NULL; 1772 int pidfd; 1773 1774 if (len > sizeof(pidfd)) 1775 len = sizeof(pidfd); 1776 1777 spin_lock(&sk->sk_peer_lock); 1778 peer_pid = get_pid(sk->sk_peer_pid); 1779 spin_unlock(&sk->sk_peer_lock); 1780 1781 if (!peer_pid) 1782 return -ENODATA; 1783 1784 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); 1785 put_pid(peer_pid); 1786 if (pidfd < 0) 1787 return pidfd; 1788 1789 if (copy_to_sockptr(optval, &pidfd, len) || 1790 copy_to_sockptr(optlen, &len, sizeof(int))) { 1791 put_unused_fd(pidfd); 1792 fput(pidfd_file); 1793 1794 return -EFAULT; 1795 } 1796 1797 fd_install(pidfd, pidfd_file); 1798 return 0; 1799 } 1800 1801 case SO_PEERGROUPS: 1802 { 1803 const struct cred *cred; 1804 int ret, n; 1805 1806 cred = sk_get_peer_cred(sk); 1807 if (!cred) 1808 return -ENODATA; 1809 1810 n = cred->group_info->ngroups; 1811 if (len < n * sizeof(gid_t)) { 1812 len = n * sizeof(gid_t); 1813 put_cred(cred); 1814 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1815 } 1816 len = n * sizeof(gid_t); 1817 1818 ret = groups_to_user(optval, cred->group_info); 1819 put_cred(cred); 1820 if (ret) 1821 return ret; 1822 goto lenout; 1823 } 1824 1825 case SO_PEERNAME: 1826 { 1827 struct sockaddr_storage address; 1828 1829 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1830 if (lv < 0) 1831 return -ENOTCONN; 1832 if (lv < len) 1833 return -EINVAL; 1834 if (copy_to_sockptr(optval, &address, len)) 1835 return -EFAULT; 1836 goto lenout; 1837 } 1838 1839 /* Dubious BSD thing... Probably nobody even uses it, but 1840 * the UNIX standard wants it for whatever reason... -DaveM 1841 */ 1842 case SO_ACCEPTCONN: 1843 v.val = sk->sk_state == TCP_LISTEN; 1844 break; 1845 1846 case SO_PASSSEC: 1847 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1848 break; 1849 1850 case SO_PEERSEC: 1851 return security_socket_getpeersec_stream(sock, 1852 optval, optlen, len); 1853 1854 case SO_MARK: 1855 v.val = READ_ONCE(sk->sk_mark); 1856 break; 1857 1858 case SO_RCVMARK: 1859 v.val = sock_flag(sk, SOCK_RCVMARK); 1860 break; 1861 1862 case SO_RXQ_OVFL: 1863 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1864 break; 1865 1866 case SO_WIFI_STATUS: 1867 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1868 break; 1869 1870 case SO_PEEK_OFF: 1871 if (!READ_ONCE(sock->ops)->set_peek_off) 1872 return -EOPNOTSUPP; 1873 1874 v.val = READ_ONCE(sk->sk_peek_off); 1875 break; 1876 case SO_NOFCS: 1877 v.val = sock_flag(sk, SOCK_NOFCS); 1878 break; 1879 1880 case SO_BINDTODEVICE: 1881 return sock_getbindtodevice(sk, optval, optlen, len); 1882 1883 case SO_GET_FILTER: 1884 len = sk_get_filter(sk, optval, len); 1885 if (len < 0) 1886 return len; 1887 1888 goto lenout; 1889 1890 case SO_LOCK_FILTER: 1891 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1892 break; 1893 1894 case SO_BPF_EXTENSIONS: 1895 v.val = bpf_tell_extensions(); 1896 break; 1897 1898 case SO_SELECT_ERR_QUEUE: 1899 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1900 break; 1901 1902 #ifdef CONFIG_NET_RX_BUSY_POLL 1903 case SO_BUSY_POLL: 1904 v.val = READ_ONCE(sk->sk_ll_usec); 1905 break; 1906 case SO_PREFER_BUSY_POLL: 1907 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1908 break; 1909 #endif 1910 1911 case SO_MAX_PACING_RATE: 1912 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 1913 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1914 lv = sizeof(v.ulval); 1915 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 1916 } else { 1917 /* 32bit version */ 1918 v.val = min_t(unsigned long, ~0U, 1919 READ_ONCE(sk->sk_max_pacing_rate)); 1920 } 1921 break; 1922 1923 case SO_INCOMING_CPU: 1924 v.val = READ_ONCE(sk->sk_incoming_cpu); 1925 break; 1926 1927 case SO_MEMINFO: 1928 { 1929 u32 meminfo[SK_MEMINFO_VARS]; 1930 1931 sk_get_meminfo(sk, meminfo); 1932 1933 len = min_t(unsigned int, len, sizeof(meminfo)); 1934 if (copy_to_sockptr(optval, &meminfo, len)) 1935 return -EFAULT; 1936 1937 goto lenout; 1938 } 1939 1940 #ifdef CONFIG_NET_RX_BUSY_POLL 1941 case SO_INCOMING_NAPI_ID: 1942 v.val = READ_ONCE(sk->sk_napi_id); 1943 1944 /* aggregate non-NAPI IDs down to 0 */ 1945 if (v.val < MIN_NAPI_ID) 1946 v.val = 0; 1947 1948 break; 1949 #endif 1950 1951 case SO_COOKIE: 1952 lv = sizeof(u64); 1953 if (len < lv) 1954 return -EINVAL; 1955 v.val64 = sock_gen_cookie(sk); 1956 break; 1957 1958 case SO_ZEROCOPY: 1959 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1960 break; 1961 1962 case SO_TXTIME: 1963 lv = sizeof(v.txtime); 1964 v.txtime.clockid = sk->sk_clockid; 1965 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1966 SOF_TXTIME_DEADLINE_MODE : 0; 1967 v.txtime.flags |= sk->sk_txtime_report_errors ? 1968 SOF_TXTIME_REPORT_ERRORS : 0; 1969 break; 1970 1971 case SO_BINDTOIFINDEX: 1972 v.val = READ_ONCE(sk->sk_bound_dev_if); 1973 break; 1974 1975 case SO_NETNS_COOKIE: 1976 lv = sizeof(u64); 1977 if (len != lv) 1978 return -EINVAL; 1979 v.val64 = sock_net(sk)->net_cookie; 1980 break; 1981 1982 case SO_BUF_LOCK: 1983 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 1984 break; 1985 1986 case SO_RESERVE_MEM: 1987 v.val = READ_ONCE(sk->sk_reserved_mem); 1988 break; 1989 1990 case SO_TXREHASH: 1991 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 1992 v.val = READ_ONCE(sk->sk_txrehash); 1993 break; 1994 1995 default: 1996 /* We implement the SO_SNDLOWAT etc to not be settable 1997 * (1003.1g 7). 1998 */ 1999 return -ENOPROTOOPT; 2000 } 2001 2002 if (len > lv) 2003 len = lv; 2004 if (copy_to_sockptr(optval, &v, len)) 2005 return -EFAULT; 2006 lenout: 2007 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2008 return -EFAULT; 2009 return 0; 2010 } 2011 2012 int sock_getsockopt(struct socket *sock, int level, int optname, 2013 char __user *optval, int __user *optlen) 2014 { 2015 return sk_getsockopt(sock->sk, level, optname, 2016 USER_SOCKPTR(optval), 2017 USER_SOCKPTR(optlen)); 2018 } 2019 2020 /* 2021 * Initialize an sk_lock. 2022 * 2023 * (We also register the sk_lock with the lock validator.) 2024 */ 2025 static inline void sock_lock_init(struct sock *sk) 2026 { 2027 if (sk->sk_kern_sock) 2028 sock_lock_init_class_and_name( 2029 sk, 2030 af_family_kern_slock_key_strings[sk->sk_family], 2031 af_family_kern_slock_keys + sk->sk_family, 2032 af_family_kern_key_strings[sk->sk_family], 2033 af_family_kern_keys + sk->sk_family); 2034 else 2035 sock_lock_init_class_and_name( 2036 sk, 2037 af_family_slock_key_strings[sk->sk_family], 2038 af_family_slock_keys + sk->sk_family, 2039 af_family_key_strings[sk->sk_family], 2040 af_family_keys + sk->sk_family); 2041 } 2042 2043 /* 2044 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2045 * even temporarly, because of RCU lookups. sk_node should also be left as is. 2046 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2047 */ 2048 static void sock_copy(struct sock *nsk, const struct sock *osk) 2049 { 2050 const struct proto *prot = READ_ONCE(osk->sk_prot); 2051 #ifdef CONFIG_SECURITY_NETWORK 2052 void *sptr = nsk->sk_security; 2053 #endif 2054 2055 /* If we move sk_tx_queue_mapping out of the private section, 2056 * we must check if sk_tx_queue_clear() is called after 2057 * sock_copy() in sk_clone_lock(). 2058 */ 2059 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2060 offsetof(struct sock, sk_dontcopy_begin) || 2061 offsetof(struct sock, sk_tx_queue_mapping) >= 2062 offsetof(struct sock, sk_dontcopy_end)); 2063 2064 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2065 2066 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2067 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 2068 2069 #ifdef CONFIG_SECURITY_NETWORK 2070 nsk->sk_security = sptr; 2071 security_sk_clone(osk, nsk); 2072 #endif 2073 } 2074 2075 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2076 int family) 2077 { 2078 struct sock *sk; 2079 struct kmem_cache *slab; 2080 2081 slab = prot->slab; 2082 if (slab != NULL) { 2083 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2084 if (!sk) 2085 return sk; 2086 if (want_init_on_alloc(priority)) 2087 sk_prot_clear_nulls(sk, prot->obj_size); 2088 } else 2089 sk = kmalloc(prot->obj_size, priority); 2090 2091 if (sk != NULL) { 2092 if (security_sk_alloc(sk, family, priority)) 2093 goto out_free; 2094 2095 if (!try_module_get(prot->owner)) 2096 goto out_free_sec; 2097 } 2098 2099 return sk; 2100 2101 out_free_sec: 2102 security_sk_free(sk); 2103 out_free: 2104 if (slab != NULL) 2105 kmem_cache_free(slab, sk); 2106 else 2107 kfree(sk); 2108 return NULL; 2109 } 2110 2111 static void sk_prot_free(struct proto *prot, struct sock *sk) 2112 { 2113 struct kmem_cache *slab; 2114 struct module *owner; 2115 2116 owner = prot->owner; 2117 slab = prot->slab; 2118 2119 cgroup_sk_free(&sk->sk_cgrp_data); 2120 mem_cgroup_sk_free(sk); 2121 security_sk_free(sk); 2122 if (slab != NULL) 2123 kmem_cache_free(slab, sk); 2124 else 2125 kfree(sk); 2126 module_put(owner); 2127 } 2128 2129 /** 2130 * sk_alloc - All socket objects are allocated here 2131 * @net: the applicable net namespace 2132 * @family: protocol family 2133 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2134 * @prot: struct proto associated with this new sock instance 2135 * @kern: is this to be a kernel socket? 2136 */ 2137 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2138 struct proto *prot, int kern) 2139 { 2140 struct sock *sk; 2141 2142 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2143 if (sk) { 2144 sk->sk_family = family; 2145 /* 2146 * See comment in struct sock definition to understand 2147 * why we need sk_prot_creator -acme 2148 */ 2149 sk->sk_prot = sk->sk_prot_creator = prot; 2150 sk->sk_kern_sock = kern; 2151 sock_lock_init(sk); 2152 sk->sk_net_refcnt = kern ? 0 : 1; 2153 if (likely(sk->sk_net_refcnt)) { 2154 get_net_track(net, &sk->ns_tracker, priority); 2155 sock_inuse_add(net, 1); 2156 } else { 2157 __netns_tracker_alloc(net, &sk->ns_tracker, 2158 false, priority); 2159 } 2160 2161 sock_net_set(sk, net); 2162 refcount_set(&sk->sk_wmem_alloc, 1); 2163 2164 mem_cgroup_sk_alloc(sk); 2165 cgroup_sk_alloc(&sk->sk_cgrp_data); 2166 sock_update_classid(&sk->sk_cgrp_data); 2167 sock_update_netprioidx(&sk->sk_cgrp_data); 2168 sk_tx_queue_clear(sk); 2169 } 2170 2171 return sk; 2172 } 2173 EXPORT_SYMBOL(sk_alloc); 2174 2175 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2176 * grace period. This is the case for UDP sockets and TCP listeners. 2177 */ 2178 static void __sk_destruct(struct rcu_head *head) 2179 { 2180 struct sock *sk = container_of(head, struct sock, sk_rcu); 2181 struct sk_filter *filter; 2182 2183 if (sk->sk_destruct) 2184 sk->sk_destruct(sk); 2185 2186 filter = rcu_dereference_check(sk->sk_filter, 2187 refcount_read(&sk->sk_wmem_alloc) == 0); 2188 if (filter) { 2189 sk_filter_uncharge(sk, filter); 2190 RCU_INIT_POINTER(sk->sk_filter, NULL); 2191 } 2192 2193 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2194 2195 #ifdef CONFIG_BPF_SYSCALL 2196 bpf_sk_storage_free(sk); 2197 #endif 2198 2199 if (atomic_read(&sk->sk_omem_alloc)) 2200 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2201 __func__, atomic_read(&sk->sk_omem_alloc)); 2202 2203 if (sk->sk_frag.page) { 2204 put_page(sk->sk_frag.page); 2205 sk->sk_frag.page = NULL; 2206 } 2207 2208 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2209 put_cred(sk->sk_peer_cred); 2210 put_pid(sk->sk_peer_pid); 2211 2212 if (likely(sk->sk_net_refcnt)) 2213 put_net_track(sock_net(sk), &sk->ns_tracker); 2214 else 2215 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); 2216 2217 sk_prot_free(sk->sk_prot_creator, sk); 2218 } 2219 2220 void sk_destruct(struct sock *sk) 2221 { 2222 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2223 2224 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2225 reuseport_detach_sock(sk); 2226 use_call_rcu = true; 2227 } 2228 2229 if (use_call_rcu) 2230 call_rcu(&sk->sk_rcu, __sk_destruct); 2231 else 2232 __sk_destruct(&sk->sk_rcu); 2233 } 2234 2235 static void __sk_free(struct sock *sk) 2236 { 2237 if (likely(sk->sk_net_refcnt)) 2238 sock_inuse_add(sock_net(sk), -1); 2239 2240 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2241 sock_diag_broadcast_destroy(sk); 2242 else 2243 sk_destruct(sk); 2244 } 2245 2246 void sk_free(struct sock *sk) 2247 { 2248 /* 2249 * We subtract one from sk_wmem_alloc and can know if 2250 * some packets are still in some tx queue. 2251 * If not null, sock_wfree() will call __sk_free(sk) later 2252 */ 2253 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2254 __sk_free(sk); 2255 } 2256 EXPORT_SYMBOL(sk_free); 2257 2258 static void sk_init_common(struct sock *sk) 2259 { 2260 skb_queue_head_init(&sk->sk_receive_queue); 2261 skb_queue_head_init(&sk->sk_write_queue); 2262 skb_queue_head_init(&sk->sk_error_queue); 2263 2264 rwlock_init(&sk->sk_callback_lock); 2265 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2266 af_rlock_keys + sk->sk_family, 2267 af_family_rlock_key_strings[sk->sk_family]); 2268 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2269 af_wlock_keys + sk->sk_family, 2270 af_family_wlock_key_strings[sk->sk_family]); 2271 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2272 af_elock_keys + sk->sk_family, 2273 af_family_elock_key_strings[sk->sk_family]); 2274 lockdep_set_class_and_name(&sk->sk_callback_lock, 2275 af_callback_keys + sk->sk_family, 2276 af_family_clock_key_strings[sk->sk_family]); 2277 } 2278 2279 /** 2280 * sk_clone_lock - clone a socket, and lock its clone 2281 * @sk: the socket to clone 2282 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2283 * 2284 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2285 */ 2286 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2287 { 2288 struct proto *prot = READ_ONCE(sk->sk_prot); 2289 struct sk_filter *filter; 2290 bool is_charged = true; 2291 struct sock *newsk; 2292 2293 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2294 if (!newsk) 2295 goto out; 2296 2297 sock_copy(newsk, sk); 2298 2299 newsk->sk_prot_creator = prot; 2300 2301 /* SANITY */ 2302 if (likely(newsk->sk_net_refcnt)) { 2303 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2304 sock_inuse_add(sock_net(newsk), 1); 2305 } else { 2306 /* Kernel sockets are not elevating the struct net refcount. 2307 * Instead, use a tracker to more easily detect if a layer 2308 * is not properly dismantling its kernel sockets at netns 2309 * destroy time. 2310 */ 2311 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2312 false, priority); 2313 } 2314 sk_node_init(&newsk->sk_node); 2315 sock_lock_init(newsk); 2316 bh_lock_sock(newsk); 2317 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2318 newsk->sk_backlog.len = 0; 2319 2320 atomic_set(&newsk->sk_rmem_alloc, 0); 2321 2322 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2323 refcount_set(&newsk->sk_wmem_alloc, 1); 2324 2325 atomic_set(&newsk->sk_omem_alloc, 0); 2326 sk_init_common(newsk); 2327 2328 newsk->sk_dst_cache = NULL; 2329 newsk->sk_dst_pending_confirm = 0; 2330 newsk->sk_wmem_queued = 0; 2331 newsk->sk_forward_alloc = 0; 2332 newsk->sk_reserved_mem = 0; 2333 atomic_set(&newsk->sk_drops, 0); 2334 newsk->sk_send_head = NULL; 2335 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2336 atomic_set(&newsk->sk_zckey, 0); 2337 2338 sock_reset_flag(newsk, SOCK_DONE); 2339 2340 /* sk->sk_memcg will be populated at accept() time */ 2341 newsk->sk_memcg = NULL; 2342 2343 cgroup_sk_clone(&newsk->sk_cgrp_data); 2344 2345 rcu_read_lock(); 2346 filter = rcu_dereference(sk->sk_filter); 2347 if (filter != NULL) 2348 /* though it's an empty new sock, the charging may fail 2349 * if sysctl_optmem_max was changed between creation of 2350 * original socket and cloning 2351 */ 2352 is_charged = sk_filter_charge(newsk, filter); 2353 RCU_INIT_POINTER(newsk->sk_filter, filter); 2354 rcu_read_unlock(); 2355 2356 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2357 /* We need to make sure that we don't uncharge the new 2358 * socket if we couldn't charge it in the first place 2359 * as otherwise we uncharge the parent's filter. 2360 */ 2361 if (!is_charged) 2362 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2363 sk_free_unlock_clone(newsk); 2364 newsk = NULL; 2365 goto out; 2366 } 2367 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2368 2369 if (bpf_sk_storage_clone(sk, newsk)) { 2370 sk_free_unlock_clone(newsk); 2371 newsk = NULL; 2372 goto out; 2373 } 2374 2375 /* Clear sk_user_data if parent had the pointer tagged 2376 * as not suitable for copying when cloning. 2377 */ 2378 if (sk_user_data_is_nocopy(newsk)) 2379 newsk->sk_user_data = NULL; 2380 2381 newsk->sk_err = 0; 2382 newsk->sk_err_soft = 0; 2383 newsk->sk_priority = 0; 2384 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2385 2386 /* Before updating sk_refcnt, we must commit prior changes to memory 2387 * (Documentation/RCU/rculist_nulls.rst for details) 2388 */ 2389 smp_wmb(); 2390 refcount_set(&newsk->sk_refcnt, 2); 2391 2392 sk_set_socket(newsk, NULL); 2393 sk_tx_queue_clear(newsk); 2394 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2395 2396 if (newsk->sk_prot->sockets_allocated) 2397 sk_sockets_allocated_inc(newsk); 2398 2399 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2400 net_enable_timestamp(); 2401 out: 2402 return newsk; 2403 } 2404 EXPORT_SYMBOL_GPL(sk_clone_lock); 2405 2406 void sk_free_unlock_clone(struct sock *sk) 2407 { 2408 /* It is still raw copy of parent, so invalidate 2409 * destructor and make plain sk_free() */ 2410 sk->sk_destruct = NULL; 2411 bh_unlock_sock(sk); 2412 sk_free(sk); 2413 } 2414 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2415 2416 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) 2417 { 2418 bool is_ipv6 = false; 2419 u32 max_size; 2420 2421 #if IS_ENABLED(CONFIG_IPV6) 2422 is_ipv6 = (sk->sk_family == AF_INET6 && 2423 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2424 #endif 2425 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2426 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : 2427 READ_ONCE(dst->dev->gso_ipv4_max_size); 2428 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2429 max_size = GSO_LEGACY_MAX_SIZE; 2430 2431 return max_size - (MAX_TCP_HEADER + 1); 2432 } 2433 2434 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2435 { 2436 u32 max_segs = 1; 2437 2438 sk->sk_route_caps = dst->dev->features; 2439 if (sk_is_tcp(sk)) 2440 sk->sk_route_caps |= NETIF_F_GSO; 2441 if (sk->sk_route_caps & NETIF_F_GSO) 2442 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2443 if (unlikely(sk->sk_gso_disabled)) 2444 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2445 if (sk_can_gso(sk)) { 2446 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2447 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2448 } else { 2449 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2450 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); 2451 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2452 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); 2453 } 2454 } 2455 sk->sk_gso_max_segs = max_segs; 2456 sk_dst_set(sk, dst); 2457 } 2458 EXPORT_SYMBOL_GPL(sk_setup_caps); 2459 2460 /* 2461 * Simple resource managers for sockets. 2462 */ 2463 2464 2465 /* 2466 * Write buffer destructor automatically called from kfree_skb. 2467 */ 2468 void sock_wfree(struct sk_buff *skb) 2469 { 2470 struct sock *sk = skb->sk; 2471 unsigned int len = skb->truesize; 2472 bool free; 2473 2474 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2475 if (sock_flag(sk, SOCK_RCU_FREE) && 2476 sk->sk_write_space == sock_def_write_space) { 2477 rcu_read_lock(); 2478 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); 2479 sock_def_write_space_wfree(sk); 2480 rcu_read_unlock(); 2481 if (unlikely(free)) 2482 __sk_free(sk); 2483 return; 2484 } 2485 2486 /* 2487 * Keep a reference on sk_wmem_alloc, this will be released 2488 * after sk_write_space() call 2489 */ 2490 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2491 sk->sk_write_space(sk); 2492 len = 1; 2493 } 2494 /* 2495 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2496 * could not do because of in-flight packets 2497 */ 2498 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2499 __sk_free(sk); 2500 } 2501 EXPORT_SYMBOL(sock_wfree); 2502 2503 /* This variant of sock_wfree() is used by TCP, 2504 * since it sets SOCK_USE_WRITE_QUEUE. 2505 */ 2506 void __sock_wfree(struct sk_buff *skb) 2507 { 2508 struct sock *sk = skb->sk; 2509 2510 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2511 __sk_free(sk); 2512 } 2513 2514 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2515 { 2516 skb_orphan(skb); 2517 skb->sk = sk; 2518 #ifdef CONFIG_INET 2519 if (unlikely(!sk_fullsock(sk))) { 2520 skb->destructor = sock_edemux; 2521 sock_hold(sk); 2522 return; 2523 } 2524 #endif 2525 skb->destructor = sock_wfree; 2526 skb_set_hash_from_sk(skb, sk); 2527 /* 2528 * We used to take a refcount on sk, but following operation 2529 * is enough to guarantee sk_free() wont free this sock until 2530 * all in-flight packets are completed 2531 */ 2532 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2533 } 2534 EXPORT_SYMBOL(skb_set_owner_w); 2535 2536 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2537 { 2538 #ifdef CONFIG_TLS_DEVICE 2539 /* Drivers depend on in-order delivery for crypto offload, 2540 * partial orphan breaks out-of-order-OK logic. 2541 */ 2542 if (skb->decrypted) 2543 return false; 2544 #endif 2545 return (skb->destructor == sock_wfree || 2546 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2547 } 2548 2549 /* This helper is used by netem, as it can hold packets in its 2550 * delay queue. We want to allow the owner socket to send more 2551 * packets, as if they were already TX completed by a typical driver. 2552 * But we also want to keep skb->sk set because some packet schedulers 2553 * rely on it (sch_fq for example). 2554 */ 2555 void skb_orphan_partial(struct sk_buff *skb) 2556 { 2557 if (skb_is_tcp_pure_ack(skb)) 2558 return; 2559 2560 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2561 return; 2562 2563 skb_orphan(skb); 2564 } 2565 EXPORT_SYMBOL(skb_orphan_partial); 2566 2567 /* 2568 * Read buffer destructor automatically called from kfree_skb. 2569 */ 2570 void sock_rfree(struct sk_buff *skb) 2571 { 2572 struct sock *sk = skb->sk; 2573 unsigned int len = skb->truesize; 2574 2575 atomic_sub(len, &sk->sk_rmem_alloc); 2576 sk_mem_uncharge(sk, len); 2577 } 2578 EXPORT_SYMBOL(sock_rfree); 2579 2580 /* 2581 * Buffer destructor for skbs that are not used directly in read or write 2582 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2583 */ 2584 void sock_efree(struct sk_buff *skb) 2585 { 2586 sock_put(skb->sk); 2587 } 2588 EXPORT_SYMBOL(sock_efree); 2589 2590 /* Buffer destructor for prefetch/receive path where reference count may 2591 * not be held, e.g. for listen sockets. 2592 */ 2593 #ifdef CONFIG_INET 2594 void sock_pfree(struct sk_buff *skb) 2595 { 2596 if (sk_is_refcounted(skb->sk)) 2597 sock_gen_put(skb->sk); 2598 } 2599 EXPORT_SYMBOL(sock_pfree); 2600 #endif /* CONFIG_INET */ 2601 2602 kuid_t sock_i_uid(struct sock *sk) 2603 { 2604 kuid_t uid; 2605 2606 read_lock_bh(&sk->sk_callback_lock); 2607 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2608 read_unlock_bh(&sk->sk_callback_lock); 2609 return uid; 2610 } 2611 EXPORT_SYMBOL(sock_i_uid); 2612 2613 unsigned long __sock_i_ino(struct sock *sk) 2614 { 2615 unsigned long ino; 2616 2617 read_lock(&sk->sk_callback_lock); 2618 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2619 read_unlock(&sk->sk_callback_lock); 2620 return ino; 2621 } 2622 EXPORT_SYMBOL(__sock_i_ino); 2623 2624 unsigned long sock_i_ino(struct sock *sk) 2625 { 2626 unsigned long ino; 2627 2628 local_bh_disable(); 2629 ino = __sock_i_ino(sk); 2630 local_bh_enable(); 2631 return ino; 2632 } 2633 EXPORT_SYMBOL(sock_i_ino); 2634 2635 /* 2636 * Allocate a skb from the socket's send buffer. 2637 */ 2638 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2639 gfp_t priority) 2640 { 2641 if (force || 2642 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2643 struct sk_buff *skb = alloc_skb(size, priority); 2644 2645 if (skb) { 2646 skb_set_owner_w(skb, sk); 2647 return skb; 2648 } 2649 } 2650 return NULL; 2651 } 2652 EXPORT_SYMBOL(sock_wmalloc); 2653 2654 static void sock_ofree(struct sk_buff *skb) 2655 { 2656 struct sock *sk = skb->sk; 2657 2658 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2659 } 2660 2661 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2662 gfp_t priority) 2663 { 2664 struct sk_buff *skb; 2665 2666 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2667 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2668 READ_ONCE(sysctl_optmem_max)) 2669 return NULL; 2670 2671 skb = alloc_skb(size, priority); 2672 if (!skb) 2673 return NULL; 2674 2675 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2676 skb->sk = sk; 2677 skb->destructor = sock_ofree; 2678 return skb; 2679 } 2680 2681 /* 2682 * Allocate a memory block from the socket's option memory buffer. 2683 */ 2684 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2685 { 2686 int optmem_max = READ_ONCE(sysctl_optmem_max); 2687 2688 if ((unsigned int)size <= optmem_max && 2689 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2690 void *mem; 2691 /* First do the add, to avoid the race if kmalloc 2692 * might sleep. 2693 */ 2694 atomic_add(size, &sk->sk_omem_alloc); 2695 mem = kmalloc(size, priority); 2696 if (mem) 2697 return mem; 2698 atomic_sub(size, &sk->sk_omem_alloc); 2699 } 2700 return NULL; 2701 } 2702 EXPORT_SYMBOL(sock_kmalloc); 2703 2704 /* Free an option memory block. Note, we actually want the inline 2705 * here as this allows gcc to detect the nullify and fold away the 2706 * condition entirely. 2707 */ 2708 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2709 const bool nullify) 2710 { 2711 if (WARN_ON_ONCE(!mem)) 2712 return; 2713 if (nullify) 2714 kfree_sensitive(mem); 2715 else 2716 kfree(mem); 2717 atomic_sub(size, &sk->sk_omem_alloc); 2718 } 2719 2720 void sock_kfree_s(struct sock *sk, void *mem, int size) 2721 { 2722 __sock_kfree_s(sk, mem, size, false); 2723 } 2724 EXPORT_SYMBOL(sock_kfree_s); 2725 2726 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2727 { 2728 __sock_kfree_s(sk, mem, size, true); 2729 } 2730 EXPORT_SYMBOL(sock_kzfree_s); 2731 2732 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2733 I think, these locks should be removed for datagram sockets. 2734 */ 2735 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2736 { 2737 DEFINE_WAIT(wait); 2738 2739 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2740 for (;;) { 2741 if (!timeo) 2742 break; 2743 if (signal_pending(current)) 2744 break; 2745 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2746 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2747 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2748 break; 2749 if (sk->sk_shutdown & SEND_SHUTDOWN) 2750 break; 2751 if (sk->sk_err) 2752 break; 2753 timeo = schedule_timeout(timeo); 2754 } 2755 finish_wait(sk_sleep(sk), &wait); 2756 return timeo; 2757 } 2758 2759 2760 /* 2761 * Generic send/receive buffer handlers 2762 */ 2763 2764 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2765 unsigned long data_len, int noblock, 2766 int *errcode, int max_page_order) 2767 { 2768 struct sk_buff *skb; 2769 long timeo; 2770 int err; 2771 2772 timeo = sock_sndtimeo(sk, noblock); 2773 for (;;) { 2774 err = sock_error(sk); 2775 if (err != 0) 2776 goto failure; 2777 2778 err = -EPIPE; 2779 if (sk->sk_shutdown & SEND_SHUTDOWN) 2780 goto failure; 2781 2782 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2783 break; 2784 2785 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2786 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2787 err = -EAGAIN; 2788 if (!timeo) 2789 goto failure; 2790 if (signal_pending(current)) 2791 goto interrupted; 2792 timeo = sock_wait_for_wmem(sk, timeo); 2793 } 2794 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2795 errcode, sk->sk_allocation); 2796 if (skb) 2797 skb_set_owner_w(skb, sk); 2798 return skb; 2799 2800 interrupted: 2801 err = sock_intr_errno(timeo); 2802 failure: 2803 *errcode = err; 2804 return NULL; 2805 } 2806 EXPORT_SYMBOL(sock_alloc_send_pskb); 2807 2808 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 2809 struct sockcm_cookie *sockc) 2810 { 2811 u32 tsflags; 2812 2813 switch (cmsg->cmsg_type) { 2814 case SO_MARK: 2815 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 2816 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2817 return -EPERM; 2818 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2819 return -EINVAL; 2820 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2821 break; 2822 case SO_TIMESTAMPING_OLD: 2823 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2824 return -EINVAL; 2825 2826 tsflags = *(u32 *)CMSG_DATA(cmsg); 2827 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2828 return -EINVAL; 2829 2830 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2831 sockc->tsflags |= tsflags; 2832 break; 2833 case SCM_TXTIME: 2834 if (!sock_flag(sk, SOCK_TXTIME)) 2835 return -EINVAL; 2836 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2837 return -EINVAL; 2838 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2839 break; 2840 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 2841 case SCM_RIGHTS: 2842 case SCM_CREDENTIALS: 2843 break; 2844 default: 2845 return -EINVAL; 2846 } 2847 return 0; 2848 } 2849 EXPORT_SYMBOL(__sock_cmsg_send); 2850 2851 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2852 struct sockcm_cookie *sockc) 2853 { 2854 struct cmsghdr *cmsg; 2855 int ret; 2856 2857 for_each_cmsghdr(cmsg, msg) { 2858 if (!CMSG_OK(msg, cmsg)) 2859 return -EINVAL; 2860 if (cmsg->cmsg_level != SOL_SOCKET) 2861 continue; 2862 ret = __sock_cmsg_send(sk, cmsg, sockc); 2863 if (ret) 2864 return ret; 2865 } 2866 return 0; 2867 } 2868 EXPORT_SYMBOL(sock_cmsg_send); 2869 2870 static void sk_enter_memory_pressure(struct sock *sk) 2871 { 2872 if (!sk->sk_prot->enter_memory_pressure) 2873 return; 2874 2875 sk->sk_prot->enter_memory_pressure(sk); 2876 } 2877 2878 static void sk_leave_memory_pressure(struct sock *sk) 2879 { 2880 if (sk->sk_prot->leave_memory_pressure) { 2881 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 2882 tcp_leave_memory_pressure, sk); 2883 } else { 2884 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2885 2886 if (memory_pressure && READ_ONCE(*memory_pressure)) 2887 WRITE_ONCE(*memory_pressure, 0); 2888 } 2889 } 2890 2891 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2892 2893 /** 2894 * skb_page_frag_refill - check that a page_frag contains enough room 2895 * @sz: minimum size of the fragment we want to get 2896 * @pfrag: pointer to page_frag 2897 * @gfp: priority for memory allocation 2898 * 2899 * Note: While this allocator tries to use high order pages, there is 2900 * no guarantee that allocations succeed. Therefore, @sz MUST be 2901 * less or equal than PAGE_SIZE. 2902 */ 2903 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2904 { 2905 if (pfrag->page) { 2906 if (page_ref_count(pfrag->page) == 1) { 2907 pfrag->offset = 0; 2908 return true; 2909 } 2910 if (pfrag->offset + sz <= pfrag->size) 2911 return true; 2912 put_page(pfrag->page); 2913 } 2914 2915 pfrag->offset = 0; 2916 if (SKB_FRAG_PAGE_ORDER && 2917 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2918 /* Avoid direct reclaim but allow kswapd to wake */ 2919 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2920 __GFP_COMP | __GFP_NOWARN | 2921 __GFP_NORETRY, 2922 SKB_FRAG_PAGE_ORDER); 2923 if (likely(pfrag->page)) { 2924 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2925 return true; 2926 } 2927 } 2928 pfrag->page = alloc_page(gfp); 2929 if (likely(pfrag->page)) { 2930 pfrag->size = PAGE_SIZE; 2931 return true; 2932 } 2933 return false; 2934 } 2935 EXPORT_SYMBOL(skb_page_frag_refill); 2936 2937 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2938 { 2939 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2940 return true; 2941 2942 sk_enter_memory_pressure(sk); 2943 sk_stream_moderate_sndbuf(sk); 2944 return false; 2945 } 2946 EXPORT_SYMBOL(sk_page_frag_refill); 2947 2948 void __lock_sock(struct sock *sk) 2949 __releases(&sk->sk_lock.slock) 2950 __acquires(&sk->sk_lock.slock) 2951 { 2952 DEFINE_WAIT(wait); 2953 2954 for (;;) { 2955 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2956 TASK_UNINTERRUPTIBLE); 2957 spin_unlock_bh(&sk->sk_lock.slock); 2958 schedule(); 2959 spin_lock_bh(&sk->sk_lock.slock); 2960 if (!sock_owned_by_user(sk)) 2961 break; 2962 } 2963 finish_wait(&sk->sk_lock.wq, &wait); 2964 } 2965 2966 void __release_sock(struct sock *sk) 2967 __releases(&sk->sk_lock.slock) 2968 __acquires(&sk->sk_lock.slock) 2969 { 2970 struct sk_buff *skb, *next; 2971 2972 while ((skb = sk->sk_backlog.head) != NULL) { 2973 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2974 2975 spin_unlock_bh(&sk->sk_lock.slock); 2976 2977 do { 2978 next = skb->next; 2979 prefetch(next); 2980 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 2981 skb_mark_not_on_list(skb); 2982 sk_backlog_rcv(sk, skb); 2983 2984 cond_resched(); 2985 2986 skb = next; 2987 } while (skb != NULL); 2988 2989 spin_lock_bh(&sk->sk_lock.slock); 2990 } 2991 2992 /* 2993 * Doing the zeroing here guarantee we can not loop forever 2994 * while a wild producer attempts to flood us. 2995 */ 2996 sk->sk_backlog.len = 0; 2997 } 2998 2999 void __sk_flush_backlog(struct sock *sk) 3000 { 3001 spin_lock_bh(&sk->sk_lock.slock); 3002 __release_sock(sk); 3003 spin_unlock_bh(&sk->sk_lock.slock); 3004 } 3005 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3006 3007 /** 3008 * sk_wait_data - wait for data to arrive at sk_receive_queue 3009 * @sk: sock to wait on 3010 * @timeo: for how long 3011 * @skb: last skb seen on sk_receive_queue 3012 * 3013 * Now socket state including sk->sk_err is changed only under lock, 3014 * hence we may omit checks after joining wait queue. 3015 * We check receive queue before schedule() only as optimization; 3016 * it is very likely that release_sock() added new data. 3017 */ 3018 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3019 { 3020 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3021 int rc; 3022 3023 add_wait_queue(sk_sleep(sk), &wait); 3024 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3025 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3026 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3027 remove_wait_queue(sk_sleep(sk), &wait); 3028 return rc; 3029 } 3030 EXPORT_SYMBOL(sk_wait_data); 3031 3032 /** 3033 * __sk_mem_raise_allocated - increase memory_allocated 3034 * @sk: socket 3035 * @size: memory size to allocate 3036 * @amt: pages to allocate 3037 * @kind: allocation type 3038 * 3039 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 3040 */ 3041 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3042 { 3043 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; 3044 struct proto *prot = sk->sk_prot; 3045 bool charged = true; 3046 long allocated; 3047 3048 sk_memory_allocated_add(sk, amt); 3049 allocated = sk_memory_allocated(sk); 3050 if (memcg_charge && 3051 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, 3052 gfp_memcg_charge()))) 3053 goto suppress_allocation; 3054 3055 /* Under limit. */ 3056 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3057 sk_leave_memory_pressure(sk); 3058 return 1; 3059 } 3060 3061 /* Under pressure. */ 3062 if (allocated > sk_prot_mem_limits(sk, 1)) 3063 sk_enter_memory_pressure(sk); 3064 3065 /* Over hard limit. */ 3066 if (allocated > sk_prot_mem_limits(sk, 2)) 3067 goto suppress_allocation; 3068 3069 /* guarantee minimum buffer size under pressure */ 3070 if (kind == SK_MEM_RECV) { 3071 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3072 return 1; 3073 3074 } else { /* SK_MEM_SEND */ 3075 int wmem0 = sk_get_wmem0(sk, prot); 3076 3077 if (sk->sk_type == SOCK_STREAM) { 3078 if (sk->sk_wmem_queued < wmem0) 3079 return 1; 3080 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3081 return 1; 3082 } 3083 } 3084 3085 if (sk_has_memory_pressure(sk)) { 3086 u64 alloc; 3087 3088 if (!sk_under_memory_pressure(sk)) 3089 return 1; 3090 alloc = sk_sockets_allocated_read_positive(sk); 3091 if (sk_prot_mem_limits(sk, 2) > alloc * 3092 sk_mem_pages(sk->sk_wmem_queued + 3093 atomic_read(&sk->sk_rmem_alloc) + 3094 sk->sk_forward_alloc)) 3095 return 1; 3096 } 3097 3098 suppress_allocation: 3099 3100 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3101 sk_stream_moderate_sndbuf(sk); 3102 3103 /* Fail only if socket is _under_ its sndbuf. 3104 * In this case we cannot block, so that we have to fail. 3105 */ 3106 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3107 /* Force charge with __GFP_NOFAIL */ 3108 if (memcg_charge && !charged) { 3109 mem_cgroup_charge_skmem(sk->sk_memcg, amt, 3110 gfp_memcg_charge() | __GFP_NOFAIL); 3111 } 3112 return 1; 3113 } 3114 } 3115 3116 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 3117 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3118 3119 sk_memory_allocated_sub(sk, amt); 3120 3121 if (memcg_charge && charged) 3122 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 3123 3124 return 0; 3125 } 3126 3127 /** 3128 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3129 * @sk: socket 3130 * @size: memory size to allocate 3131 * @kind: allocation type 3132 * 3133 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3134 * rmem allocation. This function assumes that protocols which have 3135 * memory_pressure use sk_wmem_queued as write buffer accounting. 3136 */ 3137 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3138 { 3139 int ret, amt = sk_mem_pages(size); 3140 3141 sk->sk_forward_alloc += amt << PAGE_SHIFT; 3142 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3143 if (!ret) 3144 sk->sk_forward_alloc -= amt << PAGE_SHIFT; 3145 return ret; 3146 } 3147 EXPORT_SYMBOL(__sk_mem_schedule); 3148 3149 /** 3150 * __sk_mem_reduce_allocated - reclaim memory_allocated 3151 * @sk: socket 3152 * @amount: number of quanta 3153 * 3154 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3155 */ 3156 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3157 { 3158 sk_memory_allocated_sub(sk, amount); 3159 3160 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 3161 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 3162 3163 if (sk_under_memory_pressure(sk) && 3164 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3165 sk_leave_memory_pressure(sk); 3166 } 3167 3168 /** 3169 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3170 * @sk: socket 3171 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3172 */ 3173 void __sk_mem_reclaim(struct sock *sk, int amount) 3174 { 3175 amount >>= PAGE_SHIFT; 3176 sk->sk_forward_alloc -= amount << PAGE_SHIFT; 3177 __sk_mem_reduce_allocated(sk, amount); 3178 } 3179 EXPORT_SYMBOL(__sk_mem_reclaim); 3180 3181 int sk_set_peek_off(struct sock *sk, int val) 3182 { 3183 WRITE_ONCE(sk->sk_peek_off, val); 3184 return 0; 3185 } 3186 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3187 3188 /* 3189 * Set of default routines for initialising struct proto_ops when 3190 * the protocol does not support a particular function. In certain 3191 * cases where it makes no sense for a protocol to have a "do nothing" 3192 * function, some default processing is provided. 3193 */ 3194 3195 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 3196 { 3197 return -EOPNOTSUPP; 3198 } 3199 EXPORT_SYMBOL(sock_no_bind); 3200 3201 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 3202 int len, int flags) 3203 { 3204 return -EOPNOTSUPP; 3205 } 3206 EXPORT_SYMBOL(sock_no_connect); 3207 3208 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3209 { 3210 return -EOPNOTSUPP; 3211 } 3212 EXPORT_SYMBOL(sock_no_socketpair); 3213 3214 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 3215 bool kern) 3216 { 3217 return -EOPNOTSUPP; 3218 } 3219 EXPORT_SYMBOL(sock_no_accept); 3220 3221 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3222 int peer) 3223 { 3224 return -EOPNOTSUPP; 3225 } 3226 EXPORT_SYMBOL(sock_no_getname); 3227 3228 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3229 { 3230 return -EOPNOTSUPP; 3231 } 3232 EXPORT_SYMBOL(sock_no_ioctl); 3233 3234 int sock_no_listen(struct socket *sock, int backlog) 3235 { 3236 return -EOPNOTSUPP; 3237 } 3238 EXPORT_SYMBOL(sock_no_listen); 3239 3240 int sock_no_shutdown(struct socket *sock, int how) 3241 { 3242 return -EOPNOTSUPP; 3243 } 3244 EXPORT_SYMBOL(sock_no_shutdown); 3245 3246 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3247 { 3248 return -EOPNOTSUPP; 3249 } 3250 EXPORT_SYMBOL(sock_no_sendmsg); 3251 3252 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3253 { 3254 return -EOPNOTSUPP; 3255 } 3256 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3257 3258 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3259 int flags) 3260 { 3261 return -EOPNOTSUPP; 3262 } 3263 EXPORT_SYMBOL(sock_no_recvmsg); 3264 3265 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3266 { 3267 /* Mirror missing mmap method error code */ 3268 return -ENODEV; 3269 } 3270 EXPORT_SYMBOL(sock_no_mmap); 3271 3272 /* 3273 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3274 * various sock-based usage counts. 3275 */ 3276 void __receive_sock(struct file *file) 3277 { 3278 struct socket *sock; 3279 3280 sock = sock_from_file(file); 3281 if (sock) { 3282 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3283 sock_update_classid(&sock->sk->sk_cgrp_data); 3284 } 3285 } 3286 3287 /* 3288 * Default Socket Callbacks 3289 */ 3290 3291 static void sock_def_wakeup(struct sock *sk) 3292 { 3293 struct socket_wq *wq; 3294 3295 rcu_read_lock(); 3296 wq = rcu_dereference(sk->sk_wq); 3297 if (skwq_has_sleeper(wq)) 3298 wake_up_interruptible_all(&wq->wait); 3299 rcu_read_unlock(); 3300 } 3301 3302 static void sock_def_error_report(struct sock *sk) 3303 { 3304 struct socket_wq *wq; 3305 3306 rcu_read_lock(); 3307 wq = rcu_dereference(sk->sk_wq); 3308 if (skwq_has_sleeper(wq)) 3309 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3310 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 3311 rcu_read_unlock(); 3312 } 3313 3314 void sock_def_readable(struct sock *sk) 3315 { 3316 struct socket_wq *wq; 3317 3318 trace_sk_data_ready(sk); 3319 3320 rcu_read_lock(); 3321 wq = rcu_dereference(sk->sk_wq); 3322 if (skwq_has_sleeper(wq)) 3323 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3324 EPOLLRDNORM | EPOLLRDBAND); 3325 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 3326 rcu_read_unlock(); 3327 } 3328 3329 static void sock_def_write_space(struct sock *sk) 3330 { 3331 struct socket_wq *wq; 3332 3333 rcu_read_lock(); 3334 3335 /* Do not wake up a writer until he can make "significant" 3336 * progress. --DaveM 3337 */ 3338 if (sock_writeable(sk)) { 3339 wq = rcu_dereference(sk->sk_wq); 3340 if (skwq_has_sleeper(wq)) 3341 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3342 EPOLLWRNORM | EPOLLWRBAND); 3343 3344 /* Should agree with poll, otherwise some programs break */ 3345 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3346 } 3347 3348 rcu_read_unlock(); 3349 } 3350 3351 /* An optimised version of sock_def_write_space(), should only be called 3352 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3353 * ->sk_wmem_alloc. 3354 */ 3355 static void sock_def_write_space_wfree(struct sock *sk) 3356 { 3357 /* Do not wake up a writer until he can make "significant" 3358 * progress. --DaveM 3359 */ 3360 if (sock_writeable(sk)) { 3361 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3362 3363 /* rely on refcount_sub from sock_wfree() */ 3364 smp_mb__after_atomic(); 3365 if (wq && waitqueue_active(&wq->wait)) 3366 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3367 EPOLLWRNORM | EPOLLWRBAND); 3368 3369 /* Should agree with poll, otherwise some programs break */ 3370 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3371 } 3372 } 3373 3374 static void sock_def_destruct(struct sock *sk) 3375 { 3376 } 3377 3378 void sk_send_sigurg(struct sock *sk) 3379 { 3380 if (sk->sk_socket && sk->sk_socket->file) 3381 if (send_sigurg(&sk->sk_socket->file->f_owner)) 3382 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3383 } 3384 EXPORT_SYMBOL(sk_send_sigurg); 3385 3386 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3387 unsigned long expires) 3388 { 3389 if (!mod_timer(timer, expires)) 3390 sock_hold(sk); 3391 } 3392 EXPORT_SYMBOL(sk_reset_timer); 3393 3394 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3395 { 3396 if (del_timer(timer)) 3397 __sock_put(sk); 3398 } 3399 EXPORT_SYMBOL(sk_stop_timer); 3400 3401 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3402 { 3403 if (del_timer_sync(timer)) 3404 __sock_put(sk); 3405 } 3406 EXPORT_SYMBOL(sk_stop_timer_sync); 3407 3408 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3409 { 3410 sk_init_common(sk); 3411 sk->sk_send_head = NULL; 3412 3413 timer_setup(&sk->sk_timer, NULL, 0); 3414 3415 sk->sk_allocation = GFP_KERNEL; 3416 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3417 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3418 sk->sk_state = TCP_CLOSE; 3419 sk->sk_use_task_frag = true; 3420 sk_set_socket(sk, sock); 3421 3422 sock_set_flag(sk, SOCK_ZAPPED); 3423 3424 if (sock) { 3425 sk->sk_type = sock->type; 3426 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3427 sock->sk = sk; 3428 } else { 3429 RCU_INIT_POINTER(sk->sk_wq, NULL); 3430 } 3431 sk->sk_uid = uid; 3432 3433 rwlock_init(&sk->sk_callback_lock); 3434 if (sk->sk_kern_sock) 3435 lockdep_set_class_and_name( 3436 &sk->sk_callback_lock, 3437 af_kern_callback_keys + sk->sk_family, 3438 af_family_kern_clock_key_strings[sk->sk_family]); 3439 else 3440 lockdep_set_class_and_name( 3441 &sk->sk_callback_lock, 3442 af_callback_keys + sk->sk_family, 3443 af_family_clock_key_strings[sk->sk_family]); 3444 3445 sk->sk_state_change = sock_def_wakeup; 3446 sk->sk_data_ready = sock_def_readable; 3447 sk->sk_write_space = sock_def_write_space; 3448 sk->sk_error_report = sock_def_error_report; 3449 sk->sk_destruct = sock_def_destruct; 3450 3451 sk->sk_frag.page = NULL; 3452 sk->sk_frag.offset = 0; 3453 sk->sk_peek_off = -1; 3454 3455 sk->sk_peer_pid = NULL; 3456 sk->sk_peer_cred = NULL; 3457 spin_lock_init(&sk->sk_peer_lock); 3458 3459 sk->sk_write_pending = 0; 3460 sk->sk_rcvlowat = 1; 3461 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3462 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3463 3464 sk->sk_stamp = SK_DEFAULT_STAMP; 3465 #if BITS_PER_LONG==32 3466 seqlock_init(&sk->sk_stamp_seq); 3467 #endif 3468 atomic_set(&sk->sk_zckey, 0); 3469 3470 #ifdef CONFIG_NET_RX_BUSY_POLL 3471 sk->sk_napi_id = 0; 3472 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3473 #endif 3474 3475 sk->sk_max_pacing_rate = ~0UL; 3476 sk->sk_pacing_rate = ~0UL; 3477 WRITE_ONCE(sk->sk_pacing_shift, 10); 3478 sk->sk_incoming_cpu = -1; 3479 3480 sk_rx_queue_clear(sk); 3481 /* 3482 * Before updating sk_refcnt, we must commit prior changes to memory 3483 * (Documentation/RCU/rculist_nulls.rst for details) 3484 */ 3485 smp_wmb(); 3486 refcount_set(&sk->sk_refcnt, 1); 3487 atomic_set(&sk->sk_drops, 0); 3488 } 3489 EXPORT_SYMBOL(sock_init_data_uid); 3490 3491 void sock_init_data(struct socket *sock, struct sock *sk) 3492 { 3493 kuid_t uid = sock ? 3494 SOCK_INODE(sock)->i_uid : 3495 make_kuid(sock_net(sk)->user_ns, 0); 3496 3497 sock_init_data_uid(sock, sk, uid); 3498 } 3499 EXPORT_SYMBOL(sock_init_data); 3500 3501 void lock_sock_nested(struct sock *sk, int subclass) 3502 { 3503 /* The sk_lock has mutex_lock() semantics here. */ 3504 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3505 3506 might_sleep(); 3507 spin_lock_bh(&sk->sk_lock.slock); 3508 if (sock_owned_by_user_nocheck(sk)) 3509 __lock_sock(sk); 3510 sk->sk_lock.owned = 1; 3511 spin_unlock_bh(&sk->sk_lock.slock); 3512 } 3513 EXPORT_SYMBOL(lock_sock_nested); 3514 3515 void release_sock(struct sock *sk) 3516 { 3517 spin_lock_bh(&sk->sk_lock.slock); 3518 if (sk->sk_backlog.tail) 3519 __release_sock(sk); 3520 3521 /* Warning : release_cb() might need to release sk ownership, 3522 * ie call sock_release_ownership(sk) before us. 3523 */ 3524 if (sk->sk_prot->release_cb) 3525 sk->sk_prot->release_cb(sk); 3526 3527 sock_release_ownership(sk); 3528 if (waitqueue_active(&sk->sk_lock.wq)) 3529 wake_up(&sk->sk_lock.wq); 3530 spin_unlock_bh(&sk->sk_lock.slock); 3531 } 3532 EXPORT_SYMBOL(release_sock); 3533 3534 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3535 { 3536 might_sleep(); 3537 spin_lock_bh(&sk->sk_lock.slock); 3538 3539 if (!sock_owned_by_user_nocheck(sk)) { 3540 /* 3541 * Fast path return with bottom halves disabled and 3542 * sock::sk_lock.slock held. 3543 * 3544 * The 'mutex' is not contended and holding 3545 * sock::sk_lock.slock prevents all other lockers to 3546 * proceed so the corresponding unlock_sock_fast() can 3547 * avoid the slow path of release_sock() completely and 3548 * just release slock. 3549 * 3550 * From a semantical POV this is equivalent to 'acquiring' 3551 * the 'mutex', hence the corresponding lockdep 3552 * mutex_release() has to happen in the fast path of 3553 * unlock_sock_fast(). 3554 */ 3555 return false; 3556 } 3557 3558 __lock_sock(sk); 3559 sk->sk_lock.owned = 1; 3560 __acquire(&sk->sk_lock.slock); 3561 spin_unlock_bh(&sk->sk_lock.slock); 3562 return true; 3563 } 3564 EXPORT_SYMBOL(__lock_sock_fast); 3565 3566 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3567 bool timeval, bool time32) 3568 { 3569 struct sock *sk = sock->sk; 3570 struct timespec64 ts; 3571 3572 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3573 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3574 if (ts.tv_sec == -1) 3575 return -ENOENT; 3576 if (ts.tv_sec == 0) { 3577 ktime_t kt = ktime_get_real(); 3578 sock_write_timestamp(sk, kt); 3579 ts = ktime_to_timespec64(kt); 3580 } 3581 3582 if (timeval) 3583 ts.tv_nsec /= 1000; 3584 3585 #ifdef CONFIG_COMPAT_32BIT_TIME 3586 if (time32) 3587 return put_old_timespec32(&ts, userstamp); 3588 #endif 3589 #ifdef CONFIG_SPARC64 3590 /* beware of padding in sparc64 timeval */ 3591 if (timeval && !in_compat_syscall()) { 3592 struct __kernel_old_timeval __user tv = { 3593 .tv_sec = ts.tv_sec, 3594 .tv_usec = ts.tv_nsec, 3595 }; 3596 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3597 return -EFAULT; 3598 return 0; 3599 } 3600 #endif 3601 return put_timespec64(&ts, userstamp); 3602 } 3603 EXPORT_SYMBOL(sock_gettstamp); 3604 3605 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3606 { 3607 if (!sock_flag(sk, flag)) { 3608 unsigned long previous_flags = sk->sk_flags; 3609 3610 sock_set_flag(sk, flag); 3611 /* 3612 * we just set one of the two flags which require net 3613 * time stamping, but time stamping might have been on 3614 * already because of the other one 3615 */ 3616 if (sock_needs_netstamp(sk) && 3617 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3618 net_enable_timestamp(); 3619 } 3620 } 3621 3622 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3623 int level, int type) 3624 { 3625 struct sock_exterr_skb *serr; 3626 struct sk_buff *skb; 3627 int copied, err; 3628 3629 err = -EAGAIN; 3630 skb = sock_dequeue_err_skb(sk); 3631 if (skb == NULL) 3632 goto out; 3633 3634 copied = skb->len; 3635 if (copied > len) { 3636 msg->msg_flags |= MSG_TRUNC; 3637 copied = len; 3638 } 3639 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3640 if (err) 3641 goto out_free_skb; 3642 3643 sock_recv_timestamp(msg, sk, skb); 3644 3645 serr = SKB_EXT_ERR(skb); 3646 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3647 3648 msg->msg_flags |= MSG_ERRQUEUE; 3649 err = copied; 3650 3651 out_free_skb: 3652 kfree_skb(skb); 3653 out: 3654 return err; 3655 } 3656 EXPORT_SYMBOL(sock_recv_errqueue); 3657 3658 /* 3659 * Get a socket option on an socket. 3660 * 3661 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3662 * asynchronous errors should be reported by getsockopt. We assume 3663 * this means if you specify SO_ERROR (otherwise whats the point of it). 3664 */ 3665 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3666 char __user *optval, int __user *optlen) 3667 { 3668 struct sock *sk = sock->sk; 3669 3670 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3671 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3672 } 3673 EXPORT_SYMBOL(sock_common_getsockopt); 3674 3675 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3676 int flags) 3677 { 3678 struct sock *sk = sock->sk; 3679 int addr_len = 0; 3680 int err; 3681 3682 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3683 if (err >= 0) 3684 msg->msg_namelen = addr_len; 3685 return err; 3686 } 3687 EXPORT_SYMBOL(sock_common_recvmsg); 3688 3689 /* 3690 * Set socket options on an inet socket. 3691 */ 3692 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3693 sockptr_t optval, unsigned int optlen) 3694 { 3695 struct sock *sk = sock->sk; 3696 3697 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3698 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3699 } 3700 EXPORT_SYMBOL(sock_common_setsockopt); 3701 3702 void sk_common_release(struct sock *sk) 3703 { 3704 if (sk->sk_prot->destroy) 3705 sk->sk_prot->destroy(sk); 3706 3707 /* 3708 * Observation: when sk_common_release is called, processes have 3709 * no access to socket. But net still has. 3710 * Step one, detach it from networking: 3711 * 3712 * A. Remove from hash tables. 3713 */ 3714 3715 sk->sk_prot->unhash(sk); 3716 3717 /* 3718 * In this point socket cannot receive new packets, but it is possible 3719 * that some packets are in flight because some CPU runs receiver and 3720 * did hash table lookup before we unhashed socket. They will achieve 3721 * receive queue and will be purged by socket destructor. 3722 * 3723 * Also we still have packets pending on receive queue and probably, 3724 * our own packets waiting in device queues. sock_destroy will drain 3725 * receive queue, but transmitted packets will delay socket destruction 3726 * until the last reference will be released. 3727 */ 3728 3729 sock_orphan(sk); 3730 3731 xfrm_sk_free_policy(sk); 3732 3733 sock_put(sk); 3734 } 3735 EXPORT_SYMBOL(sk_common_release); 3736 3737 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3738 { 3739 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3740 3741 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3742 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3743 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3744 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3745 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3746 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3747 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3748 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3749 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3750 } 3751 3752 #ifdef CONFIG_PROC_FS 3753 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3754 3755 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3756 { 3757 int cpu, idx = prot->inuse_idx; 3758 int res = 0; 3759 3760 for_each_possible_cpu(cpu) 3761 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3762 3763 return res >= 0 ? res : 0; 3764 } 3765 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3766 3767 int sock_inuse_get(struct net *net) 3768 { 3769 int cpu, res = 0; 3770 3771 for_each_possible_cpu(cpu) 3772 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 3773 3774 return res; 3775 } 3776 3777 EXPORT_SYMBOL_GPL(sock_inuse_get); 3778 3779 static int __net_init sock_inuse_init_net(struct net *net) 3780 { 3781 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3782 if (net->core.prot_inuse == NULL) 3783 return -ENOMEM; 3784 return 0; 3785 } 3786 3787 static void __net_exit sock_inuse_exit_net(struct net *net) 3788 { 3789 free_percpu(net->core.prot_inuse); 3790 } 3791 3792 static struct pernet_operations net_inuse_ops = { 3793 .init = sock_inuse_init_net, 3794 .exit = sock_inuse_exit_net, 3795 }; 3796 3797 static __init int net_inuse_init(void) 3798 { 3799 if (register_pernet_subsys(&net_inuse_ops)) 3800 panic("Cannot initialize net inuse counters"); 3801 3802 return 0; 3803 } 3804 3805 core_initcall(net_inuse_init); 3806 3807 static int assign_proto_idx(struct proto *prot) 3808 { 3809 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3810 3811 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3812 pr_err("PROTO_INUSE_NR exhausted\n"); 3813 return -ENOSPC; 3814 } 3815 3816 set_bit(prot->inuse_idx, proto_inuse_idx); 3817 return 0; 3818 } 3819 3820 static void release_proto_idx(struct proto *prot) 3821 { 3822 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3823 clear_bit(prot->inuse_idx, proto_inuse_idx); 3824 } 3825 #else 3826 static inline int assign_proto_idx(struct proto *prot) 3827 { 3828 return 0; 3829 } 3830 3831 static inline void release_proto_idx(struct proto *prot) 3832 { 3833 } 3834 3835 #endif 3836 3837 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3838 { 3839 if (!twsk_prot) 3840 return; 3841 kfree(twsk_prot->twsk_slab_name); 3842 twsk_prot->twsk_slab_name = NULL; 3843 kmem_cache_destroy(twsk_prot->twsk_slab); 3844 twsk_prot->twsk_slab = NULL; 3845 } 3846 3847 static int tw_prot_init(const struct proto *prot) 3848 { 3849 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3850 3851 if (!twsk_prot) 3852 return 0; 3853 3854 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3855 prot->name); 3856 if (!twsk_prot->twsk_slab_name) 3857 return -ENOMEM; 3858 3859 twsk_prot->twsk_slab = 3860 kmem_cache_create(twsk_prot->twsk_slab_name, 3861 twsk_prot->twsk_obj_size, 0, 3862 SLAB_ACCOUNT | prot->slab_flags, 3863 NULL); 3864 if (!twsk_prot->twsk_slab) { 3865 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3866 prot->name); 3867 return -ENOMEM; 3868 } 3869 3870 return 0; 3871 } 3872 3873 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3874 { 3875 if (!rsk_prot) 3876 return; 3877 kfree(rsk_prot->slab_name); 3878 rsk_prot->slab_name = NULL; 3879 kmem_cache_destroy(rsk_prot->slab); 3880 rsk_prot->slab = NULL; 3881 } 3882 3883 static int req_prot_init(const struct proto *prot) 3884 { 3885 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3886 3887 if (!rsk_prot) 3888 return 0; 3889 3890 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3891 prot->name); 3892 if (!rsk_prot->slab_name) 3893 return -ENOMEM; 3894 3895 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3896 rsk_prot->obj_size, 0, 3897 SLAB_ACCOUNT | prot->slab_flags, 3898 NULL); 3899 3900 if (!rsk_prot->slab) { 3901 pr_crit("%s: Can't create request sock SLAB cache!\n", 3902 prot->name); 3903 return -ENOMEM; 3904 } 3905 return 0; 3906 } 3907 3908 int proto_register(struct proto *prot, int alloc_slab) 3909 { 3910 int ret = -ENOBUFS; 3911 3912 if (prot->memory_allocated && !prot->sysctl_mem) { 3913 pr_err("%s: missing sysctl_mem\n", prot->name); 3914 return -EINVAL; 3915 } 3916 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { 3917 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); 3918 return -EINVAL; 3919 } 3920 if (alloc_slab) { 3921 prot->slab = kmem_cache_create_usercopy(prot->name, 3922 prot->obj_size, 0, 3923 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3924 prot->slab_flags, 3925 prot->useroffset, prot->usersize, 3926 NULL); 3927 3928 if (prot->slab == NULL) { 3929 pr_crit("%s: Can't create sock SLAB cache!\n", 3930 prot->name); 3931 goto out; 3932 } 3933 3934 if (req_prot_init(prot)) 3935 goto out_free_request_sock_slab; 3936 3937 if (tw_prot_init(prot)) 3938 goto out_free_timewait_sock_slab; 3939 } 3940 3941 mutex_lock(&proto_list_mutex); 3942 ret = assign_proto_idx(prot); 3943 if (ret) { 3944 mutex_unlock(&proto_list_mutex); 3945 goto out_free_timewait_sock_slab; 3946 } 3947 list_add(&prot->node, &proto_list); 3948 mutex_unlock(&proto_list_mutex); 3949 return ret; 3950 3951 out_free_timewait_sock_slab: 3952 if (alloc_slab) 3953 tw_prot_cleanup(prot->twsk_prot); 3954 out_free_request_sock_slab: 3955 if (alloc_slab) { 3956 req_prot_cleanup(prot->rsk_prot); 3957 3958 kmem_cache_destroy(prot->slab); 3959 prot->slab = NULL; 3960 } 3961 out: 3962 return ret; 3963 } 3964 EXPORT_SYMBOL(proto_register); 3965 3966 void proto_unregister(struct proto *prot) 3967 { 3968 mutex_lock(&proto_list_mutex); 3969 release_proto_idx(prot); 3970 list_del(&prot->node); 3971 mutex_unlock(&proto_list_mutex); 3972 3973 kmem_cache_destroy(prot->slab); 3974 prot->slab = NULL; 3975 3976 req_prot_cleanup(prot->rsk_prot); 3977 tw_prot_cleanup(prot->twsk_prot); 3978 } 3979 EXPORT_SYMBOL(proto_unregister); 3980 3981 int sock_load_diag_module(int family, int protocol) 3982 { 3983 if (!protocol) { 3984 if (!sock_is_registered(family)) 3985 return -ENOENT; 3986 3987 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3988 NETLINK_SOCK_DIAG, family); 3989 } 3990 3991 #ifdef CONFIG_INET 3992 if (family == AF_INET && 3993 protocol != IPPROTO_RAW && 3994 protocol < MAX_INET_PROTOS && 3995 !rcu_access_pointer(inet_protos[protocol])) 3996 return -ENOENT; 3997 #endif 3998 3999 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 4000 NETLINK_SOCK_DIAG, family, protocol); 4001 } 4002 EXPORT_SYMBOL(sock_load_diag_module); 4003 4004 #ifdef CONFIG_PROC_FS 4005 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 4006 __acquires(proto_list_mutex) 4007 { 4008 mutex_lock(&proto_list_mutex); 4009 return seq_list_start_head(&proto_list, *pos); 4010 } 4011 4012 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4013 { 4014 return seq_list_next(v, &proto_list, pos); 4015 } 4016 4017 static void proto_seq_stop(struct seq_file *seq, void *v) 4018 __releases(proto_list_mutex) 4019 { 4020 mutex_unlock(&proto_list_mutex); 4021 } 4022 4023 static char proto_method_implemented(const void *method) 4024 { 4025 return method == NULL ? 'n' : 'y'; 4026 } 4027 static long sock_prot_memory_allocated(struct proto *proto) 4028 { 4029 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 4030 } 4031 4032 static const char *sock_prot_memory_pressure(struct proto *proto) 4033 { 4034 return proto->memory_pressure != NULL ? 4035 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 4036 } 4037 4038 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4039 { 4040 4041 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4042 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4043 proto->name, 4044 proto->obj_size, 4045 sock_prot_inuse_get(seq_file_net(seq), proto), 4046 sock_prot_memory_allocated(proto), 4047 sock_prot_memory_pressure(proto), 4048 proto->max_header, 4049 proto->slab == NULL ? "no" : "yes", 4050 module_name(proto->owner), 4051 proto_method_implemented(proto->close), 4052 proto_method_implemented(proto->connect), 4053 proto_method_implemented(proto->disconnect), 4054 proto_method_implemented(proto->accept), 4055 proto_method_implemented(proto->ioctl), 4056 proto_method_implemented(proto->init), 4057 proto_method_implemented(proto->destroy), 4058 proto_method_implemented(proto->shutdown), 4059 proto_method_implemented(proto->setsockopt), 4060 proto_method_implemented(proto->getsockopt), 4061 proto_method_implemented(proto->sendmsg), 4062 proto_method_implemented(proto->recvmsg), 4063 proto_method_implemented(proto->bind), 4064 proto_method_implemented(proto->backlog_rcv), 4065 proto_method_implemented(proto->hash), 4066 proto_method_implemented(proto->unhash), 4067 proto_method_implemented(proto->get_port), 4068 proto_method_implemented(proto->enter_memory_pressure)); 4069 } 4070 4071 static int proto_seq_show(struct seq_file *seq, void *v) 4072 { 4073 if (v == &proto_list) 4074 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4075 "protocol", 4076 "size", 4077 "sockets", 4078 "memory", 4079 "press", 4080 "maxhdr", 4081 "slab", 4082 "module", 4083 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4084 else 4085 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4086 return 0; 4087 } 4088 4089 static const struct seq_operations proto_seq_ops = { 4090 .start = proto_seq_start, 4091 .next = proto_seq_next, 4092 .stop = proto_seq_stop, 4093 .show = proto_seq_show, 4094 }; 4095 4096 static __net_init int proto_init_net(struct net *net) 4097 { 4098 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4099 sizeof(struct seq_net_private))) 4100 return -ENOMEM; 4101 4102 return 0; 4103 } 4104 4105 static __net_exit void proto_exit_net(struct net *net) 4106 { 4107 remove_proc_entry("protocols", net->proc_net); 4108 } 4109 4110 4111 static __net_initdata struct pernet_operations proto_net_ops = { 4112 .init = proto_init_net, 4113 .exit = proto_exit_net, 4114 }; 4115 4116 static int __init proto_init(void) 4117 { 4118 return register_pernet_subsys(&proto_net_ops); 4119 } 4120 4121 subsys_initcall(proto_init); 4122 4123 #endif /* PROC_FS */ 4124 4125 #ifdef CONFIG_NET_RX_BUSY_POLL 4126 bool sk_busy_loop_end(void *p, unsigned long start_time) 4127 { 4128 struct sock *sk = p; 4129 4130 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 4131 sk_busy_loop_timeout(sk, start_time); 4132 } 4133 EXPORT_SYMBOL(sk_busy_loop_end); 4134 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4135 4136 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 4137 { 4138 if (!sk->sk_prot->bind_add) 4139 return -EOPNOTSUPP; 4140 return sk->sk_prot->bind_add(sk, addr, addr_len); 4141 } 4142 EXPORT_SYMBOL(sock_bind_add); 4143 4144 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4145 int sock_ioctl_inout(struct sock *sk, unsigned int cmd, 4146 void __user *arg, void *karg, size_t size) 4147 { 4148 int ret; 4149 4150 if (copy_from_user(karg, arg, size)) 4151 return -EFAULT; 4152 4153 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); 4154 if (ret) 4155 return ret; 4156 4157 if (copy_to_user(arg, karg, size)) 4158 return -EFAULT; 4159 4160 return 0; 4161 } 4162 EXPORT_SYMBOL(sock_ioctl_inout); 4163 4164 /* This is the most common ioctl prep function, where the result (4 bytes) is 4165 * copied back to userspace if the ioctl() returns successfully. No input is 4166 * copied from userspace as input argument. 4167 */ 4168 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) 4169 { 4170 int ret, karg = 0; 4171 4172 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); 4173 if (ret) 4174 return ret; 4175 4176 return put_user(karg, (int __user *)arg); 4177 } 4178 4179 /* A wrapper around sock ioctls, which copies the data from userspace 4180 * (depending on the protocol/ioctl), and copies back the result to userspace. 4181 * The main motivation for this function is to pass kernel memory to the 4182 * protocol ioctl callbacks, instead of userspace memory. 4183 */ 4184 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) 4185 { 4186 int rc = 1; 4187 4188 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) 4189 rc = ipmr_sk_ioctl(sk, cmd, arg); 4190 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) 4191 rc = ip6mr_sk_ioctl(sk, cmd, arg); 4192 else if (sk_is_phonet(sk)) 4193 rc = phonet_sk_ioctl(sk, cmd, arg); 4194 4195 /* If ioctl was processed, returns its value */ 4196 if (rc <= 0) 4197 return rc; 4198 4199 /* Otherwise call the default handler */ 4200 return sock_ioctl_out(sk, cmd, arg); 4201 } 4202 EXPORT_SYMBOL(sk_ioctl); 4203