// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

#include <linux/ethtool.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test whether the opener of the socket had the capability @cap when the
 * socket was created and whether the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test whether the opener of the socket had the capability @cap when the
 * socket was created and whether the current process has the capability
 * @cap in all user namespaces.
175 */ 176 bool sk_capable(const struct sock *sk, int cap) 177 { 178 return sk_ns_capable(sk, &init_user_ns, cap); 179 } 180 EXPORT_SYMBOL(sk_capable); 181 182 /** 183 * sk_net_capable - Network namespace socket capability test 184 * @sk: Socket to use a capability on or through 185 * @cap: The capability to use 186 * 187 * Test to see if the opener of the socket had when the socket was created 188 * and the current process has the capability @cap over the network namespace 189 * the socket is a member of. 190 */ 191 bool sk_net_capable(const struct sock *sk, int cap) 192 { 193 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 194 } 195 EXPORT_SYMBOL(sk_net_capable); 196 197 /* 198 * Each address family might have different locking rules, so we have 199 * one slock key per address family and separate keys for internal and 200 * userspace sockets. 201 */ 202 static struct lock_class_key af_family_keys[AF_MAX]; 203 static struct lock_class_key af_family_kern_keys[AF_MAX]; 204 static struct lock_class_key af_family_slock_keys[AF_MAX]; 205 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 206 207 /* 208 * Make lock validator output more readable. (we pre-construct these 209 * strings build-time, so that runtime initialization of socket 210 * locks is fast): 211 */ 212 213 #define _sock_locks(x) \ 214 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 215 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 216 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 217 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 218 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 219 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 220 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 221 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 222 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 223 x "27" , x "28" , x "AF_CAN" , \ 224 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 225 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 226 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 227 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 228 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 229 x "AF_MCTP" , \ 230 x "AF_MAX" 231 232 static const char *const af_family_key_strings[AF_MAX+1] = { 233 _sock_locks("sk_lock-") 234 }; 235 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 236 _sock_locks("slock-") 237 }; 238 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 239 _sock_locks("clock-") 240 }; 241 242 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 243 _sock_locks("k-sk_lock-") 244 }; 245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 246 _sock_locks("k-slock-") 247 }; 248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 249 _sock_locks("k-clock-") 250 }; 251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 252 _sock_locks("rlock-") 253 }; 254 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 255 _sock_locks("wlock-") 256 }; 257 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 258 _sock_locks("elock-") 259 }; 260 261 /* 262 * sk_callback_lock and sk queues locking rules are per-address-family, 263 * so split the lock classes by using a per-AF key: 264 */ 265 static struct lock_class_key af_callback_keys[AF_MAX]; 266 static struct lock_class_key af_rlock_keys[AF_MAX]; 267 static struct lock_class_key af_wlock_keys[AF_MAX]; 268 static struct lock_class_key af_elock_keys[AF_MAX]; 269 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 270 271 /* Run time adjustable parameters. 
*/ 272 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 273 EXPORT_SYMBOL(sysctl_wmem_max); 274 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 275 EXPORT_SYMBOL(sysctl_rmem_max); 276 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 277 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 278 279 /* Maximal space eaten by iovec or ancillary data plus some space */ 280 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 281 EXPORT_SYMBOL(sysctl_optmem_max); 282 283 int sysctl_tstamp_allow_data __read_mostly = 1; 284 285 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 286 EXPORT_SYMBOL_GPL(memalloc_socks_key); 287 288 /** 289 * sk_set_memalloc - sets %SOCK_MEMALLOC 290 * @sk: socket to set it on 291 * 292 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 293 * It's the responsibility of the admin to adjust min_free_kbytes 294 * to meet the requirements 295 */ 296 void sk_set_memalloc(struct sock *sk) 297 { 298 sock_set_flag(sk, SOCK_MEMALLOC); 299 sk->sk_allocation |= __GFP_MEMALLOC; 300 static_branch_inc(&memalloc_socks_key); 301 } 302 EXPORT_SYMBOL_GPL(sk_set_memalloc); 303 304 void sk_clear_memalloc(struct sock *sk) 305 { 306 sock_reset_flag(sk, SOCK_MEMALLOC); 307 sk->sk_allocation &= ~__GFP_MEMALLOC; 308 static_branch_dec(&memalloc_socks_key); 309 310 /* 311 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 312 * progress of swapping. SOCK_MEMALLOC may be cleared while 313 * it has rmem allocations due to the last swapfile being deactivated 314 * but there is a risk that the socket is unusable due to exceeding 315 * the rmem limits. Reclaim the reserves and obey rmem limits again. 316 */ 317 sk_mem_reclaim(sk); 318 } 319 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 320 321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 322 { 323 int ret; 324 unsigned int noreclaim_flag; 325 326 /* these should have been dropped before queueing */ 327 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 328 329 noreclaim_flag = memalloc_noreclaim_save(); 330 ret = sk->sk_backlog_rcv(sk, skb); 331 memalloc_noreclaim_restore(noreclaim_flag); 332 333 return ret; 334 } 335 EXPORT_SYMBOL(__sk_backlog_rcv); 336 337 void sk_error_report(struct sock *sk) 338 { 339 sk->sk_error_report(sk); 340 341 switch (sk->sk_family) { 342 case AF_INET: 343 fallthrough; 344 case AF_INET6: 345 trace_inet_sk_error_report(sk); 346 break; 347 default: 348 break; 349 } 350 } 351 EXPORT_SYMBOL(sk_error_report); 352 353 static int sock_get_timeout(long timeo, void *optval, bool old_timeval) 354 { 355 struct __kernel_sock_timeval tv; 356 357 if (timeo == MAX_SCHEDULE_TIMEOUT) { 358 tv.tv_sec = 0; 359 tv.tv_usec = 0; 360 } else { 361 tv.tv_sec = timeo / HZ; 362 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 363 } 364 365 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 366 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 367 *(struct old_timeval32 *)optval = tv32; 368 return sizeof(tv32); 369 } 370 371 if (old_timeval) { 372 struct __kernel_old_timeval old_tv; 373 old_tv.tv_sec = tv.tv_sec; 374 old_tv.tv_usec = tv.tv_usec; 375 *(struct __kernel_old_timeval *)optval = old_tv; 376 return sizeof(old_tv); 377 } 378 379 *(struct __kernel_sock_timeval *)optval = tv; 380 return sizeof(tv); 381 } 382 383 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 384 bool old_timeval) 385 { 386 struct __kernel_sock_timeval tv; 387 388 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 389 struct old_timeval32 tv32; 390 391 if 
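
/*
 * Illustrative userspace sketch (not part of this file, compiled out): how
 * the timeout parsing above is exercised through the socket API.
 * sock_set_timeout() converts the struct timeval supplied with
 * SO_RCVTIMEO/SO_SNDTIMEO into jiffies, rejects tv_usec outside
 * [0, USEC_PER_SEC) with -EDOM, and treats {0, 0} as "block forever".
 * The helper name below is hypothetical.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>

/* Give recv() on @fd a 2.5 second timeout. */
static int example_set_rcv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) {
		perror("setsockopt(SO_RCVTIMEO)");
		return -1;
	}
	return 0;
}
#endif
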
(optlen < sizeof(tv32)) 392 return -EINVAL; 393 394 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 395 return -EFAULT; 396 tv.tv_sec = tv32.tv_sec; 397 tv.tv_usec = tv32.tv_usec; 398 } else if (old_timeval) { 399 struct __kernel_old_timeval old_tv; 400 401 if (optlen < sizeof(old_tv)) 402 return -EINVAL; 403 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 404 return -EFAULT; 405 tv.tv_sec = old_tv.tv_sec; 406 tv.tv_usec = old_tv.tv_usec; 407 } else { 408 if (optlen < sizeof(tv)) 409 return -EINVAL; 410 if (copy_from_sockptr(&tv, optval, sizeof(tv))) 411 return -EFAULT; 412 } 413 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 414 return -EDOM; 415 416 if (tv.tv_sec < 0) { 417 static int warned __read_mostly; 418 419 *timeo_p = 0; 420 if (warned < 10 && net_ratelimit()) { 421 warned++; 422 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 423 __func__, current->comm, task_pid_nr(current)); 424 } 425 return 0; 426 } 427 *timeo_p = MAX_SCHEDULE_TIMEOUT; 428 if (tv.tv_sec == 0 && tv.tv_usec == 0) 429 return 0; 430 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) 431 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); 432 return 0; 433 } 434 435 static bool sock_needs_netstamp(const struct sock *sk) 436 { 437 switch (sk->sk_family) { 438 case AF_UNSPEC: 439 case AF_UNIX: 440 return false; 441 default: 442 return true; 443 } 444 } 445 446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 447 { 448 if (sk->sk_flags & flags) { 449 sk->sk_flags &= ~flags; 450 if (sock_needs_netstamp(sk) && 451 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 452 net_disable_timestamp(); 453 } 454 } 455 456 457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 458 { 459 unsigned long flags; 460 struct sk_buff_head *list = &sk->sk_receive_queue; 461 462 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 463 atomic_inc(&sk->sk_drops); 464 trace_sock_rcvqueue_full(sk, skb); 465 return -ENOMEM; 466 } 467 468 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 469 atomic_inc(&sk->sk_drops); 470 return -ENOBUFS; 471 } 472 473 skb->dev = NULL; 474 skb_set_owner_r(skb, sk); 475 476 /* we escape from rcu protected region, make sure we dont leak 477 * a norefcounted dst 478 */ 479 skb_dst_force(skb); 480 481 spin_lock_irqsave(&list->lock, flags); 482 sock_skb_set_dropcount(sk, skb); 483 __skb_queue_tail(list, skb); 484 spin_unlock_irqrestore(&list->lock, flags); 485 486 if (!sock_flag(sk, SOCK_DEAD)) 487 sk->sk_data_ready(sk); 488 return 0; 489 } 490 EXPORT_SYMBOL(__sock_queue_rcv_skb); 491 492 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 493 { 494 int err; 495 496 err = sk_filter(sk, skb); 497 if (err) 498 return err; 499 500 return __sock_queue_rcv_skb(sk, skb); 501 } 502 EXPORT_SYMBOL(sock_queue_rcv_skb); 503 504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 505 const int nested, unsigned int trim_cap, bool refcounted) 506 { 507 int rc = NET_RX_SUCCESS; 508 509 if (sk_filter_trim_cap(sk, skb, trim_cap)) 510 goto discard_and_relse; 511 512 skb->dev = NULL; 513 514 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 515 atomic_inc(&sk->sk_drops); 516 goto discard_and_relse; 517 } 518 if (nested) 519 bh_lock_sock_nested(sk); 520 else 521 bh_lock_sock(sk); 522 if (!sock_owned_by_user(sk)) { 523 /* 524 * trylock + unlock semantics: 525 */ 526 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 527 528 rc = sk_backlog_rcv(sk, skb); 529 530 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 531 } else if (sk_add_backlog(sk, 
skb, READ_ONCE(sk->sk_rcvbuf))) { 532 bh_unlock_sock(sk); 533 atomic_inc(&sk->sk_drops); 534 goto discard_and_relse; 535 } 536 537 bh_unlock_sock(sk); 538 out: 539 if (refcounted) 540 sock_put(sk); 541 return rc; 542 discard_and_relse: 543 kfree_skb(skb); 544 goto out; 545 } 546 EXPORT_SYMBOL(__sk_receive_skb); 547 548 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 549 u32)); 550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 551 u32)); 552 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 553 { 554 struct dst_entry *dst = __sk_dst_get(sk); 555 556 if (dst && dst->obsolete && 557 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 558 dst, cookie) == NULL) { 559 sk_tx_queue_clear(sk); 560 sk->sk_dst_pending_confirm = 0; 561 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 562 dst_release(dst); 563 return NULL; 564 } 565 566 return dst; 567 } 568 EXPORT_SYMBOL(__sk_dst_check); 569 570 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 571 { 572 struct dst_entry *dst = sk_dst_get(sk); 573 574 if (dst && dst->obsolete && 575 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 576 dst, cookie) == NULL) { 577 sk_dst_reset(sk); 578 dst_release(dst); 579 return NULL; 580 } 581 582 return dst; 583 } 584 EXPORT_SYMBOL(sk_dst_check); 585 586 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 587 { 588 int ret = -ENOPROTOOPT; 589 #ifdef CONFIG_NETDEVICES 590 struct net *net = sock_net(sk); 591 592 /* Sorry... */ 593 ret = -EPERM; 594 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 595 goto out; 596 597 ret = -EINVAL; 598 if (ifindex < 0) 599 goto out; 600 601 sk->sk_bound_dev_if = ifindex; 602 if (sk->sk_prot->rehash) 603 sk->sk_prot->rehash(sk); 604 sk_dst_reset(sk); 605 606 ret = 0; 607 608 out: 609 #endif 610 611 return ret; 612 } 613 614 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 615 { 616 int ret; 617 618 if (lock_sk) 619 lock_sock(sk); 620 ret = sock_bindtoindex_locked(sk, ifindex); 621 if (lock_sk) 622 release_sock(sk); 623 624 return ret; 625 } 626 EXPORT_SYMBOL(sock_bindtoindex); 627 628 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 629 { 630 int ret = -ENOPROTOOPT; 631 #ifdef CONFIG_NETDEVICES 632 struct net *net = sock_net(sk); 633 char devname[IFNAMSIZ]; 634 int index; 635 636 ret = -EINVAL; 637 if (optlen < 0) 638 goto out; 639 640 /* Bind this socket to a particular device like "eth0", 641 * as specified in the passed interface name. If the 642 * name is "" or the option length is zero the socket 643 * is not bound. 
644 */ 645 if (optlen > IFNAMSIZ - 1) 646 optlen = IFNAMSIZ - 1; 647 memset(devname, 0, sizeof(devname)); 648 649 ret = -EFAULT; 650 if (copy_from_sockptr(devname, optval, optlen)) 651 goto out; 652 653 index = 0; 654 if (devname[0] != '\0') { 655 struct net_device *dev; 656 657 rcu_read_lock(); 658 dev = dev_get_by_name_rcu(net, devname); 659 if (dev) 660 index = dev->ifindex; 661 rcu_read_unlock(); 662 ret = -ENODEV; 663 if (!dev) 664 goto out; 665 } 666 667 return sock_bindtoindex(sk, index, true); 668 out: 669 #endif 670 671 return ret; 672 } 673 674 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 675 int __user *optlen, int len) 676 { 677 int ret = -ENOPROTOOPT; 678 #ifdef CONFIG_NETDEVICES 679 struct net *net = sock_net(sk); 680 char devname[IFNAMSIZ]; 681 682 if (sk->sk_bound_dev_if == 0) { 683 len = 0; 684 goto zero; 685 } 686 687 ret = -EINVAL; 688 if (len < IFNAMSIZ) 689 goto out; 690 691 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 692 if (ret) 693 goto out; 694 695 len = strlen(devname) + 1; 696 697 ret = -EFAULT; 698 if (copy_to_user(optval, devname, len)) 699 goto out; 700 701 zero: 702 ret = -EFAULT; 703 if (put_user(len, optlen)) 704 goto out; 705 706 ret = 0; 707 708 out: 709 #endif 710 711 return ret; 712 } 713 714 bool sk_mc_loop(struct sock *sk) 715 { 716 if (dev_recursion_level()) 717 return false; 718 if (!sk) 719 return true; 720 switch (sk->sk_family) { 721 case AF_INET: 722 return inet_sk(sk)->mc_loop; 723 #if IS_ENABLED(CONFIG_IPV6) 724 case AF_INET6: 725 return inet6_sk(sk)->mc_loop; 726 #endif 727 } 728 WARN_ON_ONCE(1); 729 return true; 730 } 731 EXPORT_SYMBOL(sk_mc_loop); 732 733 void sock_set_reuseaddr(struct sock *sk) 734 { 735 lock_sock(sk); 736 sk->sk_reuse = SK_CAN_REUSE; 737 release_sock(sk); 738 } 739 EXPORT_SYMBOL(sock_set_reuseaddr); 740 741 void sock_set_reuseport(struct sock *sk) 742 { 743 lock_sock(sk); 744 sk->sk_reuseport = true; 745 release_sock(sk); 746 } 747 EXPORT_SYMBOL(sock_set_reuseport); 748 749 void sock_no_linger(struct sock *sk) 750 { 751 lock_sock(sk); 752 sk->sk_lingertime = 0; 753 sock_set_flag(sk, SOCK_LINGER); 754 release_sock(sk); 755 } 756 EXPORT_SYMBOL(sock_no_linger); 757 758 void sock_set_priority(struct sock *sk, u32 priority) 759 { 760 lock_sock(sk); 761 sk->sk_priority = priority; 762 release_sock(sk); 763 } 764 EXPORT_SYMBOL(sock_set_priority); 765 766 void sock_set_sndtimeo(struct sock *sk, s64 secs) 767 { 768 lock_sock(sk); 769 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 770 sk->sk_sndtimeo = secs * HZ; 771 else 772 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 773 release_sock(sk); 774 } 775 EXPORT_SYMBOL(sock_set_sndtimeo); 776 777 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 778 { 779 if (val) { 780 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 781 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 782 sock_set_flag(sk, SOCK_RCVTSTAMP); 783 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 784 } else { 785 sock_reset_flag(sk, SOCK_RCVTSTAMP); 786 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 787 } 788 } 789 790 void sock_enable_timestamps(struct sock *sk) 791 { 792 lock_sock(sk); 793 __sock_set_timestamps(sk, true, false, true); 794 release_sock(sk); 795 } 796 EXPORT_SYMBOL(sock_enable_timestamps); 797 798 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 799 { 800 switch (optname) { 801 case SO_TIMESTAMP_OLD: 802 __sock_set_timestamps(sk, valbool, false, false); 803 break; 804 case SO_TIMESTAMP_NEW: 805 __sock_set_timestamps(sk, valbool, true, false); 
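
/*
 * Illustrative in-kernel sketch (not part of this file, compiled out): how a
 * kernel-space socket owner might use the lock-taking helpers defined above
 * instead of open-coding sock_setsockopt() calls.  Each helper takes and
 * releases the socket lock itself.  The function name is hypothetical.
 */
#if 0
static void example_tune_kernel_sock(struct sock *sk)
{
	sock_set_reuseaddr(sk);		/* SO_REUSEADDR */
	sock_no_linger(sk);		/* linger time of 0: abortive close */
	sock_set_priority(sk, 6);	/* SO_PRIORITY */
	sock_set_sndtimeo(sk, 5);	/* 5 second send timeout */
	sock_enable_timestamps(sk);	/* software receive timestamps */
}
#endif
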
806 break; 807 case SO_TIMESTAMPNS_OLD: 808 __sock_set_timestamps(sk, valbool, false, true); 809 break; 810 case SO_TIMESTAMPNS_NEW: 811 __sock_set_timestamps(sk, valbool, true, true); 812 break; 813 } 814 } 815 816 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 817 { 818 struct net *net = sock_net(sk); 819 struct net_device *dev = NULL; 820 bool match = false; 821 int *vclock_index; 822 int i, num; 823 824 if (sk->sk_bound_dev_if) 825 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 826 827 if (!dev) { 828 pr_err("%s: sock not bind to device\n", __func__); 829 return -EOPNOTSUPP; 830 } 831 832 num = ethtool_get_phc_vclocks(dev, &vclock_index); 833 for (i = 0; i < num; i++) { 834 if (*(vclock_index + i) == phc_index) { 835 match = true; 836 break; 837 } 838 } 839 840 if (num > 0) 841 kfree(vclock_index); 842 843 if (!match) 844 return -EINVAL; 845 846 sk->sk_bind_phc = phc_index; 847 848 return 0; 849 } 850 851 int sock_set_timestamping(struct sock *sk, int optname, 852 struct so_timestamping timestamping) 853 { 854 int val = timestamping.flags; 855 int ret; 856 857 if (val & ~SOF_TIMESTAMPING_MASK) 858 return -EINVAL; 859 860 if (val & SOF_TIMESTAMPING_OPT_ID && 861 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 862 if (sk->sk_protocol == IPPROTO_TCP && 863 sk->sk_type == SOCK_STREAM) { 864 if ((1 << sk->sk_state) & 865 (TCPF_CLOSE | TCPF_LISTEN)) 866 return -EINVAL; 867 sk->sk_tskey = tcp_sk(sk)->snd_una; 868 } else { 869 sk->sk_tskey = 0; 870 } 871 } 872 873 if (val & SOF_TIMESTAMPING_OPT_STATS && 874 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 875 return -EINVAL; 876 877 if (val & SOF_TIMESTAMPING_BIND_PHC) { 878 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 879 if (ret) 880 return ret; 881 } 882 883 sk->sk_tsflags = val; 884 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 885 886 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 887 sock_enable_timestamp(sk, 888 SOCK_TIMESTAMPING_RX_SOFTWARE); 889 else 890 sock_disable_timestamp(sk, 891 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 892 return 0; 893 } 894 895 void sock_set_keepalive(struct sock *sk) 896 { 897 lock_sock(sk); 898 if (sk->sk_prot->keepalive) 899 sk->sk_prot->keepalive(sk, true); 900 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 901 release_sock(sk); 902 } 903 EXPORT_SYMBOL(sock_set_keepalive); 904 905 static void __sock_set_rcvbuf(struct sock *sk, int val) 906 { 907 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 908 * as a negative value. 909 */ 910 val = min_t(int, val, INT_MAX / 2); 911 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 912 913 /* We double it on the way in to account for "struct sk_buff" etc. 914 * overhead. Applications assume that the SO_RCVBUF setting they make 915 * will allow that much actual data to be received on that socket. 916 * 917 * Applications are unaware that "struct sk_buff" and other overheads 918 * allocate from the receive buffer during socket buffer allocation. 919 * 920 * And after considering the possible alternatives, returning the value 921 * we actually used in getsockopt is the most desirable behavior. 
922 */ 923 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 924 } 925 926 void sock_set_rcvbuf(struct sock *sk, int val) 927 { 928 lock_sock(sk); 929 __sock_set_rcvbuf(sk, val); 930 release_sock(sk); 931 } 932 EXPORT_SYMBOL(sock_set_rcvbuf); 933 934 static void __sock_set_mark(struct sock *sk, u32 val) 935 { 936 if (val != sk->sk_mark) { 937 sk->sk_mark = val; 938 sk_dst_reset(sk); 939 } 940 } 941 942 void sock_set_mark(struct sock *sk, u32 val) 943 { 944 lock_sock(sk); 945 __sock_set_mark(sk, val); 946 release_sock(sk); 947 } 948 EXPORT_SYMBOL(sock_set_mark); 949 950 static void sock_release_reserved_memory(struct sock *sk, int bytes) 951 { 952 /* Round down bytes to multiple of pages */ 953 bytes &= ~(SK_MEM_QUANTUM - 1); 954 955 WARN_ON(bytes > sk->sk_reserved_mem); 956 sk->sk_reserved_mem -= bytes; 957 sk_mem_reclaim(sk); 958 } 959 960 static int sock_reserve_memory(struct sock *sk, int bytes) 961 { 962 long allocated; 963 bool charged; 964 int pages; 965 966 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) 967 return -EOPNOTSUPP; 968 969 if (!bytes) 970 return 0; 971 972 pages = sk_mem_pages(bytes); 973 974 /* pre-charge to memcg */ 975 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 976 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 977 if (!charged) 978 return -ENOMEM; 979 980 /* pre-charge to forward_alloc */ 981 allocated = sk_memory_allocated_add(sk, pages); 982 /* If the system goes into memory pressure with this 983 * precharge, give up and return error. 984 */ 985 if (allocated > sk_prot_mem_limits(sk, 1)) { 986 sk_memory_allocated_sub(sk, pages); 987 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 988 return -ENOMEM; 989 } 990 sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; 991 992 sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; 993 994 return 0; 995 } 996 997 /* 998 * This is meant for all protocols to use and covers goings on 999 * at the socket level. Everything here is generic. 1000 */ 1001 1002 int sock_setsockopt(struct socket *sock, int level, int optname, 1003 sockptr_t optval, unsigned int optlen) 1004 { 1005 struct so_timestamping timestamping; 1006 struct sock_txtime sk_txtime; 1007 struct sock *sk = sock->sk; 1008 int val; 1009 int valbool; 1010 struct linger ling; 1011 int ret = 0; 1012 1013 /* 1014 * Options without arguments 1015 */ 1016 1017 if (optname == SO_BINDTODEVICE) 1018 return sock_setbindtodevice(sk, optval, optlen); 1019 1020 if (optlen < sizeof(int)) 1021 return -EINVAL; 1022 1023 if (copy_from_sockptr(&val, optval, sizeof(val))) 1024 return -EFAULT; 1025 1026 valbool = val ? 1 : 0; 1027 1028 lock_sock(sk); 1029 1030 switch (optname) { 1031 case SO_DEBUG: 1032 if (val && !capable(CAP_NET_ADMIN)) 1033 ret = -EACCES; 1034 else 1035 sock_valbool_flag(sk, SOCK_DBG, valbool); 1036 break; 1037 case SO_REUSEADDR: 1038 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1039 break; 1040 case SO_REUSEPORT: 1041 sk->sk_reuseport = valbool; 1042 break; 1043 case SO_TYPE: 1044 case SO_PROTOCOL: 1045 case SO_DOMAIN: 1046 case SO_ERROR: 1047 ret = -ENOPROTOOPT; 1048 break; 1049 case SO_DONTROUTE: 1050 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1051 sk_dst_reset(sk); 1052 break; 1053 case SO_BROADCAST: 1054 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1055 break; 1056 case SO_SNDBUF: 1057 /* Don't error on this BSD doesn't and if you think 1058 * about it this is right. Otherwise apps have to 1059 * play 'guess the biggest size' games. 
RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ?
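
/*
 * Illustrative userspace sketch (not part of this file, compiled out): the
 * SO_RCVBUF/SO_SNDBUF behaviour described above.  The requested size is
 * clamped to the rmem_max/wmem_max sysctls, doubled to cover struct sk_buff
 * overhead, and the doubled value is what getsockopt() later reports.  The
 * function name is hypothetical.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>

static void example_show_rcvbuf_doubling(int fd)
{
	int req = 128 * 1024, got = 0;
	socklen_t len = sizeof(got);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
	printf("asked for %d bytes, kernel reserved %d\n", req, got);
}
#endif
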
: 1); 1192 break; 1193 1194 case SO_RCVTIMEO_OLD: 1195 case SO_RCVTIMEO_NEW: 1196 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1197 optlen, optname == SO_RCVTIMEO_OLD); 1198 break; 1199 1200 case SO_SNDTIMEO_OLD: 1201 case SO_SNDTIMEO_NEW: 1202 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1203 optlen, optname == SO_SNDTIMEO_OLD); 1204 break; 1205 1206 case SO_ATTACH_FILTER: { 1207 struct sock_fprog fprog; 1208 1209 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1210 if (!ret) 1211 ret = sk_attach_filter(&fprog, sk); 1212 break; 1213 } 1214 case SO_ATTACH_BPF: 1215 ret = -EINVAL; 1216 if (optlen == sizeof(u32)) { 1217 u32 ufd; 1218 1219 ret = -EFAULT; 1220 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1221 break; 1222 1223 ret = sk_attach_bpf(ufd, sk); 1224 } 1225 break; 1226 1227 case SO_ATTACH_REUSEPORT_CBPF: { 1228 struct sock_fprog fprog; 1229 1230 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1231 if (!ret) 1232 ret = sk_reuseport_attach_filter(&fprog, sk); 1233 break; 1234 } 1235 case SO_ATTACH_REUSEPORT_EBPF: 1236 ret = -EINVAL; 1237 if (optlen == sizeof(u32)) { 1238 u32 ufd; 1239 1240 ret = -EFAULT; 1241 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1242 break; 1243 1244 ret = sk_reuseport_attach_bpf(ufd, sk); 1245 } 1246 break; 1247 1248 case SO_DETACH_REUSEPORT_BPF: 1249 ret = reuseport_detach_prog(sk); 1250 break; 1251 1252 case SO_DETACH_FILTER: 1253 ret = sk_detach_filter(sk); 1254 break; 1255 1256 case SO_LOCK_FILTER: 1257 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1258 ret = -EPERM; 1259 else 1260 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1261 break; 1262 1263 case SO_PASSSEC: 1264 if (valbool) 1265 set_bit(SOCK_PASSSEC, &sock->flags); 1266 else 1267 clear_bit(SOCK_PASSSEC, &sock->flags); 1268 break; 1269 case SO_MARK: 1270 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1271 ret = -EPERM; 1272 break; 1273 } 1274 1275 __sock_set_mark(sk, val); 1276 break; 1277 1278 case SO_RXQ_OVFL: 1279 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1280 break; 1281 1282 case SO_WIFI_STATUS: 1283 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1284 break; 1285 1286 case SO_PEEK_OFF: 1287 if (sock->ops->set_peek_off) 1288 ret = sock->ops->set_peek_off(sk, val); 1289 else 1290 ret = -EOPNOTSUPP; 1291 break; 1292 1293 case SO_NOFCS: 1294 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1295 break; 1296 1297 case SO_SELECT_ERR_QUEUE: 1298 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1299 break; 1300 1301 #ifdef CONFIG_NET_RX_BUSY_POLL 1302 case SO_BUSY_POLL: 1303 /* allow unprivileged users to decrease the value */ 1304 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1305 ret = -EPERM; 1306 else { 1307 if (val < 0) 1308 ret = -EINVAL; 1309 else 1310 WRITE_ONCE(sk->sk_ll_usec, val); 1311 } 1312 break; 1313 case SO_PREFER_BUSY_POLL: 1314 if (valbool && !capable(CAP_NET_ADMIN)) 1315 ret = -EPERM; 1316 else 1317 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1318 break; 1319 case SO_BUSY_POLL_BUDGET: 1320 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1321 ret = -EPERM; 1322 } else { 1323 if (val < 0 || val > U16_MAX) 1324 ret = -EINVAL; 1325 else 1326 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1327 } 1328 break; 1329 #endif 1330 1331 case SO_MAX_PACING_RATE: 1332 { 1333 unsigned long ulval = (val == ~0U) ? 
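
/*
 * Illustrative userspace sketch (not part of this file, compiled out):
 * attaching a minimal classic-BPF socket filter.  The kernel copies the
 * program via copy_bpf_fprog_from_user() and hands it to sk_attach_filter(),
 * as in the SO_ATTACH_FILTER case above.  The one-instruction program below
 * accepts every packet (returns 0xffff bytes); the function name is
 * hypothetical.
 */
#if 0
#include <linux/filter.h>
#include <sys/socket.h>

static int example_attach_accept_all(int fd)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffff),
	};
	struct sock_fprog prog = {
		.len	= 1,
		.filter	= insns,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
}
#endif
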
~0UL : (unsigned int)val; 1334 1335 if (sizeof(ulval) != sizeof(val) && 1336 optlen >= sizeof(ulval) && 1337 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1338 ret = -EFAULT; 1339 break; 1340 } 1341 if (ulval != ~0UL) 1342 cmpxchg(&sk->sk_pacing_status, 1343 SK_PACING_NONE, 1344 SK_PACING_NEEDED); 1345 sk->sk_max_pacing_rate = ulval; 1346 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1347 break; 1348 } 1349 case SO_INCOMING_CPU: 1350 WRITE_ONCE(sk->sk_incoming_cpu, val); 1351 break; 1352 1353 case SO_CNX_ADVICE: 1354 if (val == 1) 1355 dst_negative_advice(sk); 1356 break; 1357 1358 case SO_ZEROCOPY: 1359 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1360 if (!((sk->sk_type == SOCK_STREAM && 1361 sk->sk_protocol == IPPROTO_TCP) || 1362 (sk->sk_type == SOCK_DGRAM && 1363 sk->sk_protocol == IPPROTO_UDP))) 1364 ret = -ENOTSUPP; 1365 } else if (sk->sk_family != PF_RDS) { 1366 ret = -ENOTSUPP; 1367 } 1368 if (!ret) { 1369 if (val < 0 || val > 1) 1370 ret = -EINVAL; 1371 else 1372 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1373 } 1374 break; 1375 1376 case SO_TXTIME: 1377 if (optlen != sizeof(struct sock_txtime)) { 1378 ret = -EINVAL; 1379 break; 1380 } else if (copy_from_sockptr(&sk_txtime, optval, 1381 sizeof(struct sock_txtime))) { 1382 ret = -EFAULT; 1383 break; 1384 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1385 ret = -EINVAL; 1386 break; 1387 } 1388 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1389 * scheduler has enough safe guards. 1390 */ 1391 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1392 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1393 ret = -EPERM; 1394 break; 1395 } 1396 sock_valbool_flag(sk, SOCK_TXTIME, true); 1397 sk->sk_clockid = sk_txtime.clockid; 1398 sk->sk_txtime_deadline_mode = 1399 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1400 sk->sk_txtime_report_errors = 1401 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1402 break; 1403 1404 case SO_BINDTOIFINDEX: 1405 ret = sock_bindtoindex_locked(sk, val); 1406 break; 1407 1408 case SO_BUF_LOCK: 1409 if (val & ~SOCK_BUF_LOCK_MASK) { 1410 ret = -EINVAL; 1411 break; 1412 } 1413 sk->sk_userlocks = val | (sk->sk_userlocks & 1414 ~SOCK_BUF_LOCK_MASK); 1415 break; 1416 1417 case SO_RESERVE_MEM: 1418 { 1419 int delta; 1420 1421 if (val < 0) { 1422 ret = -EINVAL; 1423 break; 1424 } 1425 1426 delta = val - sk->sk_reserved_mem; 1427 if (delta < 0) 1428 sock_release_reserved_memory(sk, -delta); 1429 else 1430 ret = sock_reserve_memory(sk, delta); 1431 break; 1432 } 1433 1434 default: 1435 ret = -ENOPROTOOPT; 1436 break; 1437 } 1438 release_sock(sk); 1439 return ret; 1440 } 1441 EXPORT_SYMBOL(sock_setsockopt); 1442 1443 static const struct cred *sk_get_peer_cred(struct sock *sk) 1444 { 1445 const struct cred *cred; 1446 1447 spin_lock(&sk->sk_peer_lock); 1448 cred = get_cred(sk->sk_peer_cred); 1449 spin_unlock(&sk->sk_peer_lock); 1450 1451 return cred; 1452 } 1453 1454 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1455 struct ucred *ucred) 1456 { 1457 ucred->pid = pid_vnr(pid); 1458 ucred->uid = ucred->gid = -1; 1459 if (cred) { 1460 struct user_namespace *current_ns = current_user_ns(); 1461 1462 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1463 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1464 } 1465 } 1466 1467 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1468 { 1469 struct user_namespace *user_ns = current_user_ns(); 1470 int i; 1471 1472 for (i = 0; i < src->ngroups; i++) 1473 if 
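
/*
 * Illustrative userspace sketch (not part of this file, compiled out):
 * SO_ZEROCOPY is only accepted on inet TCP/UDP sockets (or PF_RDS), as
 * enforced in the SO_ZEROCOPY case above.  Once enabled, sends flagged with
 * MSG_ZEROCOPY pin user pages instead of copying them, and completion
 * notifications arrive on the socket error queue.  The fallback #defines and
 * the function name are assumptions for older userspace headers.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>

#ifndef SO_ZEROCOPY
#define SO_ZEROCOPY	60
#endif
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY	0x4000000
#endif

static ssize_t example_zerocopy_send(int tcp_fd, const char *buf, size_t len)
{
	int one = 1;

	if (setsockopt(tcp_fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
		return -1;
	return send(tcp_fd, buf, len, MSG_ZEROCOPY);
}
#endif
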
(put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1474 return -EFAULT; 1475 1476 return 0; 1477 } 1478 1479 int sock_getsockopt(struct socket *sock, int level, int optname, 1480 char __user *optval, int __user *optlen) 1481 { 1482 struct sock *sk = sock->sk; 1483 1484 union { 1485 int val; 1486 u64 val64; 1487 unsigned long ulval; 1488 struct linger ling; 1489 struct old_timeval32 tm32; 1490 struct __kernel_old_timeval tm; 1491 struct __kernel_sock_timeval stm; 1492 struct sock_txtime txtime; 1493 struct so_timestamping timestamping; 1494 } v; 1495 1496 int lv = sizeof(int); 1497 int len; 1498 1499 if (get_user(len, optlen)) 1500 return -EFAULT; 1501 if (len < 0) 1502 return -EINVAL; 1503 1504 memset(&v, 0, sizeof(v)); 1505 1506 switch (optname) { 1507 case SO_DEBUG: 1508 v.val = sock_flag(sk, SOCK_DBG); 1509 break; 1510 1511 case SO_DONTROUTE: 1512 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1513 break; 1514 1515 case SO_BROADCAST: 1516 v.val = sock_flag(sk, SOCK_BROADCAST); 1517 break; 1518 1519 case SO_SNDBUF: 1520 v.val = sk->sk_sndbuf; 1521 break; 1522 1523 case SO_RCVBUF: 1524 v.val = sk->sk_rcvbuf; 1525 break; 1526 1527 case SO_REUSEADDR: 1528 v.val = sk->sk_reuse; 1529 break; 1530 1531 case SO_REUSEPORT: 1532 v.val = sk->sk_reuseport; 1533 break; 1534 1535 case SO_KEEPALIVE: 1536 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1537 break; 1538 1539 case SO_TYPE: 1540 v.val = sk->sk_type; 1541 break; 1542 1543 case SO_PROTOCOL: 1544 v.val = sk->sk_protocol; 1545 break; 1546 1547 case SO_DOMAIN: 1548 v.val = sk->sk_family; 1549 break; 1550 1551 case SO_ERROR: 1552 v.val = -sock_error(sk); 1553 if (v.val == 0) 1554 v.val = xchg(&sk->sk_err_soft, 0); 1555 break; 1556 1557 case SO_OOBINLINE: 1558 v.val = sock_flag(sk, SOCK_URGINLINE); 1559 break; 1560 1561 case SO_NO_CHECK: 1562 v.val = sk->sk_no_check_tx; 1563 break; 1564 1565 case SO_PRIORITY: 1566 v.val = sk->sk_priority; 1567 break; 1568 1569 case SO_LINGER: 1570 lv = sizeof(v.ling); 1571 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1572 v.ling.l_linger = sk->sk_lingertime / HZ; 1573 break; 1574 1575 case SO_BSDCOMPAT: 1576 break; 1577 1578 case SO_TIMESTAMP_OLD: 1579 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1580 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1581 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1582 break; 1583 1584 case SO_TIMESTAMPNS_OLD: 1585 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1586 break; 1587 1588 case SO_TIMESTAMP_NEW: 1589 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1590 break; 1591 1592 case SO_TIMESTAMPNS_NEW: 1593 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1594 break; 1595 1596 case SO_TIMESTAMPING_OLD: 1597 lv = sizeof(v.timestamping); 1598 v.timestamping.flags = sk->sk_tsflags; 1599 v.timestamping.bind_phc = sk->sk_bind_phc; 1600 break; 1601 1602 case SO_RCVTIMEO_OLD: 1603 case SO_RCVTIMEO_NEW: 1604 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1605 break; 1606 1607 case SO_SNDTIMEO_OLD: 1608 case SO_SNDTIMEO_NEW: 1609 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1610 break; 1611 1612 case SO_RCVLOWAT: 1613 v.val = sk->sk_rcvlowat; 1614 break; 1615 1616 case SO_SNDLOWAT: 1617 v.val = 1; 1618 break; 1619 1620 case SO_PASSCRED: 1621 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1622 break; 1623 1624 case SO_PEERCRED: 1625 { 1626 struct ucred peercred; 1627 if (len > sizeof(peercred)) 1628 len = sizeof(peercred); 1629 1630 spin_lock(&sk->sk_peer_lock); 1631 
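
/*
 * Illustrative userspace sketch (not part of this file, compiled out):
 * reading the peer's credentials on an AF_UNIX connection.  The kernel fills
 * struct ucred from sk_peer_pid/sk_peer_cred under sk_peer_lock, as in the
 * SO_PEERCRED case above.  The function name is hypothetical.
 */
#if 0
#define _GNU_SOURCE		/* struct ucred in <sys/socket.h> on glibc */
#include <stdio.h>
#include <sys/socket.h>

static void example_print_peer_creds(int unix_fd)
{
	struct ucred uc;
	socklen_t len = sizeof(uc);

	if (getsockopt(unix_fd, SOL_SOCKET, SO_PEERCRED, &uc, &len) == 0)
		printf("peer pid=%d uid=%d gid=%d\n",
		       (int)uc.pid, (int)uc.uid, (int)uc.gid);
}
#endif
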
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1632 spin_unlock(&sk->sk_peer_lock); 1633 1634 if (copy_to_user(optval, &peercred, len)) 1635 return -EFAULT; 1636 goto lenout; 1637 } 1638 1639 case SO_PEERGROUPS: 1640 { 1641 const struct cred *cred; 1642 int ret, n; 1643 1644 cred = sk_get_peer_cred(sk); 1645 if (!cred) 1646 return -ENODATA; 1647 1648 n = cred->group_info->ngroups; 1649 if (len < n * sizeof(gid_t)) { 1650 len = n * sizeof(gid_t); 1651 put_cred(cred); 1652 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1653 } 1654 len = n * sizeof(gid_t); 1655 1656 ret = groups_to_user((gid_t __user *)optval, cred->group_info); 1657 put_cred(cred); 1658 if (ret) 1659 return ret; 1660 goto lenout; 1661 } 1662 1663 case SO_PEERNAME: 1664 { 1665 char address[128]; 1666 1667 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1668 if (lv < 0) 1669 return -ENOTCONN; 1670 if (lv < len) 1671 return -EINVAL; 1672 if (copy_to_user(optval, address, len)) 1673 return -EFAULT; 1674 goto lenout; 1675 } 1676 1677 /* Dubious BSD thing... Probably nobody even uses it, but 1678 * the UNIX standard wants it for whatever reason... -DaveM 1679 */ 1680 case SO_ACCEPTCONN: 1681 v.val = sk->sk_state == TCP_LISTEN; 1682 break; 1683 1684 case SO_PASSSEC: 1685 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1686 break; 1687 1688 case SO_PEERSEC: 1689 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1690 1691 case SO_MARK: 1692 v.val = sk->sk_mark; 1693 break; 1694 1695 case SO_RXQ_OVFL: 1696 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1697 break; 1698 1699 case SO_WIFI_STATUS: 1700 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1701 break; 1702 1703 case SO_PEEK_OFF: 1704 if (!sock->ops->set_peek_off) 1705 return -EOPNOTSUPP; 1706 1707 v.val = sk->sk_peek_off; 1708 break; 1709 case SO_NOFCS: 1710 v.val = sock_flag(sk, SOCK_NOFCS); 1711 break; 1712 1713 case SO_BINDTODEVICE: 1714 return sock_getbindtodevice(sk, optval, optlen, len); 1715 1716 case SO_GET_FILTER: 1717 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1718 if (len < 0) 1719 return len; 1720 1721 goto lenout; 1722 1723 case SO_LOCK_FILTER: 1724 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1725 break; 1726 1727 case SO_BPF_EXTENSIONS: 1728 v.val = bpf_tell_extensions(); 1729 break; 1730 1731 case SO_SELECT_ERR_QUEUE: 1732 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1733 break; 1734 1735 #ifdef CONFIG_NET_RX_BUSY_POLL 1736 case SO_BUSY_POLL: 1737 v.val = sk->sk_ll_usec; 1738 break; 1739 case SO_PREFER_BUSY_POLL: 1740 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1741 break; 1742 #endif 1743 1744 case SO_MAX_PACING_RATE: 1745 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1746 lv = sizeof(v.ulval); 1747 v.ulval = sk->sk_max_pacing_rate; 1748 } else { 1749 /* 32bit version */ 1750 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1751 } 1752 break; 1753 1754 case SO_INCOMING_CPU: 1755 v.val = READ_ONCE(sk->sk_incoming_cpu); 1756 break; 1757 1758 case SO_MEMINFO: 1759 { 1760 u32 meminfo[SK_MEMINFO_VARS]; 1761 1762 sk_get_meminfo(sk, meminfo); 1763 1764 len = min_t(unsigned int, len, sizeof(meminfo)); 1765 if (copy_to_user(optval, &meminfo, len)) 1766 return -EFAULT; 1767 1768 goto lenout; 1769 } 1770 1771 #ifdef CONFIG_NET_RX_BUSY_POLL 1772 case SO_INCOMING_NAPI_ID: 1773 v.val = READ_ONCE(sk->sk_napi_id); 1774 1775 /* aggregate non-NAPI IDs down to 0 */ 1776 if (v.val < MIN_NAPI_ID) 1777 v.val = 0; 1778 1779 break; 1780 #endif 1781 1782 case SO_COOKIE: 1783 lv = sizeof(u64); 1784 if 
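
/*
 * Illustrative userspace sketch (not part of this file, compiled out):
 * SO_PEERGROUPS reports the peer's supplementary groups.  As in the case
 * above, a too-small buffer fails with ERANGE and the required length is
 * written back through optlen, so callers size the buffer and retry.  The
 * fallback #define and the function name are assumptions.
 */
#if 0
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>

#ifndef SO_PEERGROUPS
#define SO_PEERGROUPS	59
#endif

static gid_t *example_get_peer_groups(int unix_fd, int *ngroups)
{
	socklen_t len = 0;
	gid_t *gids;

	/* First call with a zero-length buffer: on ERANGE the kernel
	 * writes back the required length.
	 */
	getsockopt(unix_fd, SOL_SOCKET, SO_PEERGROUPS, NULL, &len);

	gids = malloc(len ? len : sizeof(*gids));
	if (!gids)
		return NULL;
	if (getsockopt(unix_fd, SOL_SOCKET, SO_PEERGROUPS, gids, &len) < 0) {
		free(gids);
		return NULL;
	}
	*ngroups = len / sizeof(*gids);
	return gids;
}
#endif
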
(len < lv) 1785 return -EINVAL; 1786 v.val64 = sock_gen_cookie(sk); 1787 break; 1788 1789 case SO_ZEROCOPY: 1790 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1791 break; 1792 1793 case SO_TXTIME: 1794 lv = sizeof(v.txtime); 1795 v.txtime.clockid = sk->sk_clockid; 1796 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1797 SOF_TXTIME_DEADLINE_MODE : 0; 1798 v.txtime.flags |= sk->sk_txtime_report_errors ? 1799 SOF_TXTIME_REPORT_ERRORS : 0; 1800 break; 1801 1802 case SO_BINDTOIFINDEX: 1803 v.val = sk->sk_bound_dev_if; 1804 break; 1805 1806 case SO_NETNS_COOKIE: 1807 lv = sizeof(u64); 1808 if (len != lv) 1809 return -EINVAL; 1810 v.val64 = sock_net(sk)->net_cookie; 1811 break; 1812 1813 case SO_BUF_LOCK: 1814 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 1815 break; 1816 1817 case SO_RESERVE_MEM: 1818 v.val = sk->sk_reserved_mem; 1819 break; 1820 1821 default: 1822 /* We implement the SO_SNDLOWAT etc to not be settable 1823 * (1003.1g 7). 1824 */ 1825 return -ENOPROTOOPT; 1826 } 1827 1828 if (len > lv) 1829 len = lv; 1830 if (copy_to_user(optval, &v, len)) 1831 return -EFAULT; 1832 lenout: 1833 if (put_user(len, optlen)) 1834 return -EFAULT; 1835 return 0; 1836 } 1837 1838 /* 1839 * Initialize an sk_lock. 1840 * 1841 * (We also register the sk_lock with the lock validator.) 1842 */ 1843 static inline void sock_lock_init(struct sock *sk) 1844 { 1845 if (sk->sk_kern_sock) 1846 sock_lock_init_class_and_name( 1847 sk, 1848 af_family_kern_slock_key_strings[sk->sk_family], 1849 af_family_kern_slock_keys + sk->sk_family, 1850 af_family_kern_key_strings[sk->sk_family], 1851 af_family_kern_keys + sk->sk_family); 1852 else 1853 sock_lock_init_class_and_name( 1854 sk, 1855 af_family_slock_key_strings[sk->sk_family], 1856 af_family_slock_keys + sk->sk_family, 1857 af_family_key_strings[sk->sk_family], 1858 af_family_keys + sk->sk_family); 1859 } 1860 1861 /* 1862 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1863 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1864 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1865 */ 1866 static void sock_copy(struct sock *nsk, const struct sock *osk) 1867 { 1868 const struct proto *prot = READ_ONCE(osk->sk_prot); 1869 #ifdef CONFIG_SECURITY_NETWORK 1870 void *sptr = nsk->sk_security; 1871 #endif 1872 1873 /* If we move sk_tx_queue_mapping out of the private section, 1874 * we must check if sk_tx_queue_clear() is called after 1875 * sock_copy() in sk_clone_lock(). 
1876 */ 1877 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 1878 offsetof(struct sock, sk_dontcopy_begin) || 1879 offsetof(struct sock, sk_tx_queue_mapping) >= 1880 offsetof(struct sock, sk_dontcopy_end)); 1881 1882 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1883 1884 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1885 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1886 1887 #ifdef CONFIG_SECURITY_NETWORK 1888 nsk->sk_security = sptr; 1889 security_sk_clone(osk, nsk); 1890 #endif 1891 } 1892 1893 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1894 int family) 1895 { 1896 struct sock *sk; 1897 struct kmem_cache *slab; 1898 1899 slab = prot->slab; 1900 if (slab != NULL) { 1901 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1902 if (!sk) 1903 return sk; 1904 if (want_init_on_alloc(priority)) 1905 sk_prot_clear_nulls(sk, prot->obj_size); 1906 } else 1907 sk = kmalloc(prot->obj_size, priority); 1908 1909 if (sk != NULL) { 1910 if (security_sk_alloc(sk, family, priority)) 1911 goto out_free; 1912 1913 if (!try_module_get(prot->owner)) 1914 goto out_free_sec; 1915 } 1916 1917 return sk; 1918 1919 out_free_sec: 1920 security_sk_free(sk); 1921 out_free: 1922 if (slab != NULL) 1923 kmem_cache_free(slab, sk); 1924 else 1925 kfree(sk); 1926 return NULL; 1927 } 1928 1929 static void sk_prot_free(struct proto *prot, struct sock *sk) 1930 { 1931 struct kmem_cache *slab; 1932 struct module *owner; 1933 1934 owner = prot->owner; 1935 slab = prot->slab; 1936 1937 cgroup_sk_free(&sk->sk_cgrp_data); 1938 mem_cgroup_sk_free(sk); 1939 security_sk_free(sk); 1940 if (slab != NULL) 1941 kmem_cache_free(slab, sk); 1942 else 1943 kfree(sk); 1944 module_put(owner); 1945 } 1946 1947 /** 1948 * sk_alloc - All socket objects are allocated here 1949 * @net: the applicable net namespace 1950 * @family: protocol family 1951 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1952 * @prot: struct proto associated with this new sock instance 1953 * @kern: is this to be a kernel socket? 1954 */ 1955 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1956 struct proto *prot, int kern) 1957 { 1958 struct sock *sk; 1959 1960 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1961 if (sk) { 1962 sk->sk_family = family; 1963 /* 1964 * See comment in struct sock definition to understand 1965 * why we need sk_prot_creator -acme 1966 */ 1967 sk->sk_prot = sk->sk_prot_creator = prot; 1968 sk->sk_kern_sock = kern; 1969 sock_lock_init(sk); 1970 sk->sk_net_refcnt = kern ? 0 : 1; 1971 if (likely(sk->sk_net_refcnt)) { 1972 get_net(net); 1973 sock_inuse_add(net, 1); 1974 } 1975 1976 sock_net_set(sk, net); 1977 refcount_set(&sk->sk_wmem_alloc, 1); 1978 1979 mem_cgroup_sk_alloc(sk); 1980 cgroup_sk_alloc(&sk->sk_cgrp_data); 1981 sock_update_classid(&sk->sk_cgrp_data); 1982 sock_update_netprioidx(&sk->sk_cgrp_data); 1983 sk_tx_queue_clear(sk); 1984 } 1985 1986 return sk; 1987 } 1988 EXPORT_SYMBOL(sk_alloc); 1989 1990 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1991 * grace period. This is the case for UDP sockets and TCP listeners. 
1992 */ 1993 static void __sk_destruct(struct rcu_head *head) 1994 { 1995 struct sock *sk = container_of(head, struct sock, sk_rcu); 1996 struct sk_filter *filter; 1997 1998 if (sk->sk_destruct) 1999 sk->sk_destruct(sk); 2000 2001 filter = rcu_dereference_check(sk->sk_filter, 2002 refcount_read(&sk->sk_wmem_alloc) == 0); 2003 if (filter) { 2004 sk_filter_uncharge(sk, filter); 2005 RCU_INIT_POINTER(sk->sk_filter, NULL); 2006 } 2007 2008 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2009 2010 #ifdef CONFIG_BPF_SYSCALL 2011 bpf_sk_storage_free(sk); 2012 #endif 2013 2014 if (atomic_read(&sk->sk_omem_alloc)) 2015 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2016 __func__, atomic_read(&sk->sk_omem_alloc)); 2017 2018 if (sk->sk_frag.page) { 2019 put_page(sk->sk_frag.page); 2020 sk->sk_frag.page = NULL; 2021 } 2022 2023 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2024 put_cred(sk->sk_peer_cred); 2025 put_pid(sk->sk_peer_pid); 2026 2027 if (likely(sk->sk_net_refcnt)) 2028 put_net(sock_net(sk)); 2029 sk_prot_free(sk->sk_prot_creator, sk); 2030 } 2031 2032 void sk_destruct(struct sock *sk) 2033 { 2034 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2035 2036 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2037 reuseport_detach_sock(sk); 2038 use_call_rcu = true; 2039 } 2040 2041 if (use_call_rcu) 2042 call_rcu(&sk->sk_rcu, __sk_destruct); 2043 else 2044 __sk_destruct(&sk->sk_rcu); 2045 } 2046 2047 static void __sk_free(struct sock *sk) 2048 { 2049 if (likely(sk->sk_net_refcnt)) 2050 sock_inuse_add(sock_net(sk), -1); 2051 2052 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2053 sock_diag_broadcast_destroy(sk); 2054 else 2055 sk_destruct(sk); 2056 } 2057 2058 void sk_free(struct sock *sk) 2059 { 2060 /* 2061 * We subtract one from sk_wmem_alloc and can know if 2062 * some packets are still in some tx queue. 
2063 * If not null, sock_wfree() will call __sk_free(sk) later 2064 */ 2065 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2066 __sk_free(sk); 2067 } 2068 EXPORT_SYMBOL(sk_free); 2069 2070 static void sk_init_common(struct sock *sk) 2071 { 2072 skb_queue_head_init(&sk->sk_receive_queue); 2073 skb_queue_head_init(&sk->sk_write_queue); 2074 skb_queue_head_init(&sk->sk_error_queue); 2075 2076 rwlock_init(&sk->sk_callback_lock); 2077 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2078 af_rlock_keys + sk->sk_family, 2079 af_family_rlock_key_strings[sk->sk_family]); 2080 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2081 af_wlock_keys + sk->sk_family, 2082 af_family_wlock_key_strings[sk->sk_family]); 2083 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2084 af_elock_keys + sk->sk_family, 2085 af_family_elock_key_strings[sk->sk_family]); 2086 lockdep_set_class_and_name(&sk->sk_callback_lock, 2087 af_callback_keys + sk->sk_family, 2088 af_family_clock_key_strings[sk->sk_family]); 2089 } 2090 2091 /** 2092 * sk_clone_lock - clone a socket, and lock its clone 2093 * @sk: the socket to clone 2094 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2095 * 2096 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2097 */ 2098 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2099 { 2100 struct proto *prot = READ_ONCE(sk->sk_prot); 2101 struct sk_filter *filter; 2102 bool is_charged = true; 2103 struct sock *newsk; 2104 2105 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2106 if (!newsk) 2107 goto out; 2108 2109 sock_copy(newsk, sk); 2110 2111 newsk->sk_prot_creator = prot; 2112 2113 /* SANITY */ 2114 if (likely(newsk->sk_net_refcnt)) 2115 get_net(sock_net(newsk)); 2116 sk_node_init(&newsk->sk_node); 2117 sock_lock_init(newsk); 2118 bh_lock_sock(newsk); 2119 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2120 newsk->sk_backlog.len = 0; 2121 2122 atomic_set(&newsk->sk_rmem_alloc, 0); 2123 2124 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2125 refcount_set(&newsk->sk_wmem_alloc, 1); 2126 2127 atomic_set(&newsk->sk_omem_alloc, 0); 2128 sk_init_common(newsk); 2129 2130 newsk->sk_dst_cache = NULL; 2131 newsk->sk_dst_pending_confirm = 0; 2132 newsk->sk_wmem_queued = 0; 2133 newsk->sk_forward_alloc = 0; 2134 newsk->sk_reserved_mem = 0; 2135 atomic_set(&newsk->sk_drops, 0); 2136 newsk->sk_send_head = NULL; 2137 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2138 atomic_set(&newsk->sk_zckey, 0); 2139 2140 sock_reset_flag(newsk, SOCK_DONE); 2141 2142 /* sk->sk_memcg will be populated at accept() time */ 2143 newsk->sk_memcg = NULL; 2144 2145 cgroup_sk_clone(&newsk->sk_cgrp_data); 2146 2147 rcu_read_lock(); 2148 filter = rcu_dereference(sk->sk_filter); 2149 if (filter != NULL) 2150 /* though it's an empty new sock, the charging may fail 2151 * if sysctl_optmem_max was changed between creation of 2152 * original socket and cloning 2153 */ 2154 is_charged = sk_filter_charge(newsk, filter); 2155 RCU_INIT_POINTER(newsk->sk_filter, filter); 2156 rcu_read_unlock(); 2157 2158 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2159 /* We need to make sure that we don't uncharge the new 2160 * socket if we couldn't charge it in the first place 2161 * as otherwise we uncharge the parent's filter. 
2162 */ 2163 if (!is_charged) 2164 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2165 sk_free_unlock_clone(newsk); 2166 newsk = NULL; 2167 goto out; 2168 } 2169 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2170 2171 if (bpf_sk_storage_clone(sk, newsk)) { 2172 sk_free_unlock_clone(newsk); 2173 newsk = NULL; 2174 goto out; 2175 } 2176 2177 /* Clear sk_user_data if parent had the pointer tagged 2178 * as not suitable for copying when cloning. 2179 */ 2180 if (sk_user_data_is_nocopy(newsk)) 2181 newsk->sk_user_data = NULL; 2182 2183 newsk->sk_err = 0; 2184 newsk->sk_err_soft = 0; 2185 newsk->sk_priority = 0; 2186 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2187 if (likely(newsk->sk_net_refcnt)) 2188 sock_inuse_add(sock_net(newsk), 1); 2189 2190 /* Before updating sk_refcnt, we must commit prior changes to memory 2191 * (Documentation/RCU/rculist_nulls.rst for details) 2192 */ 2193 smp_wmb(); 2194 refcount_set(&newsk->sk_refcnt, 2); 2195 2196 /* Increment the counter in the same struct proto as the master 2197 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 2198 * is the same as sk->sk_prot->socks, as this field was copied 2199 * with memcpy). 2200 * 2201 * This _changes_ the previous behaviour, where 2202 * tcp_create_openreq_child always was incrementing the 2203 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 2204 * to be taken into account in all callers. -acme 2205 */ 2206 sk_refcnt_debug_inc(newsk); 2207 sk_set_socket(newsk, NULL); 2208 sk_tx_queue_clear(newsk); 2209 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2210 2211 if (newsk->sk_prot->sockets_allocated) 2212 sk_sockets_allocated_inc(newsk); 2213 2214 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2215 net_enable_timestamp(); 2216 out: 2217 return newsk; 2218 } 2219 EXPORT_SYMBOL_GPL(sk_clone_lock); 2220 2221 void sk_free_unlock_clone(struct sock *sk) 2222 { 2223 /* It is still raw copy of parent, so invalidate 2224 * destructor and make plain sk_free() */ 2225 sk->sk_destruct = NULL; 2226 bh_unlock_sock(sk); 2227 sk_free(sk); 2228 } 2229 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2230 2231 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2232 { 2233 u32 max_segs = 1; 2234 2235 sk_dst_set(sk, dst); 2236 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 2237 if (sk->sk_route_caps & NETIF_F_GSO) 2238 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2239 sk->sk_route_caps &= ~sk->sk_route_nocaps; 2240 if (sk_can_gso(sk)) { 2241 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2242 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2243 } else { 2244 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2245 sk->sk_gso_max_size = dst->dev->gso_max_size; 2246 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 2247 } 2248 } 2249 sk->sk_gso_max_segs = max_segs; 2250 } 2251 EXPORT_SYMBOL_GPL(sk_setup_caps); 2252 2253 /* 2254 * Simple resource managers for sockets. 2255 */ 2256 2257 2258 /* 2259 * Write buffer destructor automatically called from kfree_skb. 
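 *
 * Illustrative pairing (a sketch, not lifted from any particular caller):
 * a protocol that builds its own skbs charges them to the socket first,
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *	...
 *	kfree_skb(skb);
 *
 * skb_set_owner_w() adds skb->truesize to sk_wmem_alloc and sets
 * skb->destructor to sock_wfree(), so the kfree_skb() above returns the
 * charge, normally wakes writers via sk_write_space() and, once
 * sk_wmem_alloc drops to zero, finishes the __sk_free() that sk_free()
 * deferred. sock_wmalloc() below bundles the same steps with a send
 * buffer check.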
2260 */ 2261 void sock_wfree(struct sk_buff *skb) 2262 { 2263 struct sock *sk = skb->sk; 2264 unsigned int len = skb->truesize; 2265 2266 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2267 /* 2268 * Keep a reference on sk_wmem_alloc, this will be released 2269 * after sk_write_space() call 2270 */ 2271 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2272 sk->sk_write_space(sk); 2273 len = 1; 2274 } 2275 /* 2276 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2277 * could not do because of in-flight packets 2278 */ 2279 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2280 __sk_free(sk); 2281 } 2282 EXPORT_SYMBOL(sock_wfree); 2283 2284 /* This variant of sock_wfree() is used by TCP, 2285 * since it sets SOCK_USE_WRITE_QUEUE. 2286 */ 2287 void __sock_wfree(struct sk_buff *skb) 2288 { 2289 struct sock *sk = skb->sk; 2290 2291 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2292 __sk_free(sk); 2293 } 2294 2295 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2296 { 2297 skb_orphan(skb); 2298 skb->sk = sk; 2299 #ifdef CONFIG_INET 2300 if (unlikely(!sk_fullsock(sk))) { 2301 skb->destructor = sock_edemux; 2302 sock_hold(sk); 2303 return; 2304 } 2305 #endif 2306 skb->destructor = sock_wfree; 2307 skb_set_hash_from_sk(skb, sk); 2308 /* 2309 * We used to take a refcount on sk, but following operation 2310 * is enough to guarantee sk_free() wont free this sock until 2311 * all in-flight packets are completed 2312 */ 2313 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2314 } 2315 EXPORT_SYMBOL(skb_set_owner_w); 2316 2317 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2318 { 2319 #ifdef CONFIG_TLS_DEVICE 2320 /* Drivers depend on in-order delivery for crypto offload, 2321 * partial orphan breaks out-of-order-OK logic. 2322 */ 2323 if (skb->decrypted) 2324 return false; 2325 #endif 2326 return (skb->destructor == sock_wfree || 2327 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2328 } 2329 2330 /* This helper is used by netem, as it can hold packets in its 2331 * delay queue. We want to allow the owner socket to send more 2332 * packets, as if they were already TX completed by a typical driver. 2333 * But we also want to keep skb->sk set because some packet schedulers 2334 * rely on it (sch_fq for example). 2335 */ 2336 void skb_orphan_partial(struct sk_buff *skb) 2337 { 2338 if (skb_is_tcp_pure_ack(skb)) 2339 return; 2340 2341 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2342 return; 2343 2344 skb_orphan(skb); 2345 } 2346 EXPORT_SYMBOL(skb_orphan_partial); 2347 2348 /* 2349 * Read buffer destructor automatically called from kfree_skb. 2350 */ 2351 void sock_rfree(struct sk_buff *skb) 2352 { 2353 struct sock *sk = skb->sk; 2354 unsigned int len = skb->truesize; 2355 2356 atomic_sub(len, &sk->sk_rmem_alloc); 2357 sk_mem_uncharge(sk, len); 2358 } 2359 EXPORT_SYMBOL(sock_rfree); 2360 2361 /* 2362 * Buffer destructor for skbs that are not used directly in read or write 2363 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2364 */ 2365 void sock_efree(struct sk_buff *skb) 2366 { 2367 sock_put(skb->sk); 2368 } 2369 EXPORT_SYMBOL(sock_efree); 2370 2371 /* Buffer destructor for prefetch/receive path where reference count may 2372 * not be held, e.g. for listen sockets. 
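 *
 * Illustrative pairing (an assumption about callers, not something this
 * file enforces): code that attaches a possibly non-refcounted socket to
 * an skb does roughly
 *
 *	skb->sk = sk;
 *	skb->destructor = sock_pfree;
 *
 * so that the put below only happens when sk_is_refcounted() reports
 * that a reference was actually taken for this kind of socket.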
2373 */ 2374 #ifdef CONFIG_INET 2375 void sock_pfree(struct sk_buff *skb) 2376 { 2377 if (sk_is_refcounted(skb->sk)) 2378 sock_gen_put(skb->sk); 2379 } 2380 EXPORT_SYMBOL(sock_pfree); 2381 #endif /* CONFIG_INET */ 2382 2383 kuid_t sock_i_uid(struct sock *sk) 2384 { 2385 kuid_t uid; 2386 2387 read_lock_bh(&sk->sk_callback_lock); 2388 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2389 read_unlock_bh(&sk->sk_callback_lock); 2390 return uid; 2391 } 2392 EXPORT_SYMBOL(sock_i_uid); 2393 2394 unsigned long sock_i_ino(struct sock *sk) 2395 { 2396 unsigned long ino; 2397 2398 read_lock_bh(&sk->sk_callback_lock); 2399 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2400 read_unlock_bh(&sk->sk_callback_lock); 2401 return ino; 2402 } 2403 EXPORT_SYMBOL(sock_i_ino); 2404 2405 /* 2406 * Allocate a skb from the socket's send buffer. 2407 */ 2408 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2409 gfp_t priority) 2410 { 2411 if (force || 2412 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2413 struct sk_buff *skb = alloc_skb(size, priority); 2414 2415 if (skb) { 2416 skb_set_owner_w(skb, sk); 2417 return skb; 2418 } 2419 } 2420 return NULL; 2421 } 2422 EXPORT_SYMBOL(sock_wmalloc); 2423 2424 static void sock_ofree(struct sk_buff *skb) 2425 { 2426 struct sock *sk = skb->sk; 2427 2428 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2429 } 2430 2431 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2432 gfp_t priority) 2433 { 2434 struct sk_buff *skb; 2435 2436 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2437 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2438 sysctl_optmem_max) 2439 return NULL; 2440 2441 skb = alloc_skb(size, priority); 2442 if (!skb) 2443 return NULL; 2444 2445 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2446 skb->sk = sk; 2447 skb->destructor = sock_ofree; 2448 return skb; 2449 } 2450 2451 /* 2452 * Allocate a memory block from the socket's option memory buffer. 2453 */ 2454 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2455 { 2456 if ((unsigned int)size <= sysctl_optmem_max && 2457 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2458 void *mem; 2459 /* First do the add, to avoid the race if kmalloc 2460 * might sleep. 2461 */ 2462 atomic_add(size, &sk->sk_omem_alloc); 2463 mem = kmalloc(size, priority); 2464 if (mem) 2465 return mem; 2466 atomic_sub(size, &sk->sk_omem_alloc); 2467 } 2468 return NULL; 2469 } 2470 EXPORT_SYMBOL(sock_kmalloc); 2471 2472 /* Free an option memory block. Note, we actually want the inline 2473 * here as this allows gcc to detect the nullify and fold away the 2474 * condition entirely. 2475 */ 2476 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2477 const bool nullify) 2478 { 2479 if (WARN_ON_ONCE(!mem)) 2480 return; 2481 if (nullify) 2482 kfree_sensitive(mem); 2483 else 2484 kfree(mem); 2485 atomic_sub(size, &sk->sk_omem_alloc); 2486 } 2487 2488 void sock_kfree_s(struct sock *sk, void *mem, int size) 2489 { 2490 __sock_kfree_s(sk, mem, size, false); 2491 } 2492 EXPORT_SYMBOL(sock_kfree_s); 2493 2494 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2495 { 2496 __sock_kfree_s(sk, mem, size, true); 2497 } 2498 EXPORT_SYMBOL(sock_kzfree_s); 2499 2500 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2501 I think, these locks should be removed for datagram sockets. 
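
   This helper is only driven from sock_alloc_send_pskb() below.  A
   datagram sendmsg() implementation typically ends up blocking here when
   the send buffer is full, along the lines of (illustrative sketch, the
   "reserve" headroom is made up):

	skb = sock_alloc_send_skb(sk, len + reserve,
				  msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

   i.e. the caller either gets an skb already charged to the socket via
   skb_set_owner_w(), or sleeps in this loop until space frees up, a
   signal arrives, an error is raised or the timeout expires.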
2502 */ 2503 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2504 { 2505 DEFINE_WAIT(wait); 2506 2507 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2508 for (;;) { 2509 if (!timeo) 2510 break; 2511 if (signal_pending(current)) 2512 break; 2513 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2514 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2515 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2516 break; 2517 if (sk->sk_shutdown & SEND_SHUTDOWN) 2518 break; 2519 if (sk->sk_err) 2520 break; 2521 timeo = schedule_timeout(timeo); 2522 } 2523 finish_wait(sk_sleep(sk), &wait); 2524 return timeo; 2525 } 2526 2527 2528 /* 2529 * Generic send/receive buffer handlers 2530 */ 2531 2532 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2533 unsigned long data_len, int noblock, 2534 int *errcode, int max_page_order) 2535 { 2536 struct sk_buff *skb; 2537 long timeo; 2538 int err; 2539 2540 timeo = sock_sndtimeo(sk, noblock); 2541 for (;;) { 2542 err = sock_error(sk); 2543 if (err != 0) 2544 goto failure; 2545 2546 err = -EPIPE; 2547 if (sk->sk_shutdown & SEND_SHUTDOWN) 2548 goto failure; 2549 2550 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2551 break; 2552 2553 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2554 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2555 err = -EAGAIN; 2556 if (!timeo) 2557 goto failure; 2558 if (signal_pending(current)) 2559 goto interrupted; 2560 timeo = sock_wait_for_wmem(sk, timeo); 2561 } 2562 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2563 errcode, sk->sk_allocation); 2564 if (skb) 2565 skb_set_owner_w(skb, sk); 2566 return skb; 2567 2568 interrupted: 2569 err = sock_intr_errno(timeo); 2570 failure: 2571 *errcode = err; 2572 return NULL; 2573 } 2574 EXPORT_SYMBOL(sock_alloc_send_pskb); 2575 2576 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2577 int noblock, int *errcode) 2578 { 2579 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2580 } 2581 EXPORT_SYMBOL(sock_alloc_send_skb); 2582 2583 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2584 struct sockcm_cookie *sockc) 2585 { 2586 u32 tsflags; 2587 2588 switch (cmsg->cmsg_type) { 2589 case SO_MARK: 2590 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2591 return -EPERM; 2592 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2593 return -EINVAL; 2594 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2595 break; 2596 case SO_TIMESTAMPING_OLD: 2597 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2598 return -EINVAL; 2599 2600 tsflags = *(u32 *)CMSG_DATA(cmsg); 2601 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2602 return -EINVAL; 2603 2604 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2605 sockc->tsflags |= tsflags; 2606 break; 2607 case SCM_TXTIME: 2608 if (!sock_flag(sk, SOCK_TXTIME)) 2609 return -EINVAL; 2610 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2611 return -EINVAL; 2612 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2613 break; 2614 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ 2615 case SCM_RIGHTS: 2616 case SCM_CREDENTIALS: 2617 break; 2618 default: 2619 return -EINVAL; 2620 } 2621 return 0; 2622 } 2623 EXPORT_SYMBOL(__sock_cmsg_send); 2624 2625 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2626 struct sockcm_cookie *sockc) 2627 { 2628 struct cmsghdr *cmsg; 2629 int ret; 2630 2631 for_each_cmsghdr(cmsg, msg) { 2632 if (!CMSG_OK(msg, cmsg)) 2633 return -EINVAL; 2634 if (cmsg->cmsg_level != SOL_SOCKET) 2635 continue; 2636 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2637 if (ret) 2638 return ret; 2639 } 2640 return 0; 2641 } 2642 EXPORT_SYMBOL(sock_cmsg_send); 2643 2644 static void sk_enter_memory_pressure(struct sock *sk) 2645 { 2646 if (!sk->sk_prot->enter_memory_pressure) 2647 return; 2648 2649 sk->sk_prot->enter_memory_pressure(sk); 2650 } 2651 2652 static void sk_leave_memory_pressure(struct sock *sk) 2653 { 2654 if (sk->sk_prot->leave_memory_pressure) { 2655 sk->sk_prot->leave_memory_pressure(sk); 2656 } else { 2657 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2658 2659 if (memory_pressure && READ_ONCE(*memory_pressure)) 2660 WRITE_ONCE(*memory_pressure, 0); 2661 } 2662 } 2663 2664 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2665 2666 /** 2667 * skb_page_frag_refill - check that a page_frag contains enough room 2668 * @sz: minimum size of the fragment we want to get 2669 * @pfrag: pointer to page_frag 2670 * @gfp: priority for memory allocation 2671 * 2672 * Note: While this allocator tries to use high order pages, there is 2673 * no guarantee that allocations succeed. Therefore, @sz MUST be 2674 * less or equal than PAGE_SIZE. 2675 */ 2676 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2677 { 2678 if (pfrag->page) { 2679 if (page_ref_count(pfrag->page) == 1) { 2680 pfrag->offset = 0; 2681 return true; 2682 } 2683 if (pfrag->offset + sz <= pfrag->size) 2684 return true; 2685 put_page(pfrag->page); 2686 } 2687 2688 pfrag->offset = 0; 2689 if (SKB_FRAG_PAGE_ORDER && 2690 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2691 /* Avoid direct reclaim but allow kswapd to wake */ 2692 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2693 __GFP_COMP | __GFP_NOWARN | 2694 __GFP_NORETRY, 2695 SKB_FRAG_PAGE_ORDER); 2696 if (likely(pfrag->page)) { 2697 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2698 return true; 2699 } 2700 } 2701 pfrag->page = alloc_page(gfp); 2702 if (likely(pfrag->page)) { 2703 pfrag->size = PAGE_SIZE; 2704 return true; 2705 } 2706 return false; 2707 } 2708 EXPORT_SYMBOL(skb_page_frag_refill); 2709 2710 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2711 { 2712 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2713 return true; 2714 2715 sk_enter_memory_pressure(sk); 2716 sk_stream_moderate_sndbuf(sk); 2717 return false; 2718 } 2719 EXPORT_SYMBOL(sk_page_frag_refill); 2720 2721 void __lock_sock(struct sock *sk) 2722 __releases(&sk->sk_lock.slock) 2723 __acquires(&sk->sk_lock.slock) 2724 { 2725 DEFINE_WAIT(wait); 2726 2727 for (;;) { 2728 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2729 TASK_UNINTERRUPTIBLE); 2730 spin_unlock_bh(&sk->sk_lock.slock); 2731 schedule(); 2732 spin_lock_bh(&sk->sk_lock.slock); 2733 if (!sock_owned_by_user(sk)) 2734 break; 2735 } 2736 finish_wait(&sk->sk_lock.wq, &wait); 2737 } 2738 2739 void __release_sock(struct sock *sk) 2740 __releases(&sk->sk_lock.slock) 2741 __acquires(&sk->sk_lock.slock) 2742 { 2743 struct sk_buff *skb, *next; 2744 2745 while ((skb = sk->sk_backlog.head) 
!= NULL) {
2746 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2747
2748 spin_unlock_bh(&sk->sk_lock.slock);
2749
2750 do {
2751 next = skb->next;
2752 prefetch(next);
2753 WARN_ON_ONCE(skb_dst_is_noref(skb));
2754 skb_mark_not_on_list(skb);
2755 sk_backlog_rcv(sk, skb);
2756
2757 cond_resched();
2758
2759 skb = next;
2760 } while (skb != NULL);
2761
2762 spin_lock_bh(&sk->sk_lock.slock);
2763 }
2764
2765 /*
2766 * Doing the zeroing here guarantees we cannot loop forever
2767 * while a wild producer attempts to flood us.
2768 */
2769 sk->sk_backlog.len = 0;
2770 }
2771
2772 void __sk_flush_backlog(struct sock *sk)
2773 {
2774 spin_lock_bh(&sk->sk_lock.slock);
2775 __release_sock(sk);
2776 spin_unlock_bh(&sk->sk_lock.slock);
2777 }
2778
2779 /**
2780 * sk_wait_data - wait for data to arrive at sk_receive_queue
2781 * @sk: sock to wait on
2782 * @timeo: for how long
2783 * @skb: last skb seen on sk_receive_queue
2784 *
2785 * Now socket state including sk->sk_err is changed only under lock,
2786 * hence we may omit checks after joining wait queue.
2787 * We check receive queue before schedule() only as optimization;
2788 * it is very likely that release_sock() added new data.
2789 */
2790 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2791 {
2792 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2793 int rc;
2794
2795 add_wait_queue(sk_sleep(sk), &wait);
2796 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2797 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2798 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2799 remove_wait_queue(sk_sleep(sk), &wait);
2800 return rc;
2801 }
2802 EXPORT_SYMBOL(sk_wait_data);
2803
2804 /**
2805 * __sk_mem_raise_allocated - increase memory_allocated
2806 * @sk: socket
2807 * @size: memory size to allocate
2808 * @amt: pages to allocate
2809 * @kind: allocation type
2810 *
2811 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2812 */
2813 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2814 {
2815 struct proto *prot = sk->sk_prot;
2816 long allocated = sk_memory_allocated_add(sk, amt);
2817 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2818 bool charged = true;
2819
2820 if (memcg_charge &&
2821 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2822 gfp_memcg_charge())))
2823 goto suppress_allocation;
2824
2825 /* Under limit. */
2826 if (allocated <= sk_prot_mem_limits(sk, 0)) {
2827 sk_leave_memory_pressure(sk);
2828 return 1;
2829 }
2830
2831 /* Under pressure. */
2832 if (allocated > sk_prot_mem_limits(sk, 1))
2833 sk_enter_memory_pressure(sk);
2834
2835 /* Over hard limit.
*/ 2836 if (allocated > sk_prot_mem_limits(sk, 2)) 2837 goto suppress_allocation; 2838 2839 /* guarantee minimum buffer size under pressure */ 2840 if (kind == SK_MEM_RECV) { 2841 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2842 return 1; 2843 2844 } else { /* SK_MEM_SEND */ 2845 int wmem0 = sk_get_wmem0(sk, prot); 2846 2847 if (sk->sk_type == SOCK_STREAM) { 2848 if (sk->sk_wmem_queued < wmem0) 2849 return 1; 2850 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2851 return 1; 2852 } 2853 } 2854 2855 if (sk_has_memory_pressure(sk)) { 2856 u64 alloc; 2857 2858 if (!sk_under_memory_pressure(sk)) 2859 return 1; 2860 alloc = sk_sockets_allocated_read_positive(sk); 2861 if (sk_prot_mem_limits(sk, 2) > alloc * 2862 sk_mem_pages(sk->sk_wmem_queued + 2863 atomic_read(&sk->sk_rmem_alloc) + 2864 sk->sk_forward_alloc)) 2865 return 1; 2866 } 2867 2868 suppress_allocation: 2869 2870 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2871 sk_stream_moderate_sndbuf(sk); 2872 2873 /* Fail only if socket is _under_ its sndbuf. 2874 * In this case we cannot block, so that we have to fail. 2875 */ 2876 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 2877 /* Force charge with __GFP_NOFAIL */ 2878 if (memcg_charge && !charged) { 2879 mem_cgroup_charge_skmem(sk->sk_memcg, amt, 2880 gfp_memcg_charge() | __GFP_NOFAIL); 2881 } 2882 return 1; 2883 } 2884 } 2885 2886 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2887 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2888 2889 sk_memory_allocated_sub(sk, amt); 2890 2891 if (memcg_charge && charged) 2892 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2893 2894 return 0; 2895 } 2896 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2897 2898 /** 2899 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2900 * @sk: socket 2901 * @size: memory size to allocate 2902 * @kind: allocation type 2903 * 2904 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2905 * rmem allocation. This function assumes that protocols which have 2906 * memory_pressure use sk_wmem_queued as write buffer accounting. 
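 *
 * Protocols normally do not call this directly but go through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers; the usual charging
 * pattern is roughly (illustrative sketch):
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	sk_mem_charge(sk, skb->truesize);
 *
 * i.e. sk_forward_alloc is grown here in SK_MEM_QUANTUM chunks and then
 * consumed piecemeal by sk_mem_charge().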
2907 */ 2908 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2909 { 2910 int ret, amt = sk_mem_pages(size); 2911 2912 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2913 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2914 if (!ret) 2915 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2916 return ret; 2917 } 2918 EXPORT_SYMBOL(__sk_mem_schedule); 2919 2920 /** 2921 * __sk_mem_reduce_allocated - reclaim memory_allocated 2922 * @sk: socket 2923 * @amount: number of quanta 2924 * 2925 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2926 */ 2927 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2928 { 2929 sk_memory_allocated_sub(sk, amount); 2930 2931 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2932 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2933 2934 if (sk_under_memory_pressure(sk) && 2935 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2936 sk_leave_memory_pressure(sk); 2937 } 2938 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2939 2940 /** 2941 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2942 * @sk: socket 2943 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2944 */ 2945 void __sk_mem_reclaim(struct sock *sk, int amount) 2946 { 2947 amount >>= SK_MEM_QUANTUM_SHIFT; 2948 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2949 __sk_mem_reduce_allocated(sk, amount); 2950 } 2951 EXPORT_SYMBOL(__sk_mem_reclaim); 2952 2953 int sk_set_peek_off(struct sock *sk, int val) 2954 { 2955 sk->sk_peek_off = val; 2956 return 0; 2957 } 2958 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2959 2960 /* 2961 * Set of default routines for initialising struct proto_ops when 2962 * the protocol does not support a particular function. In certain 2963 * cases where it makes no sense for a protocol to have a "do nothing" 2964 * function, some default processing is provided. 
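 *
 * An illustrative (hypothetical) wiring for a datagram family with no
 * notion of listen/accept or mmap:
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_FOO,
 *		...
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};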
2965 */ 2966 2967 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2968 { 2969 return -EOPNOTSUPP; 2970 } 2971 EXPORT_SYMBOL(sock_no_bind); 2972 2973 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2974 int len, int flags) 2975 { 2976 return -EOPNOTSUPP; 2977 } 2978 EXPORT_SYMBOL(sock_no_connect); 2979 2980 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2981 { 2982 return -EOPNOTSUPP; 2983 } 2984 EXPORT_SYMBOL(sock_no_socketpair); 2985 2986 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2987 bool kern) 2988 { 2989 return -EOPNOTSUPP; 2990 } 2991 EXPORT_SYMBOL(sock_no_accept); 2992 2993 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2994 int peer) 2995 { 2996 return -EOPNOTSUPP; 2997 } 2998 EXPORT_SYMBOL(sock_no_getname); 2999 3000 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3001 { 3002 return -EOPNOTSUPP; 3003 } 3004 EXPORT_SYMBOL(sock_no_ioctl); 3005 3006 int sock_no_listen(struct socket *sock, int backlog) 3007 { 3008 return -EOPNOTSUPP; 3009 } 3010 EXPORT_SYMBOL(sock_no_listen); 3011 3012 int sock_no_shutdown(struct socket *sock, int how) 3013 { 3014 return -EOPNOTSUPP; 3015 } 3016 EXPORT_SYMBOL(sock_no_shutdown); 3017 3018 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3019 { 3020 return -EOPNOTSUPP; 3021 } 3022 EXPORT_SYMBOL(sock_no_sendmsg); 3023 3024 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3025 { 3026 return -EOPNOTSUPP; 3027 } 3028 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3029 3030 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3031 int flags) 3032 { 3033 return -EOPNOTSUPP; 3034 } 3035 EXPORT_SYMBOL(sock_no_recvmsg); 3036 3037 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3038 { 3039 /* Mirror missing mmap method error code */ 3040 return -ENODEV; 3041 } 3042 EXPORT_SYMBOL(sock_no_mmap); 3043 3044 /* 3045 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3046 * various sock-based usage counts. 
3047 */ 3048 void __receive_sock(struct file *file) 3049 { 3050 struct socket *sock; 3051 3052 sock = sock_from_file(file); 3053 if (sock) { 3054 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3055 sock_update_classid(&sock->sk->sk_cgrp_data); 3056 } 3057 } 3058 3059 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 3060 { 3061 ssize_t res; 3062 struct msghdr msg = {.msg_flags = flags}; 3063 struct kvec iov; 3064 char *kaddr = kmap(page); 3065 iov.iov_base = kaddr + offset; 3066 iov.iov_len = size; 3067 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 3068 kunmap(page); 3069 return res; 3070 } 3071 EXPORT_SYMBOL(sock_no_sendpage); 3072 3073 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 3074 int offset, size_t size, int flags) 3075 { 3076 ssize_t res; 3077 struct msghdr msg = {.msg_flags = flags}; 3078 struct kvec iov; 3079 char *kaddr = kmap(page); 3080 3081 iov.iov_base = kaddr + offset; 3082 iov.iov_len = size; 3083 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 3084 kunmap(page); 3085 return res; 3086 } 3087 EXPORT_SYMBOL(sock_no_sendpage_locked); 3088 3089 /* 3090 * Default Socket Callbacks 3091 */ 3092 3093 static void sock_def_wakeup(struct sock *sk) 3094 { 3095 struct socket_wq *wq; 3096 3097 rcu_read_lock(); 3098 wq = rcu_dereference(sk->sk_wq); 3099 if (skwq_has_sleeper(wq)) 3100 wake_up_interruptible_all(&wq->wait); 3101 rcu_read_unlock(); 3102 } 3103 3104 static void sock_def_error_report(struct sock *sk) 3105 { 3106 struct socket_wq *wq; 3107 3108 rcu_read_lock(); 3109 wq = rcu_dereference(sk->sk_wq); 3110 if (skwq_has_sleeper(wq)) 3111 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3112 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 3113 rcu_read_unlock(); 3114 } 3115 3116 void sock_def_readable(struct sock *sk) 3117 { 3118 struct socket_wq *wq; 3119 3120 rcu_read_lock(); 3121 wq = rcu_dereference(sk->sk_wq); 3122 if (skwq_has_sleeper(wq)) 3123 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3124 EPOLLRDNORM | EPOLLRDBAND); 3125 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 3126 rcu_read_unlock(); 3127 } 3128 3129 static void sock_def_write_space(struct sock *sk) 3130 { 3131 struct socket_wq *wq; 3132 3133 rcu_read_lock(); 3134 3135 /* Do not wake up a writer until he can make "significant" 3136 * progress. 
--DaveM 3137 */ 3138 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 3139 wq = rcu_dereference(sk->sk_wq); 3140 if (skwq_has_sleeper(wq)) 3141 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3142 EPOLLWRNORM | EPOLLWRBAND); 3143 3144 /* Should agree with poll, otherwise some programs break */ 3145 if (sock_writeable(sk)) 3146 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3147 } 3148 3149 rcu_read_unlock(); 3150 } 3151 3152 static void sock_def_destruct(struct sock *sk) 3153 { 3154 } 3155 3156 void sk_send_sigurg(struct sock *sk) 3157 { 3158 if (sk->sk_socket && sk->sk_socket->file) 3159 if (send_sigurg(&sk->sk_socket->file->f_owner)) 3160 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3161 } 3162 EXPORT_SYMBOL(sk_send_sigurg); 3163 3164 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3165 unsigned long expires) 3166 { 3167 if (!mod_timer(timer, expires)) 3168 sock_hold(sk); 3169 } 3170 EXPORT_SYMBOL(sk_reset_timer); 3171 3172 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3173 { 3174 if (del_timer(timer)) 3175 __sock_put(sk); 3176 } 3177 EXPORT_SYMBOL(sk_stop_timer); 3178 3179 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3180 { 3181 if (del_timer_sync(timer)) 3182 __sock_put(sk); 3183 } 3184 EXPORT_SYMBOL(sk_stop_timer_sync); 3185 3186 void sock_init_data(struct socket *sock, struct sock *sk) 3187 { 3188 sk_init_common(sk); 3189 sk->sk_send_head = NULL; 3190 3191 timer_setup(&sk->sk_timer, NULL, 0); 3192 3193 sk->sk_allocation = GFP_KERNEL; 3194 sk->sk_rcvbuf = sysctl_rmem_default; 3195 sk->sk_sndbuf = sysctl_wmem_default; 3196 sk->sk_state = TCP_CLOSE; 3197 sk_set_socket(sk, sock); 3198 3199 sock_set_flag(sk, SOCK_ZAPPED); 3200 3201 if (sock) { 3202 sk->sk_type = sock->type; 3203 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3204 sock->sk = sk; 3205 sk->sk_uid = SOCK_INODE(sock)->i_uid; 3206 } else { 3207 RCU_INIT_POINTER(sk->sk_wq, NULL); 3208 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 3209 } 3210 3211 rwlock_init(&sk->sk_callback_lock); 3212 if (sk->sk_kern_sock) 3213 lockdep_set_class_and_name( 3214 &sk->sk_callback_lock, 3215 af_kern_callback_keys + sk->sk_family, 3216 af_family_kern_clock_key_strings[sk->sk_family]); 3217 else 3218 lockdep_set_class_and_name( 3219 &sk->sk_callback_lock, 3220 af_callback_keys + sk->sk_family, 3221 af_family_clock_key_strings[sk->sk_family]); 3222 3223 sk->sk_state_change = sock_def_wakeup; 3224 sk->sk_data_ready = sock_def_readable; 3225 sk->sk_write_space = sock_def_write_space; 3226 sk->sk_error_report = sock_def_error_report; 3227 sk->sk_destruct = sock_def_destruct; 3228 3229 sk->sk_frag.page = NULL; 3230 sk->sk_frag.offset = 0; 3231 sk->sk_peek_off = -1; 3232 3233 sk->sk_peer_pid = NULL; 3234 sk->sk_peer_cred = NULL; 3235 spin_lock_init(&sk->sk_peer_lock); 3236 3237 sk->sk_write_pending = 0; 3238 sk->sk_rcvlowat = 1; 3239 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3240 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3241 3242 sk->sk_stamp = SK_DEFAULT_STAMP; 3243 #if BITS_PER_LONG==32 3244 seqlock_init(&sk->sk_stamp_seq); 3245 #endif 3246 atomic_set(&sk->sk_zckey, 0); 3247 3248 #ifdef CONFIG_NET_RX_BUSY_POLL 3249 sk->sk_napi_id = 0; 3250 sk->sk_ll_usec = sysctl_net_busy_read; 3251 #endif 3252 3253 sk->sk_max_pacing_rate = ~0UL; 3254 sk->sk_pacing_rate = ~0UL; 3255 WRITE_ONCE(sk->sk_pacing_shift, 10); 3256 sk->sk_incoming_cpu = -1; 3257 3258 sk_rx_queue_clear(sk); 3259 /* 3260 * Before updating sk_refcnt, we must commit prior changes to memory 3261 * 
(Documentation/RCU/rculist_nulls.rst for details) 3262 */ 3263 smp_wmb(); 3264 refcount_set(&sk->sk_refcnt, 1); 3265 atomic_set(&sk->sk_drops, 0); 3266 } 3267 EXPORT_SYMBOL(sock_init_data); 3268 3269 void lock_sock_nested(struct sock *sk, int subclass) 3270 { 3271 /* The sk_lock has mutex_lock() semantics here. */ 3272 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3273 3274 might_sleep(); 3275 spin_lock_bh(&sk->sk_lock.slock); 3276 if (sk->sk_lock.owned) 3277 __lock_sock(sk); 3278 sk->sk_lock.owned = 1; 3279 spin_unlock_bh(&sk->sk_lock.slock); 3280 } 3281 EXPORT_SYMBOL(lock_sock_nested); 3282 3283 void release_sock(struct sock *sk) 3284 { 3285 spin_lock_bh(&sk->sk_lock.slock); 3286 if (sk->sk_backlog.tail) 3287 __release_sock(sk); 3288 3289 /* Warning : release_cb() might need to release sk ownership, 3290 * ie call sock_release_ownership(sk) before us. 3291 */ 3292 if (sk->sk_prot->release_cb) 3293 sk->sk_prot->release_cb(sk); 3294 3295 sock_release_ownership(sk); 3296 if (waitqueue_active(&sk->sk_lock.wq)) 3297 wake_up(&sk->sk_lock.wq); 3298 spin_unlock_bh(&sk->sk_lock.slock); 3299 } 3300 EXPORT_SYMBOL(release_sock); 3301 3302 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3303 { 3304 might_sleep(); 3305 spin_lock_bh(&sk->sk_lock.slock); 3306 3307 if (!sk->sk_lock.owned) { 3308 /* 3309 * Fast path return with bottom halves disabled and 3310 * sock::sk_lock.slock held. 3311 * 3312 * The 'mutex' is not contended and holding 3313 * sock::sk_lock.slock prevents all other lockers to 3314 * proceed so the corresponding unlock_sock_fast() can 3315 * avoid the slow path of release_sock() completely and 3316 * just release slock. 3317 * 3318 * From a semantical POV this is equivalent to 'acquiring' 3319 * the 'mutex', hence the corresponding lockdep 3320 * mutex_release() has to happen in the fast path of 3321 * unlock_sock_fast(). 
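 *
 * A typical caller pattern is therefore (illustrative sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-sleeping work on the socket ...
 *	unlock_sock_fast(sk, slow);
 *
 * where the lock_sock_fast()/unlock_sock_fast() inlines in
 * include/net/sock.h take the cheap slock-only path whenever the
 * socket lock is not already owned.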
3322 */ 3323 return false; 3324 } 3325 3326 __lock_sock(sk); 3327 sk->sk_lock.owned = 1; 3328 __acquire(&sk->sk_lock.slock); 3329 spin_unlock_bh(&sk->sk_lock.slock); 3330 return true; 3331 } 3332 EXPORT_SYMBOL(__lock_sock_fast); 3333 3334 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3335 bool timeval, bool time32) 3336 { 3337 struct sock *sk = sock->sk; 3338 struct timespec64 ts; 3339 3340 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3341 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3342 if (ts.tv_sec == -1) 3343 return -ENOENT; 3344 if (ts.tv_sec == 0) { 3345 ktime_t kt = ktime_get_real(); 3346 sock_write_timestamp(sk, kt); 3347 ts = ktime_to_timespec64(kt); 3348 } 3349 3350 if (timeval) 3351 ts.tv_nsec /= 1000; 3352 3353 #ifdef CONFIG_COMPAT_32BIT_TIME 3354 if (time32) 3355 return put_old_timespec32(&ts, userstamp); 3356 #endif 3357 #ifdef CONFIG_SPARC64 3358 /* beware of padding in sparc64 timeval */ 3359 if (timeval && !in_compat_syscall()) { 3360 struct __kernel_old_timeval __user tv = { 3361 .tv_sec = ts.tv_sec, 3362 .tv_usec = ts.tv_nsec, 3363 }; 3364 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3365 return -EFAULT; 3366 return 0; 3367 } 3368 #endif 3369 return put_timespec64(&ts, userstamp); 3370 } 3371 EXPORT_SYMBOL(sock_gettstamp); 3372 3373 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3374 { 3375 if (!sock_flag(sk, flag)) { 3376 unsigned long previous_flags = sk->sk_flags; 3377 3378 sock_set_flag(sk, flag); 3379 /* 3380 * we just set one of the two flags which require net 3381 * time stamping, but time stamping might have been on 3382 * already because of the other one 3383 */ 3384 if (sock_needs_netstamp(sk) && 3385 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3386 net_enable_timestamp(); 3387 } 3388 } 3389 3390 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3391 int level, int type) 3392 { 3393 struct sock_exterr_skb *serr; 3394 struct sk_buff *skb; 3395 int copied, err; 3396 3397 err = -EAGAIN; 3398 skb = sock_dequeue_err_skb(sk); 3399 if (skb == NULL) 3400 goto out; 3401 3402 copied = skb->len; 3403 if (copied > len) { 3404 msg->msg_flags |= MSG_TRUNC; 3405 copied = len; 3406 } 3407 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3408 if (err) 3409 goto out_free_skb; 3410 3411 sock_recv_timestamp(msg, sk, skb); 3412 3413 serr = SKB_EXT_ERR(skb); 3414 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3415 3416 msg->msg_flags |= MSG_ERRQUEUE; 3417 err = copied; 3418 3419 out_free_skb: 3420 kfree_skb(skb); 3421 out: 3422 return err; 3423 } 3424 EXPORT_SYMBOL(sock_recv_errqueue); 3425 3426 /* 3427 * Get a socket option on an socket. 3428 * 3429 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3430 * asynchronous errors should be reported by getsockopt. We assume 3431 * this means if you specify SO_ERROR (otherwise whats the point of it). 
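 *
 * The sock_common_*() helpers below just forward to the struct proto
 * hooks; address families typically wire them directly into their
 * proto_ops tables, e.g. (illustrative):
 *
 *	.setsockopt	= sock_common_setsockopt,
 *	.getsockopt	= sock_common_getsockopt,
 *	.recvmsg	= sock_common_recvmsg,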
3432 */
3433 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3434 char __user *optval, int __user *optlen)
3435 {
3436 struct sock *sk = sock->sk;
3437
3438 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3439 }
3440 EXPORT_SYMBOL(sock_common_getsockopt);
3441
3442 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3443 int flags)
3444 {
3445 struct sock *sk = sock->sk;
3446 int addr_len = 0;
3447 int err;
3448
3449 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3450 flags & ~MSG_DONTWAIT, &addr_len);
3451 if (err >= 0)
3452 msg->msg_namelen = addr_len;
3453 return err;
3454 }
3455 EXPORT_SYMBOL(sock_common_recvmsg);
3456
3457 /*
3458 * Set socket options on an inet socket.
3459 */
3460 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3461 sockptr_t optval, unsigned int optlen)
3462 {
3463 struct sock *sk = sock->sk;
3464
3465 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3466 }
3467 EXPORT_SYMBOL(sock_common_setsockopt);
3468
3469 void sk_common_release(struct sock *sk)
3470 {
3471 if (sk->sk_prot->destroy)
3472 sk->sk_prot->destroy(sk);
3473
3474 /*
3475 * Observation: when sk_common_release() is called, processes no longer
3476 * have access to the socket, but the network stack still does.
3477 * Step one, detach it from networking:
3478 *
3479 * A. Remove from hash tables.
3480 */
3481
3482 sk->sk_prot->unhash(sk);
3483
3484 /*
3485 * At this point the socket cannot receive new packets, but it is possible
3486 * that some packets are still in flight because some CPU is running the
3487 * receiver and did the hash table lookup before we unhashed the socket.
3488 * They will reach the receive queue and be purged by the socket destructor.
3489 *
3490 * We also still have packets pending on the receive queue and, probably,
3491 * our own packets waiting in device queues. sock_destroy will drain the
3492 * receive queue, but transmitted packets will delay socket destruction
3493 * until the last reference is released.
3494 */ 3495 3496 sock_orphan(sk); 3497 3498 xfrm_sk_free_policy(sk); 3499 3500 sk_refcnt_debug_release(sk); 3501 3502 sock_put(sk); 3503 } 3504 EXPORT_SYMBOL(sk_common_release); 3505 3506 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3507 { 3508 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3509 3510 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3511 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3512 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3513 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3514 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3515 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3516 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3517 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3518 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3519 } 3520 3521 #ifdef CONFIG_PROC_FS 3522 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3523 struct prot_inuse { 3524 int val[PROTO_INUSE_NR]; 3525 }; 3526 3527 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3528 3529 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3530 { 3531 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3532 } 3533 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3534 3535 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3536 { 3537 int cpu, idx = prot->inuse_idx; 3538 int res = 0; 3539 3540 for_each_possible_cpu(cpu) 3541 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3542 3543 return res >= 0 ? res : 0; 3544 } 3545 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3546 3547 static void sock_inuse_add(struct net *net, int val) 3548 { 3549 this_cpu_add(*net->core.sock_inuse, val); 3550 } 3551 3552 int sock_inuse_get(struct net *net) 3553 { 3554 int cpu, res = 0; 3555 3556 for_each_possible_cpu(cpu) 3557 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3558 3559 return res; 3560 } 3561 3562 EXPORT_SYMBOL_GPL(sock_inuse_get); 3563 3564 static int __net_init sock_inuse_init_net(struct net *net) 3565 { 3566 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3567 if (net->core.prot_inuse == NULL) 3568 return -ENOMEM; 3569 3570 net->core.sock_inuse = alloc_percpu(int); 3571 if (net->core.sock_inuse == NULL) 3572 goto out; 3573 3574 return 0; 3575 3576 out: 3577 free_percpu(net->core.prot_inuse); 3578 return -ENOMEM; 3579 } 3580 3581 static void __net_exit sock_inuse_exit_net(struct net *net) 3582 { 3583 free_percpu(net->core.prot_inuse); 3584 free_percpu(net->core.sock_inuse); 3585 } 3586 3587 static struct pernet_operations net_inuse_ops = { 3588 .init = sock_inuse_init_net, 3589 .exit = sock_inuse_exit_net, 3590 }; 3591 3592 static __init int net_inuse_init(void) 3593 { 3594 if (register_pernet_subsys(&net_inuse_ops)) 3595 panic("Cannot initialize net inuse counters"); 3596 3597 return 0; 3598 } 3599 3600 core_initcall(net_inuse_init); 3601 3602 static int assign_proto_idx(struct proto *prot) 3603 { 3604 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3605 3606 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3607 pr_err("PROTO_INUSE_NR exhausted\n"); 3608 return -ENOSPC; 3609 } 3610 3611 set_bit(prot->inuse_idx, proto_inuse_idx); 3612 return 0; 3613 } 3614 3615 static void release_proto_idx(struct proto *prot) 3616 { 3617 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3618 clear_bit(prot->inuse_idx, proto_inuse_idx); 3619 } 3620 #else 3621 static inline int assign_proto_idx(struct proto *prot) 3622 { 3623 return 0; 3624 } 3625 3626 static inline 
void release_proto_idx(struct proto *prot) 3627 { 3628 } 3629 3630 static void sock_inuse_add(struct net *net, int val) 3631 { 3632 } 3633 #endif 3634 3635 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3636 { 3637 if (!twsk_prot) 3638 return; 3639 kfree(twsk_prot->twsk_slab_name); 3640 twsk_prot->twsk_slab_name = NULL; 3641 kmem_cache_destroy(twsk_prot->twsk_slab); 3642 twsk_prot->twsk_slab = NULL; 3643 } 3644 3645 static int tw_prot_init(const struct proto *prot) 3646 { 3647 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3648 3649 if (!twsk_prot) 3650 return 0; 3651 3652 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3653 prot->name); 3654 if (!twsk_prot->twsk_slab_name) 3655 return -ENOMEM; 3656 3657 twsk_prot->twsk_slab = 3658 kmem_cache_create(twsk_prot->twsk_slab_name, 3659 twsk_prot->twsk_obj_size, 0, 3660 SLAB_ACCOUNT | prot->slab_flags, 3661 NULL); 3662 if (!twsk_prot->twsk_slab) { 3663 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3664 prot->name); 3665 return -ENOMEM; 3666 } 3667 3668 return 0; 3669 } 3670 3671 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3672 { 3673 if (!rsk_prot) 3674 return; 3675 kfree(rsk_prot->slab_name); 3676 rsk_prot->slab_name = NULL; 3677 kmem_cache_destroy(rsk_prot->slab); 3678 rsk_prot->slab = NULL; 3679 } 3680 3681 static int req_prot_init(const struct proto *prot) 3682 { 3683 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3684 3685 if (!rsk_prot) 3686 return 0; 3687 3688 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3689 prot->name); 3690 if (!rsk_prot->slab_name) 3691 return -ENOMEM; 3692 3693 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3694 rsk_prot->obj_size, 0, 3695 SLAB_ACCOUNT | prot->slab_flags, 3696 NULL); 3697 3698 if (!rsk_prot->slab) { 3699 pr_crit("%s: Can't create request sock SLAB cache!\n", 3700 prot->name); 3701 return -ENOMEM; 3702 } 3703 return 0; 3704 } 3705 3706 int proto_register(struct proto *prot, int alloc_slab) 3707 { 3708 int ret = -ENOBUFS; 3709 3710 if (alloc_slab) { 3711 prot->slab = kmem_cache_create_usercopy(prot->name, 3712 prot->obj_size, 0, 3713 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3714 prot->slab_flags, 3715 prot->useroffset, prot->usersize, 3716 NULL); 3717 3718 if (prot->slab == NULL) { 3719 pr_crit("%s: Can't create sock SLAB cache!\n", 3720 prot->name); 3721 goto out; 3722 } 3723 3724 if (req_prot_init(prot)) 3725 goto out_free_request_sock_slab; 3726 3727 if (tw_prot_init(prot)) 3728 goto out_free_timewait_sock_slab; 3729 } 3730 3731 mutex_lock(&proto_list_mutex); 3732 ret = assign_proto_idx(prot); 3733 if (ret) { 3734 mutex_unlock(&proto_list_mutex); 3735 goto out_free_timewait_sock_slab; 3736 } 3737 list_add(&prot->node, &proto_list); 3738 mutex_unlock(&proto_list_mutex); 3739 return ret; 3740 3741 out_free_timewait_sock_slab: 3742 if (alloc_slab) 3743 tw_prot_cleanup(prot->twsk_prot); 3744 out_free_request_sock_slab: 3745 if (alloc_slab) { 3746 req_prot_cleanup(prot->rsk_prot); 3747 3748 kmem_cache_destroy(prot->slab); 3749 prot->slab = NULL; 3750 } 3751 out: 3752 return ret; 3753 } 3754 EXPORT_SYMBOL(proto_register); 3755 3756 void proto_unregister(struct proto *prot) 3757 { 3758 mutex_lock(&proto_list_mutex); 3759 release_proto_idx(prot); 3760 list_del(&prot->node); 3761 mutex_unlock(&proto_list_mutex); 3762 3763 kmem_cache_destroy(prot->slab); 3764 prot->slab = NULL; 3765 3766 req_prot_cleanup(prot->rsk_prot); 3767 tw_prot_cleanup(prot->twsk_prot); 3768 } 3769 EXPORT_SYMBOL(proto_unregister); 3770 
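/*
 * Registration sketch (illustrative, with hypothetical names): a protocol
 * module pairs proto_register()/proto_unregister() in its init/exit paths,
 * typically together with sock_register() for its address family:
 *
 *	static int __init foo_init(void)
 *	{
 *		int rc = proto_register(&foo_proto, 1);
 *
 *		if (rc)
 *			return rc;
 *		rc = sock_register(&foo_family_ops);
 *		if (rc)
 *			proto_unregister(&foo_proto);
 *		return rc;
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		sock_unregister(PF_FOO);
 *		proto_unregister(&foo_proto);
 *	}
 */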
3771 int sock_load_diag_module(int family, int protocol) 3772 { 3773 if (!protocol) { 3774 if (!sock_is_registered(family)) 3775 return -ENOENT; 3776 3777 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3778 NETLINK_SOCK_DIAG, family); 3779 } 3780 3781 #ifdef CONFIG_INET 3782 if (family == AF_INET && 3783 protocol != IPPROTO_RAW && 3784 protocol < MAX_INET_PROTOS && 3785 !rcu_access_pointer(inet_protos[protocol])) 3786 return -ENOENT; 3787 #endif 3788 3789 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3790 NETLINK_SOCK_DIAG, family, protocol); 3791 } 3792 EXPORT_SYMBOL(sock_load_diag_module); 3793 3794 #ifdef CONFIG_PROC_FS 3795 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3796 __acquires(proto_list_mutex) 3797 { 3798 mutex_lock(&proto_list_mutex); 3799 return seq_list_start_head(&proto_list, *pos); 3800 } 3801 3802 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3803 { 3804 return seq_list_next(v, &proto_list, pos); 3805 } 3806 3807 static void proto_seq_stop(struct seq_file *seq, void *v) 3808 __releases(proto_list_mutex) 3809 { 3810 mutex_unlock(&proto_list_mutex); 3811 } 3812 3813 static char proto_method_implemented(const void *method) 3814 { 3815 return method == NULL ? 'n' : 'y'; 3816 } 3817 static long sock_prot_memory_allocated(struct proto *proto) 3818 { 3819 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3820 } 3821 3822 static const char *sock_prot_memory_pressure(struct proto *proto) 3823 { 3824 return proto->memory_pressure != NULL ? 3825 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3826 } 3827 3828 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3829 { 3830 3831 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3832 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3833 proto->name, 3834 proto->obj_size, 3835 sock_prot_inuse_get(seq_file_net(seq), proto), 3836 sock_prot_memory_allocated(proto), 3837 sock_prot_memory_pressure(proto), 3838 proto->max_header, 3839 proto->slab == NULL ? 
"no" : "yes", 3840 module_name(proto->owner), 3841 proto_method_implemented(proto->close), 3842 proto_method_implemented(proto->connect), 3843 proto_method_implemented(proto->disconnect), 3844 proto_method_implemented(proto->accept), 3845 proto_method_implemented(proto->ioctl), 3846 proto_method_implemented(proto->init), 3847 proto_method_implemented(proto->destroy), 3848 proto_method_implemented(proto->shutdown), 3849 proto_method_implemented(proto->setsockopt), 3850 proto_method_implemented(proto->getsockopt), 3851 proto_method_implemented(proto->sendmsg), 3852 proto_method_implemented(proto->recvmsg), 3853 proto_method_implemented(proto->sendpage), 3854 proto_method_implemented(proto->bind), 3855 proto_method_implemented(proto->backlog_rcv), 3856 proto_method_implemented(proto->hash), 3857 proto_method_implemented(proto->unhash), 3858 proto_method_implemented(proto->get_port), 3859 proto_method_implemented(proto->enter_memory_pressure)); 3860 } 3861 3862 static int proto_seq_show(struct seq_file *seq, void *v) 3863 { 3864 if (v == &proto_list) 3865 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3866 "protocol", 3867 "size", 3868 "sockets", 3869 "memory", 3870 "press", 3871 "maxhdr", 3872 "slab", 3873 "module", 3874 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3875 else 3876 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3877 return 0; 3878 } 3879 3880 static const struct seq_operations proto_seq_ops = { 3881 .start = proto_seq_start, 3882 .next = proto_seq_next, 3883 .stop = proto_seq_stop, 3884 .show = proto_seq_show, 3885 }; 3886 3887 static __net_init int proto_init_net(struct net *net) 3888 { 3889 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3890 sizeof(struct seq_net_private))) 3891 return -ENOMEM; 3892 3893 return 0; 3894 } 3895 3896 static __net_exit void proto_exit_net(struct net *net) 3897 { 3898 remove_proc_entry("protocols", net->proc_net); 3899 } 3900 3901 3902 static __net_initdata struct pernet_operations proto_net_ops = { 3903 .init = proto_init_net, 3904 .exit = proto_exit_net, 3905 }; 3906 3907 static int __init proto_init(void) 3908 { 3909 return register_pernet_subsys(&proto_net_ops); 3910 } 3911 3912 subsys_initcall(proto_init); 3913 3914 #endif /* PROC_FS */ 3915 3916 #ifdef CONFIG_NET_RX_BUSY_POLL 3917 bool sk_busy_loop_end(void *p, unsigned long start_time) 3918 { 3919 struct sock *sk = p; 3920 3921 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3922 sk_busy_loop_timeout(sk, start_time); 3923 } 3924 EXPORT_SYMBOL(sk_busy_loop_end); 3925 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3926 3927 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3928 { 3929 if (!sk->sk_prot->bind_add) 3930 return -EOPNOTSUPP; 3931 return sk->sk_prot->bind_add(sk, addr, addr_len); 3932 } 3933 EXPORT_SYMBOL(sock_bind_add); 3934