1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <asm/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/init.h> 111 #include <linux/highmem.h> 112 #include <linux/user_namespace.h> 113 #include <linux/static_key.h> 114 #include <linux/memcontrol.h> 115 #include <linux/prefetch.h> 116 #include <linux/compat.h> 117 118 #include <linux/uaccess.h> 119 120 #include <linux/netdevice.h> 121 #include <net/protocol.h> 122 #include <linux/skbuff.h> 123 #include <net/net_namespace.h> 124 #include <net/request_sock.h> 125 #include <net/sock.h> 126 #include <linux/net_tstamp.h> 127 #include <net/xfrm.h> 128 #include <linux/ipsec.h> 129 #include <net/cls_cgroup.h> 130 #include <net/netprio_cgroup.h> 131 #include <linux/sock_diag.h> 132 133 #include <linux/filter.h> 134 #include <net/sock_reuseport.h> 135 #include <net/bpf_sk_storage.h> 136 137 #include <trace/events/sock.h> 138 139 #include <net/tcp.h> 140 #include <net/busy_poll.h> 141 142 #include <linux/ethtool.h> 143 144 static DEFINE_MUTEX(proto_list_mutex); 145 static LIST_HEAD(proto_list); 146 147 /** 148 * sk_ns_capable - General socket capability test 149 * @sk: Socket to use a capability on or through 150 * @user_ns: The user namespace of the capability to use 151 * @cap: The capability to use 152 * 153 * Test to see if the opener of the socket had when the socket was 154 * created and the current process has the capability @cap in the user 155 * namespace @user_ns. 156 */ 157 bool sk_ns_capable(const struct sock *sk, 158 struct user_namespace *user_ns, int cap) 159 { 160 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 161 ns_capable(user_ns, cap); 162 } 163 EXPORT_SYMBOL(sk_ns_capable); 164 165 /** 166 * sk_capable - Socket global capability test 167 * @sk: Socket to use a capability on or through 168 * @cap: The global capability to use 169 * 170 * Test to see if the opener of the socket had when the socket was 171 * created and the current process has the capability @cap in all user 172 * namespaces. 
173 */ 174 bool sk_capable(const struct sock *sk, int cap) 175 { 176 return sk_ns_capable(sk, &init_user_ns, cap); 177 } 178 EXPORT_SYMBOL(sk_capable); 179 180 /** 181 * sk_net_capable - Network namespace socket capability test 182 * @sk: Socket to use a capability on or through 183 * @cap: The capability to use 184 * 185 * Test to see if the opener of the socket had when the socket was created 186 * and the current process has the capability @cap over the network namespace 187 * the socket is a member of. 188 */ 189 bool sk_net_capable(const struct sock *sk, int cap) 190 { 191 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 192 } 193 EXPORT_SYMBOL(sk_net_capable); 194 195 /* 196 * Each address family might have different locking rules, so we have 197 * one slock key per address family and separate keys for internal and 198 * userspace sockets. 199 */ 200 static struct lock_class_key af_family_keys[AF_MAX]; 201 static struct lock_class_key af_family_kern_keys[AF_MAX]; 202 static struct lock_class_key af_family_slock_keys[AF_MAX]; 203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 204 205 /* 206 * Make lock validator output more readable. (we pre-construct these 207 * strings build-time, so that runtime initialization of socket 208 * locks is fast): 209 */ 210 211 #define _sock_locks(x) \ 212 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 213 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 214 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 215 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 216 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 217 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 218 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 219 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 220 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 221 x "27" , x "28" , x "AF_CAN" , \ 222 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 223 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 224 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 225 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 226 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 227 x "AF_MCTP" , \ 228 x "AF_MAX" 229 230 static const char *const af_family_key_strings[AF_MAX+1] = { 231 _sock_locks("sk_lock-") 232 }; 233 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 234 _sock_locks("slock-") 235 }; 236 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 237 _sock_locks("clock-") 238 }; 239 240 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 241 _sock_locks("k-sk_lock-") 242 }; 243 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 244 _sock_locks("k-slock-") 245 }; 246 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 247 _sock_locks("k-clock-") 248 }; 249 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 250 _sock_locks("rlock-") 251 }; 252 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 253 _sock_locks("wlock-") 254 }; 255 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 256 _sock_locks("elock-") 257 }; 258 259 /* 260 * sk_callback_lock and sk queues locking rules are per-address-family, 261 * so split the lock classes by using a per-AF key: 262 */ 263 static struct lock_class_key af_callback_keys[AF_MAX]; 264 static struct lock_class_key af_rlock_keys[AF_MAX]; 265 static struct lock_class_key af_wlock_keys[AF_MAX]; 266 static struct lock_class_key af_elock_keys[AF_MAX]; 267 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 268 269 /* Run time adjustable parameters. 
*/ 270 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 271 EXPORT_SYMBOL(sysctl_wmem_max); 272 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 273 EXPORT_SYMBOL(sysctl_rmem_max); 274 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 275 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 276 277 /* Maximal space eaten by iovec or ancillary data plus some space */ 278 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 279 EXPORT_SYMBOL(sysctl_optmem_max); 280 281 int sysctl_tstamp_allow_data __read_mostly = 1; 282 283 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 284 EXPORT_SYMBOL_GPL(memalloc_socks_key); 285 286 /** 287 * sk_set_memalloc - sets %SOCK_MEMALLOC 288 * @sk: socket to set it on 289 * 290 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 291 * It's the responsibility of the admin to adjust min_free_kbytes 292 * to meet the requirements 293 */ 294 void sk_set_memalloc(struct sock *sk) 295 { 296 sock_set_flag(sk, SOCK_MEMALLOC); 297 sk->sk_allocation |= __GFP_MEMALLOC; 298 static_branch_inc(&memalloc_socks_key); 299 } 300 EXPORT_SYMBOL_GPL(sk_set_memalloc); 301 302 void sk_clear_memalloc(struct sock *sk) 303 { 304 sock_reset_flag(sk, SOCK_MEMALLOC); 305 sk->sk_allocation &= ~__GFP_MEMALLOC; 306 static_branch_dec(&memalloc_socks_key); 307 308 /* 309 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 310 * progress of swapping. SOCK_MEMALLOC may be cleared while 311 * it has rmem allocations due to the last swapfile being deactivated 312 * but there is a risk that the socket is unusable due to exceeding 313 * the rmem limits. Reclaim the reserves and obey rmem limits again. 314 */ 315 sk_mem_reclaim(sk); 316 } 317 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 318 319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 320 { 321 int ret; 322 unsigned int noreclaim_flag; 323 324 /* these should have been dropped before queueing */ 325 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 326 327 noreclaim_flag = memalloc_noreclaim_save(); 328 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 329 tcp_v6_do_rcv, 330 tcp_v4_do_rcv, 331 sk, skb); 332 memalloc_noreclaim_restore(noreclaim_flag); 333 334 return ret; 335 } 336 EXPORT_SYMBOL(__sk_backlog_rcv); 337 338 void sk_error_report(struct sock *sk) 339 { 340 sk->sk_error_report(sk); 341 342 switch (sk->sk_family) { 343 case AF_INET: 344 fallthrough; 345 case AF_INET6: 346 trace_inet_sk_error_report(sk); 347 break; 348 default: 349 break; 350 } 351 } 352 EXPORT_SYMBOL(sk_error_report); 353 354 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 355 { 356 struct __kernel_sock_timeval tv; 357 358 if (timeo == MAX_SCHEDULE_TIMEOUT) { 359 tv.tv_sec = 0; 360 tv.tv_usec = 0; 361 } else { 362 tv.tv_sec = timeo / HZ; 363 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 364 } 365 366 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 367 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 368 *(struct old_timeval32 *)optval = tv32; 369 return sizeof(tv32); 370 } 371 372 if (old_timeval) { 373 struct __kernel_old_timeval old_tv; 374 old_tv.tv_sec = tv.tv_sec; 375 old_tv.tv_usec = tv.tv_usec; 376 *(struct __kernel_old_timeval *)optval = old_tv; 377 return sizeof(old_tv); 378 } 379 380 *(struct __kernel_sock_timeval *)optval = tv; 381 return sizeof(tv); 382 } 383 EXPORT_SYMBOL(sock_get_timeout); 384 385 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 386 sockptr_t optval, int optlen, bool old_timeval) 387 { 388 if (old_timeval && in_compat_syscall() 
&& !COMPAT_USE_64BIT_TIME) { 389 struct old_timeval32 tv32; 390 391 if (optlen < sizeof(tv32)) 392 return -EINVAL; 393 394 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 395 return -EFAULT; 396 tv->tv_sec = tv32.tv_sec; 397 tv->tv_usec = tv32.tv_usec; 398 } else if (old_timeval) { 399 struct __kernel_old_timeval old_tv; 400 401 if (optlen < sizeof(old_tv)) 402 return -EINVAL; 403 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 404 return -EFAULT; 405 tv->tv_sec = old_tv.tv_sec; 406 tv->tv_usec = old_tv.tv_usec; 407 } else { 408 if (optlen < sizeof(*tv)) 409 return -EINVAL; 410 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 411 return -EFAULT; 412 } 413 414 return 0; 415 } 416 EXPORT_SYMBOL(sock_copy_user_timeval); 417 418 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 419 bool old_timeval) 420 { 421 struct __kernel_sock_timeval tv; 422 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 423 424 if (err) 425 return err; 426 427 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 428 return -EDOM; 429 430 if (tv.tv_sec < 0) { 431 static int warned __read_mostly; 432 433 *timeo_p = 0; 434 if (warned < 10 && net_ratelimit()) { 435 warned++; 436 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 437 __func__, current->comm, task_pid_nr(current)); 438 } 439 return 0; 440 } 441 *timeo_p = MAX_SCHEDULE_TIMEOUT; 442 if (tv.tv_sec == 0 && tv.tv_usec == 0) 443 return 0; 444 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) 445 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ); 446 return 0; 447 } 448 449 static bool sock_needs_netstamp(const struct sock *sk) 450 { 451 switch (sk->sk_family) { 452 case AF_UNSPEC: 453 case AF_UNIX: 454 return false; 455 default: 456 return true; 457 } 458 } 459 460 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 461 { 462 if (sk->sk_flags & flags) { 463 sk->sk_flags &= ~flags; 464 if (sock_needs_netstamp(sk) && 465 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 466 net_disable_timestamp(); 467 } 468 } 469 470 471 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 472 { 473 unsigned long flags; 474 struct sk_buff_head *list = &sk->sk_receive_queue; 475 476 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { 477 atomic_inc(&sk->sk_drops); 478 trace_sock_rcvqueue_full(sk, skb); 479 return -ENOMEM; 480 } 481 482 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 483 atomic_inc(&sk->sk_drops); 484 return -ENOBUFS; 485 } 486 487 skb->dev = NULL; 488 skb_set_owner_r(skb, sk); 489 490 /* we escape from rcu protected region, make sure we dont leak 491 * a norefcounted dst 492 */ 493 skb_dst_force(skb); 494 495 spin_lock_irqsave(&list->lock, flags); 496 sock_skb_set_dropcount(sk, skb); 497 __skb_queue_tail(list, skb); 498 spin_unlock_irqrestore(&list->lock, flags); 499 500 if (!sock_flag(sk, SOCK_DEAD)) 501 sk->sk_data_ready(sk); 502 return 0; 503 } 504 EXPORT_SYMBOL(__sock_queue_rcv_skb); 505 506 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 507 { 508 int err; 509 510 err = sk_filter(sk, skb); 511 if (err) 512 return err; 513 514 return __sock_queue_rcv_skb(sk, skb); 515 } 516 EXPORT_SYMBOL(sock_queue_rcv_skb); 517 518 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 519 const int nested, unsigned int trim_cap, bool refcounted) 520 { 521 int rc = NET_RX_SUCCESS; 522 523 if (sk_filter_trim_cap(sk, skb, trim_cap)) 524 goto discard_and_relse; 525 526 skb->dev = NULL; 527 528 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { 529 
atomic_inc(&sk->sk_drops); 530 goto discard_and_relse; 531 } 532 if (nested) 533 bh_lock_sock_nested(sk); 534 else 535 bh_lock_sock(sk); 536 if (!sock_owned_by_user(sk)) { 537 /* 538 * trylock + unlock semantics: 539 */ 540 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 541 542 rc = sk_backlog_rcv(sk, skb); 543 544 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 545 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 546 bh_unlock_sock(sk); 547 atomic_inc(&sk->sk_drops); 548 goto discard_and_relse; 549 } 550 551 bh_unlock_sock(sk); 552 out: 553 if (refcounted) 554 sock_put(sk); 555 return rc; 556 discard_and_relse: 557 kfree_skb(skb); 558 goto out; 559 } 560 EXPORT_SYMBOL(__sk_receive_skb); 561 562 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 563 u32)); 564 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 565 u32)); 566 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 567 { 568 struct dst_entry *dst = __sk_dst_get(sk); 569 570 if (dst && dst->obsolete && 571 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 572 dst, cookie) == NULL) { 573 sk_tx_queue_clear(sk); 574 sk->sk_dst_pending_confirm = 0; 575 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 576 dst_release(dst); 577 return NULL; 578 } 579 580 return dst; 581 } 582 EXPORT_SYMBOL(__sk_dst_check); 583 584 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 585 { 586 struct dst_entry *dst = sk_dst_get(sk); 587 588 if (dst && dst->obsolete && 589 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 590 dst, cookie) == NULL) { 591 sk_dst_reset(sk); 592 dst_release(dst); 593 return NULL; 594 } 595 596 return dst; 597 } 598 EXPORT_SYMBOL(sk_dst_check); 599 600 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 601 { 602 int ret = -ENOPROTOOPT; 603 #ifdef CONFIG_NETDEVICES 604 struct net *net = sock_net(sk); 605 606 /* Sorry... */ 607 ret = -EPERM; 608 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 609 goto out; 610 611 ret = -EINVAL; 612 if (ifindex < 0) 613 goto out; 614 615 sk->sk_bound_dev_if = ifindex; 616 if (sk->sk_prot->rehash) 617 sk->sk_prot->rehash(sk); 618 sk_dst_reset(sk); 619 620 ret = 0; 621 622 out: 623 #endif 624 625 return ret; 626 } 627 628 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 629 { 630 int ret; 631 632 if (lock_sk) 633 lock_sock(sk); 634 ret = sock_bindtoindex_locked(sk, ifindex); 635 if (lock_sk) 636 release_sock(sk); 637 638 return ret; 639 } 640 EXPORT_SYMBOL(sock_bindtoindex); 641 642 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 643 { 644 int ret = -ENOPROTOOPT; 645 #ifdef CONFIG_NETDEVICES 646 struct net *net = sock_net(sk); 647 char devname[IFNAMSIZ]; 648 int index; 649 650 ret = -EINVAL; 651 if (optlen < 0) 652 goto out; 653 654 /* Bind this socket to a particular device like "eth0", 655 * as specified in the passed interface name. If the 656 * name is "" or the option length is zero the socket 657 * is not bound. 
658 */ 659 if (optlen > IFNAMSIZ - 1) 660 optlen = IFNAMSIZ - 1; 661 memset(devname, 0, sizeof(devname)); 662 663 ret = -EFAULT; 664 if (copy_from_sockptr(devname, optval, optlen)) 665 goto out; 666 667 index = 0; 668 if (devname[0] != '\0') { 669 struct net_device *dev; 670 671 rcu_read_lock(); 672 dev = dev_get_by_name_rcu(net, devname); 673 if (dev) 674 index = dev->ifindex; 675 rcu_read_unlock(); 676 ret = -ENODEV; 677 if (!dev) 678 goto out; 679 } 680 681 return sock_bindtoindex(sk, index, true); 682 out: 683 #endif 684 685 return ret; 686 } 687 688 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 689 int __user *optlen, int len) 690 { 691 int ret = -ENOPROTOOPT; 692 #ifdef CONFIG_NETDEVICES 693 struct net *net = sock_net(sk); 694 char devname[IFNAMSIZ]; 695 696 if (sk->sk_bound_dev_if == 0) { 697 len = 0; 698 goto zero; 699 } 700 701 ret = -EINVAL; 702 if (len < IFNAMSIZ) 703 goto out; 704 705 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 706 if (ret) 707 goto out; 708 709 len = strlen(devname) + 1; 710 711 ret = -EFAULT; 712 if (copy_to_user(optval, devname, len)) 713 goto out; 714 715 zero: 716 ret = -EFAULT; 717 if (put_user(len, optlen)) 718 goto out; 719 720 ret = 0; 721 722 out: 723 #endif 724 725 return ret; 726 } 727 728 bool sk_mc_loop(struct sock *sk) 729 { 730 if (dev_recursion_level()) 731 return false; 732 if (!sk) 733 return true; 734 switch (sk->sk_family) { 735 case AF_INET: 736 return inet_sk(sk)->mc_loop; 737 #if IS_ENABLED(CONFIG_IPV6) 738 case AF_INET6: 739 return inet6_sk(sk)->mc_loop; 740 #endif 741 } 742 WARN_ON_ONCE(1); 743 return true; 744 } 745 EXPORT_SYMBOL(sk_mc_loop); 746 747 void sock_set_reuseaddr(struct sock *sk) 748 { 749 lock_sock(sk); 750 sk->sk_reuse = SK_CAN_REUSE; 751 release_sock(sk); 752 } 753 EXPORT_SYMBOL(sock_set_reuseaddr); 754 755 void sock_set_reuseport(struct sock *sk) 756 { 757 lock_sock(sk); 758 sk->sk_reuseport = true; 759 release_sock(sk); 760 } 761 EXPORT_SYMBOL(sock_set_reuseport); 762 763 void sock_no_linger(struct sock *sk) 764 { 765 lock_sock(sk); 766 sk->sk_lingertime = 0; 767 sock_set_flag(sk, SOCK_LINGER); 768 release_sock(sk); 769 } 770 EXPORT_SYMBOL(sock_no_linger); 771 772 void sock_set_priority(struct sock *sk, u32 priority) 773 { 774 lock_sock(sk); 775 sk->sk_priority = priority; 776 release_sock(sk); 777 } 778 EXPORT_SYMBOL(sock_set_priority); 779 780 void sock_set_sndtimeo(struct sock *sk, s64 secs) 781 { 782 lock_sock(sk); 783 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 784 sk->sk_sndtimeo = secs * HZ; 785 else 786 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 787 release_sock(sk); 788 } 789 EXPORT_SYMBOL(sock_set_sndtimeo); 790 791 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 792 { 793 if (val) { 794 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 795 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); 796 sock_set_flag(sk, SOCK_RCVTSTAMP); 797 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 798 } else { 799 sock_reset_flag(sk, SOCK_RCVTSTAMP); 800 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 801 } 802 } 803 804 void sock_enable_timestamps(struct sock *sk) 805 { 806 lock_sock(sk); 807 __sock_set_timestamps(sk, true, false, true); 808 release_sock(sk); 809 } 810 EXPORT_SYMBOL(sock_enable_timestamps); 811 812 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 813 { 814 switch (optname) { 815 case SO_TIMESTAMP_OLD: 816 __sock_set_timestamps(sk, valbool, false, false); 817 break; 818 case SO_TIMESTAMP_NEW: 819 __sock_set_timestamps(sk, valbool, true, false); 
820 break; 821 case SO_TIMESTAMPNS_OLD: 822 __sock_set_timestamps(sk, valbool, false, true); 823 break; 824 case SO_TIMESTAMPNS_NEW: 825 __sock_set_timestamps(sk, valbool, true, true); 826 break; 827 } 828 } 829 830 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 831 { 832 struct net *net = sock_net(sk); 833 struct net_device *dev = NULL; 834 bool match = false; 835 int *vclock_index; 836 int i, num; 837 838 if (sk->sk_bound_dev_if) 839 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 840 841 if (!dev) { 842 pr_err("%s: sock not bind to device\n", __func__); 843 return -EOPNOTSUPP; 844 } 845 846 num = ethtool_get_phc_vclocks(dev, &vclock_index); 847 dev_put(dev); 848 849 for (i = 0; i < num; i++) { 850 if (*(vclock_index + i) == phc_index) { 851 match = true; 852 break; 853 } 854 } 855 856 if (num > 0) 857 kfree(vclock_index); 858 859 if (!match) 860 return -EINVAL; 861 862 sk->sk_bind_phc = phc_index; 863 864 return 0; 865 } 866 867 int sock_set_timestamping(struct sock *sk, int optname, 868 struct so_timestamping timestamping) 869 { 870 int val = timestamping.flags; 871 int ret; 872 873 if (val & ~SOF_TIMESTAMPING_MASK) 874 return -EINVAL; 875 876 if (val & SOF_TIMESTAMPING_OPT_ID && 877 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 878 if (sk_is_tcp(sk)) { 879 if ((1 << sk->sk_state) & 880 (TCPF_CLOSE | TCPF_LISTEN)) 881 return -EINVAL; 882 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 883 } else { 884 atomic_set(&sk->sk_tskey, 0); 885 } 886 } 887 888 if (val & SOF_TIMESTAMPING_OPT_STATS && 889 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 890 return -EINVAL; 891 892 if (val & SOF_TIMESTAMPING_BIND_PHC) { 893 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 894 if (ret) 895 return ret; 896 } 897 898 sk->sk_tsflags = val; 899 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 900 901 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 902 sock_enable_timestamp(sk, 903 SOCK_TIMESTAMPING_RX_SOFTWARE); 904 else 905 sock_disable_timestamp(sk, 906 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 907 return 0; 908 } 909 910 void sock_set_keepalive(struct sock *sk) 911 { 912 lock_sock(sk); 913 if (sk->sk_prot->keepalive) 914 sk->sk_prot->keepalive(sk, true); 915 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 916 release_sock(sk); 917 } 918 EXPORT_SYMBOL(sock_set_keepalive); 919 920 static void __sock_set_rcvbuf(struct sock *sk, int val) 921 { 922 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 923 * as a negative value. 924 */ 925 val = min_t(int, val, INT_MAX / 2); 926 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 927 928 /* We double it on the way in to account for "struct sk_buff" etc. 929 * overhead. Applications assume that the SO_RCVBUF setting they make 930 * will allow that much actual data to be received on that socket. 931 * 932 * Applications are unaware that "struct sk_buff" and other overheads 933 * allocate from the receive buffer during socket buffer allocation. 934 * 935 * And after considering the possible alternatives, returning the value 936 * we actually used in getsockopt is the most desirable behavior. 
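	 *
	 * For example, assuming the requested size is within net.core.rmem_max:
	 * a setsockopt(SO_RCVBUF) request of 65536 stores 131072 in sk_rcvbuf,
	 * and a later getsockopt(SO_RCVBUF) reports 131072.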
937 */ 938 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 939 } 940 941 void sock_set_rcvbuf(struct sock *sk, int val) 942 { 943 lock_sock(sk); 944 __sock_set_rcvbuf(sk, val); 945 release_sock(sk); 946 } 947 EXPORT_SYMBOL(sock_set_rcvbuf); 948 949 static void __sock_set_mark(struct sock *sk, u32 val) 950 { 951 if (val != sk->sk_mark) { 952 sk->sk_mark = val; 953 sk_dst_reset(sk); 954 } 955 } 956 957 void sock_set_mark(struct sock *sk, u32 val) 958 { 959 lock_sock(sk); 960 __sock_set_mark(sk, val); 961 release_sock(sk); 962 } 963 EXPORT_SYMBOL(sock_set_mark); 964 965 static void sock_release_reserved_memory(struct sock *sk, int bytes) 966 { 967 /* Round down bytes to multiple of pages */ 968 bytes &= ~(SK_MEM_QUANTUM - 1); 969 970 WARN_ON(bytes > sk->sk_reserved_mem); 971 sk->sk_reserved_mem -= bytes; 972 sk_mem_reclaim(sk); 973 } 974 975 static int sock_reserve_memory(struct sock *sk, int bytes) 976 { 977 long allocated; 978 bool charged; 979 int pages; 980 981 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) 982 return -EOPNOTSUPP; 983 984 if (!bytes) 985 return 0; 986 987 pages = sk_mem_pages(bytes); 988 989 /* pre-charge to memcg */ 990 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 991 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 992 if (!charged) 993 return -ENOMEM; 994 995 /* pre-charge to forward_alloc */ 996 allocated = sk_memory_allocated_add(sk, pages); 997 /* If the system goes into memory pressure with this 998 * precharge, give up and return error. 999 */ 1000 if (allocated > sk_prot_mem_limits(sk, 1)) { 1001 sk_memory_allocated_sub(sk, pages); 1002 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 1003 return -ENOMEM; 1004 } 1005 sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; 1006 1007 sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; 1008 1009 return 0; 1010 } 1011 1012 /* 1013 * This is meant for all protocols to use and covers goings on 1014 * at the socket level. Everything here is generic. 1015 */ 1016 1017 int sock_setsockopt(struct socket *sock, int level, int optname, 1018 sockptr_t optval, unsigned int optlen) 1019 { 1020 struct so_timestamping timestamping; 1021 struct sock_txtime sk_txtime; 1022 struct sock *sk = sock->sk; 1023 int val; 1024 int valbool; 1025 struct linger ling; 1026 int ret = 0; 1027 1028 /* 1029 * Options without arguments 1030 */ 1031 1032 if (optname == SO_BINDTODEVICE) 1033 return sock_setbindtodevice(sk, optval, optlen); 1034 1035 if (optlen < sizeof(int)) 1036 return -EINVAL; 1037 1038 if (copy_from_sockptr(&val, optval, sizeof(val))) 1039 return -EFAULT; 1040 1041 valbool = val ? 1 : 0; 1042 1043 lock_sock(sk); 1044 1045 switch (optname) { 1046 case SO_DEBUG: 1047 if (val && !capable(CAP_NET_ADMIN)) 1048 ret = -EACCES; 1049 else 1050 sock_valbool_flag(sk, SOCK_DBG, valbool); 1051 break; 1052 case SO_REUSEADDR: 1053 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1054 break; 1055 case SO_REUSEPORT: 1056 sk->sk_reuseport = valbool; 1057 break; 1058 case SO_TYPE: 1059 case SO_PROTOCOL: 1060 case SO_DOMAIN: 1061 case SO_ERROR: 1062 ret = -ENOPROTOOPT; 1063 break; 1064 case SO_DONTROUTE: 1065 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1066 sk_dst_reset(sk); 1067 break; 1068 case SO_BROADCAST: 1069 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1070 break; 1071 case SO_SNDBUF: 1072 /* Don't error on this BSD doesn't and if you think 1073 * about it this is right. Otherwise apps have to 1074 * play 'guess the biggest size' games. 
RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); 1208 break; 1209 1210 case SO_RCVTIMEO_OLD: 1211 case SO_RCVTIMEO_NEW: 1212 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1213 optlen, optname == SO_RCVTIMEO_OLD); 1214 break; 1215 1216 case SO_SNDTIMEO_OLD: 1217 case SO_SNDTIMEO_NEW: 1218 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1219 optlen, optname == SO_SNDTIMEO_OLD); 1220 break; 1221 1222 case SO_ATTACH_FILTER: { 1223 struct sock_fprog fprog; 1224 1225 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1226 if (!ret) 1227 ret = sk_attach_filter(&fprog, sk); 1228 break; 1229 } 1230 case SO_ATTACH_BPF: 1231 ret = -EINVAL; 1232 if (optlen == sizeof(u32)) { 1233 u32 ufd; 1234 1235 ret = -EFAULT; 1236 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1237 break; 1238 1239 ret = sk_attach_bpf(ufd, sk); 1240 } 1241 break; 1242 1243 case SO_ATTACH_REUSEPORT_CBPF: { 1244 struct sock_fprog fprog; 1245 1246 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1247 if (!ret) 1248 ret = sk_reuseport_attach_filter(&fprog, sk); 1249 break; 1250 } 1251 case SO_ATTACH_REUSEPORT_EBPF: 1252 ret = -EINVAL; 1253 if (optlen == sizeof(u32)) { 1254 u32 ufd; 1255 1256 ret = -EFAULT; 1257 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1258 break; 1259 1260 ret = sk_reuseport_attach_bpf(ufd, sk); 1261 } 1262 break; 1263 1264 case SO_DETACH_REUSEPORT_BPF: 1265 ret = reuseport_detach_prog(sk); 1266 break; 1267 1268 case SO_DETACH_FILTER: 1269 ret = sk_detach_filter(sk); 1270 break; 1271 1272 case SO_LOCK_FILTER: 1273 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1274 ret = -EPERM; 1275 else 1276 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1277 break; 1278 1279 case SO_PASSSEC: 1280 if (valbool) 1281 set_bit(SOCK_PASSSEC, &sock->flags); 1282 else 1283 clear_bit(SOCK_PASSSEC, &sock->flags); 1284 break; 1285 case SO_MARK: 1286 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1287 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1288 ret = -EPERM; 1289 break; 1290 } 1291 1292 __sock_set_mark(sk, val); 1293 break; 1294 1295 case SO_RXQ_OVFL: 1296 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1297 break; 1298 1299 case SO_WIFI_STATUS: 1300 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1301 break; 1302 1303 case SO_PEEK_OFF: 1304 if (sock->ops->set_peek_off) 1305 ret = sock->ops->set_peek_off(sk, val); 1306 else 1307 ret = -EOPNOTSUPP; 1308 break; 1309 1310 case SO_NOFCS: 1311 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1312 break; 1313 1314 case SO_SELECT_ERR_QUEUE: 1315 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1316 break; 1317 1318 #ifdef CONFIG_NET_RX_BUSY_POLL 1319 case SO_BUSY_POLL: 1320 /* allow unprivileged users to decrease the value */ 1321 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1322 ret = -EPERM; 1323 else { 1324 if (val < 0) 1325 ret = -EINVAL; 1326 else 1327 WRITE_ONCE(sk->sk_ll_usec, val); 1328 } 1329 break; 1330 case SO_PREFER_BUSY_POLL: 1331 if (valbool && !capable(CAP_NET_ADMIN)) 1332 ret = -EPERM; 1333 else 1334 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1335 break; 1336 case SO_BUSY_POLL_BUDGET: 1337 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { 1338 ret = -EPERM; 1339 } else { 1340 if (val < 0 || val > U16_MAX) 1341 ret = -EINVAL; 1342 else 1343 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1344 } 1345 break; 1346 #endif 1347 1348 case SO_MAX_PACING_RATE: 1349 { 1350 unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; 1351 1352 if (sizeof(ulval) != sizeof(val) && 1353 optlen >= sizeof(ulval) && 1354 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1355 ret = -EFAULT; 1356 break; 1357 } 1358 if (ulval != ~0UL) 1359 cmpxchg(&sk->sk_pacing_status, 1360 SK_PACING_NONE, 1361 SK_PACING_NEEDED); 1362 sk->sk_max_pacing_rate = ulval; 1363 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval); 1364 break; 1365 } 1366 case SO_INCOMING_CPU: 1367 WRITE_ONCE(sk->sk_incoming_cpu, val); 1368 break; 1369 1370 case SO_CNX_ADVICE: 1371 if (val == 1) 1372 dst_negative_advice(sk); 1373 break; 1374 1375 case SO_ZEROCOPY: 1376 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1377 if (!(sk_is_tcp(sk) || 1378 (sk->sk_type == SOCK_DGRAM && 1379 sk->sk_protocol == IPPROTO_UDP))) 1380 ret = -ENOTSUPP; 1381 } else if (sk->sk_family != PF_RDS) { 1382 ret = -ENOTSUPP; 1383 } 1384 if (!ret) { 1385 if (val < 0 || val > 1) 1386 ret = -EINVAL; 1387 else 1388 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1389 } 1390 break; 1391 1392 case SO_TXTIME: 1393 if (optlen != sizeof(struct sock_txtime)) { 1394 ret = -EINVAL; 1395 break; 1396 } else if (copy_from_sockptr(&sk_txtime, optval, 1397 sizeof(struct sock_txtime))) { 1398 ret = -EFAULT; 1399 break; 1400 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1401 ret = -EINVAL; 1402 break; 1403 } 1404 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1405 * scheduler has enough safe guards. 1406 */ 1407 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1408 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1409 ret = -EPERM; 1410 break; 1411 } 1412 sock_valbool_flag(sk, SOCK_TXTIME, true); 1413 sk->sk_clockid = sk_txtime.clockid; 1414 sk->sk_txtime_deadline_mode = 1415 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1416 sk->sk_txtime_report_errors = 1417 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1418 break; 1419 1420 case SO_BINDTOIFINDEX: 1421 ret = sock_bindtoindex_locked(sk, val); 1422 break; 1423 1424 case SO_BUF_LOCK: 1425 if (val & ~SOCK_BUF_LOCK_MASK) { 1426 ret = -EINVAL; 1427 break; 1428 } 1429 sk->sk_userlocks = val | (sk->sk_userlocks & 1430 ~SOCK_BUF_LOCK_MASK); 1431 break; 1432 1433 case SO_RESERVE_MEM: 1434 { 1435 int delta; 1436 1437 if (val < 0) { 1438 ret = -EINVAL; 1439 break; 1440 } 1441 1442 delta = val - sk->sk_reserved_mem; 1443 if (delta < 0) 1444 sock_release_reserved_memory(sk, -delta); 1445 else 1446 ret = sock_reserve_memory(sk, delta); 1447 break; 1448 } 1449 1450 default: 1451 ret = -ENOPROTOOPT; 1452 break; 1453 } 1454 release_sock(sk); 1455 return ret; 1456 } 1457 EXPORT_SYMBOL(sock_setsockopt); 1458 1459 static const struct cred *sk_get_peer_cred(struct sock *sk) 1460 { 1461 const struct cred *cred; 1462 1463 spin_lock(&sk->sk_peer_lock); 1464 cred = get_cred(sk->sk_peer_cred); 1465 spin_unlock(&sk->sk_peer_lock); 1466 1467 return cred; 1468 } 1469 1470 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1471 struct ucred *ucred) 1472 { 1473 ucred->pid = pid_vnr(pid); 1474 ucred->uid = ucred->gid = -1; 1475 if (cred) { 1476 struct user_namespace *current_ns = current_user_ns(); 1477 1478 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1479 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1480 } 1481 } 1482 1483 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1484 { 1485 struct user_namespace *user_ns = current_user_ns(); 1486 int i; 1487 1488 for (i = 0; i < src->ngroups; i++) 1489 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1490 
return -EFAULT; 1491 1492 return 0; 1493 } 1494 1495 int sock_getsockopt(struct socket *sock, int level, int optname, 1496 char __user *optval, int __user *optlen) 1497 { 1498 struct sock *sk = sock->sk; 1499 1500 union { 1501 int val; 1502 u64 val64; 1503 unsigned long ulval; 1504 struct linger ling; 1505 struct old_timeval32 tm32; 1506 struct __kernel_old_timeval tm; 1507 struct __kernel_sock_timeval stm; 1508 struct sock_txtime txtime; 1509 struct so_timestamping timestamping; 1510 } v; 1511 1512 int lv = sizeof(int); 1513 int len; 1514 1515 if (get_user(len, optlen)) 1516 return -EFAULT; 1517 if (len < 0) 1518 return -EINVAL; 1519 1520 memset(&v, 0, sizeof(v)); 1521 1522 switch (optname) { 1523 case SO_DEBUG: 1524 v.val = sock_flag(sk, SOCK_DBG); 1525 break; 1526 1527 case SO_DONTROUTE: 1528 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1529 break; 1530 1531 case SO_BROADCAST: 1532 v.val = sock_flag(sk, SOCK_BROADCAST); 1533 break; 1534 1535 case SO_SNDBUF: 1536 v.val = sk->sk_sndbuf; 1537 break; 1538 1539 case SO_RCVBUF: 1540 v.val = sk->sk_rcvbuf; 1541 break; 1542 1543 case SO_REUSEADDR: 1544 v.val = sk->sk_reuse; 1545 break; 1546 1547 case SO_REUSEPORT: 1548 v.val = sk->sk_reuseport; 1549 break; 1550 1551 case SO_KEEPALIVE: 1552 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1553 break; 1554 1555 case SO_TYPE: 1556 v.val = sk->sk_type; 1557 break; 1558 1559 case SO_PROTOCOL: 1560 v.val = sk->sk_protocol; 1561 break; 1562 1563 case SO_DOMAIN: 1564 v.val = sk->sk_family; 1565 break; 1566 1567 case SO_ERROR: 1568 v.val = -sock_error(sk); 1569 if (v.val == 0) 1570 v.val = xchg(&sk->sk_err_soft, 0); 1571 break; 1572 1573 case SO_OOBINLINE: 1574 v.val = sock_flag(sk, SOCK_URGINLINE); 1575 break; 1576 1577 case SO_NO_CHECK: 1578 v.val = sk->sk_no_check_tx; 1579 break; 1580 1581 case SO_PRIORITY: 1582 v.val = sk->sk_priority; 1583 break; 1584 1585 case SO_LINGER: 1586 lv = sizeof(v.ling); 1587 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1588 v.ling.l_linger = sk->sk_lingertime / HZ; 1589 break; 1590 1591 case SO_BSDCOMPAT: 1592 break; 1593 1594 case SO_TIMESTAMP_OLD: 1595 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1596 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1597 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1598 break; 1599 1600 case SO_TIMESTAMPNS_OLD: 1601 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1602 break; 1603 1604 case SO_TIMESTAMP_NEW: 1605 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1606 break; 1607 1608 case SO_TIMESTAMPNS_NEW: 1609 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1610 break; 1611 1612 case SO_TIMESTAMPING_OLD: 1613 lv = sizeof(v.timestamping); 1614 v.timestamping.flags = sk->sk_tsflags; 1615 v.timestamping.bind_phc = sk->sk_bind_phc; 1616 break; 1617 1618 case SO_RCVTIMEO_OLD: 1619 case SO_RCVTIMEO_NEW: 1620 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname); 1621 break; 1622 1623 case SO_SNDTIMEO_OLD: 1624 case SO_SNDTIMEO_NEW: 1625 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname); 1626 break; 1627 1628 case SO_RCVLOWAT: 1629 v.val = sk->sk_rcvlowat; 1630 break; 1631 1632 case SO_SNDLOWAT: 1633 v.val = 1; 1634 break; 1635 1636 case SO_PASSCRED: 1637 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1638 break; 1639 1640 case SO_PEERCRED: 1641 { 1642 struct ucred peercred; 1643 if (len > sizeof(peercred)) 1644 len = sizeof(peercred); 1645 1646 spin_lock(&sk->sk_peer_lock); 1647 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1648 
spin_unlock(&sk->sk_peer_lock); 1649 1650 if (copy_to_user(optval, &peercred, len)) 1651 return -EFAULT; 1652 goto lenout; 1653 } 1654 1655 case SO_PEERGROUPS: 1656 { 1657 const struct cred *cred; 1658 int ret, n; 1659 1660 cred = sk_get_peer_cred(sk); 1661 if (!cred) 1662 return -ENODATA; 1663 1664 n = cred->group_info->ngroups; 1665 if (len < n * sizeof(gid_t)) { 1666 len = n * sizeof(gid_t); 1667 put_cred(cred); 1668 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1669 } 1670 len = n * sizeof(gid_t); 1671 1672 ret = groups_to_user((gid_t __user *)optval, cred->group_info); 1673 put_cred(cred); 1674 if (ret) 1675 return ret; 1676 goto lenout; 1677 } 1678 1679 case SO_PEERNAME: 1680 { 1681 char address[128]; 1682 1683 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1684 if (lv < 0) 1685 return -ENOTCONN; 1686 if (lv < len) 1687 return -EINVAL; 1688 if (copy_to_user(optval, address, len)) 1689 return -EFAULT; 1690 goto lenout; 1691 } 1692 1693 /* Dubious BSD thing... Probably nobody even uses it, but 1694 * the UNIX standard wants it for whatever reason... -DaveM 1695 */ 1696 case SO_ACCEPTCONN: 1697 v.val = sk->sk_state == TCP_LISTEN; 1698 break; 1699 1700 case SO_PASSSEC: 1701 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1702 break; 1703 1704 case SO_PEERSEC: 1705 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1706 1707 case SO_MARK: 1708 v.val = sk->sk_mark; 1709 break; 1710 1711 case SO_RXQ_OVFL: 1712 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1713 break; 1714 1715 case SO_WIFI_STATUS: 1716 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1717 break; 1718 1719 case SO_PEEK_OFF: 1720 if (!sock->ops->set_peek_off) 1721 return -EOPNOTSUPP; 1722 1723 v.val = sk->sk_peek_off; 1724 break; 1725 case SO_NOFCS: 1726 v.val = sock_flag(sk, SOCK_NOFCS); 1727 break; 1728 1729 case SO_BINDTODEVICE: 1730 return sock_getbindtodevice(sk, optval, optlen, len); 1731 1732 case SO_GET_FILTER: 1733 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1734 if (len < 0) 1735 return len; 1736 1737 goto lenout; 1738 1739 case SO_LOCK_FILTER: 1740 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1741 break; 1742 1743 case SO_BPF_EXTENSIONS: 1744 v.val = bpf_tell_extensions(); 1745 break; 1746 1747 case SO_SELECT_ERR_QUEUE: 1748 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1749 break; 1750 1751 #ifdef CONFIG_NET_RX_BUSY_POLL 1752 case SO_BUSY_POLL: 1753 v.val = sk->sk_ll_usec; 1754 break; 1755 case SO_PREFER_BUSY_POLL: 1756 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 1757 break; 1758 #endif 1759 1760 case SO_MAX_PACING_RATE: 1761 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 1762 lv = sizeof(v.ulval); 1763 v.ulval = sk->sk_max_pacing_rate; 1764 } else { 1765 /* 32bit version */ 1766 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1767 } 1768 break; 1769 1770 case SO_INCOMING_CPU: 1771 v.val = READ_ONCE(sk->sk_incoming_cpu); 1772 break; 1773 1774 case SO_MEMINFO: 1775 { 1776 u32 meminfo[SK_MEMINFO_VARS]; 1777 1778 sk_get_meminfo(sk, meminfo); 1779 1780 len = min_t(unsigned int, len, sizeof(meminfo)); 1781 if (copy_to_user(optval, &meminfo, len)) 1782 return -EFAULT; 1783 1784 goto lenout; 1785 } 1786 1787 #ifdef CONFIG_NET_RX_BUSY_POLL 1788 case SO_INCOMING_NAPI_ID: 1789 v.val = READ_ONCE(sk->sk_napi_id); 1790 1791 /* aggregate non-NAPI IDs down to 0 */ 1792 if (v.val < MIN_NAPI_ID) 1793 v.val = 0; 1794 1795 break; 1796 #endif 1797 1798 case SO_COOKIE: 1799 lv = sizeof(u64); 1800 if (len < lv) 1801 return -EINVAL; 1802 v.val64 = 
sock_gen_cookie(sk); 1803 break; 1804 1805 case SO_ZEROCOPY: 1806 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1807 break; 1808 1809 case SO_TXTIME: 1810 lv = sizeof(v.txtime); 1811 v.txtime.clockid = sk->sk_clockid; 1812 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1813 SOF_TXTIME_DEADLINE_MODE : 0; 1814 v.txtime.flags |= sk->sk_txtime_report_errors ? 1815 SOF_TXTIME_REPORT_ERRORS : 0; 1816 break; 1817 1818 case SO_BINDTOIFINDEX: 1819 v.val = sk->sk_bound_dev_if; 1820 break; 1821 1822 case SO_NETNS_COOKIE: 1823 lv = sizeof(u64); 1824 if (len != lv) 1825 return -EINVAL; 1826 v.val64 = sock_net(sk)->net_cookie; 1827 break; 1828 1829 case SO_BUF_LOCK: 1830 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 1831 break; 1832 1833 case SO_RESERVE_MEM: 1834 v.val = sk->sk_reserved_mem; 1835 break; 1836 1837 default: 1838 /* We implement the SO_SNDLOWAT etc to not be settable 1839 * (1003.1g 7). 1840 */ 1841 return -ENOPROTOOPT; 1842 } 1843 1844 if (len > lv) 1845 len = lv; 1846 if (copy_to_user(optval, &v, len)) 1847 return -EFAULT; 1848 lenout: 1849 if (put_user(len, optlen)) 1850 return -EFAULT; 1851 return 0; 1852 } 1853 1854 /* 1855 * Initialize an sk_lock. 1856 * 1857 * (We also register the sk_lock with the lock validator.) 1858 */ 1859 static inline void sock_lock_init(struct sock *sk) 1860 { 1861 if (sk->sk_kern_sock) 1862 sock_lock_init_class_and_name( 1863 sk, 1864 af_family_kern_slock_key_strings[sk->sk_family], 1865 af_family_kern_slock_keys + sk->sk_family, 1866 af_family_kern_key_strings[sk->sk_family], 1867 af_family_kern_keys + sk->sk_family); 1868 else 1869 sock_lock_init_class_and_name( 1870 sk, 1871 af_family_slock_key_strings[sk->sk_family], 1872 af_family_slock_keys + sk->sk_family, 1873 af_family_key_strings[sk->sk_family], 1874 af_family_keys + sk->sk_family); 1875 } 1876 1877 /* 1878 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1879 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1880 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1881 */ 1882 static void sock_copy(struct sock *nsk, const struct sock *osk) 1883 { 1884 const struct proto *prot = READ_ONCE(osk->sk_prot); 1885 #ifdef CONFIG_SECURITY_NETWORK 1886 void *sptr = nsk->sk_security; 1887 #endif 1888 1889 /* If we move sk_tx_queue_mapping out of the private section, 1890 * we must check if sk_tx_queue_clear() is called after 1891 * sock_copy() in sk_clone_lock(). 
1892 */ 1893 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 1894 offsetof(struct sock, sk_dontcopy_begin) || 1895 offsetof(struct sock, sk_tx_queue_mapping) >= 1896 offsetof(struct sock, sk_dontcopy_end)); 1897 1898 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1899 1900 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1901 prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1902 1903 #ifdef CONFIG_SECURITY_NETWORK 1904 nsk->sk_security = sptr; 1905 security_sk_clone(osk, nsk); 1906 #endif 1907 } 1908 1909 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1910 int family) 1911 { 1912 struct sock *sk; 1913 struct kmem_cache *slab; 1914 1915 slab = prot->slab; 1916 if (slab != NULL) { 1917 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1918 if (!sk) 1919 return sk; 1920 if (want_init_on_alloc(priority)) 1921 sk_prot_clear_nulls(sk, prot->obj_size); 1922 } else 1923 sk = kmalloc(prot->obj_size, priority); 1924 1925 if (sk != NULL) { 1926 if (security_sk_alloc(sk, family, priority)) 1927 goto out_free; 1928 1929 if (!try_module_get(prot->owner)) 1930 goto out_free_sec; 1931 } 1932 1933 return sk; 1934 1935 out_free_sec: 1936 security_sk_free(sk); 1937 out_free: 1938 if (slab != NULL) 1939 kmem_cache_free(slab, sk); 1940 else 1941 kfree(sk); 1942 return NULL; 1943 } 1944 1945 static void sk_prot_free(struct proto *prot, struct sock *sk) 1946 { 1947 struct kmem_cache *slab; 1948 struct module *owner; 1949 1950 owner = prot->owner; 1951 slab = prot->slab; 1952 1953 cgroup_sk_free(&sk->sk_cgrp_data); 1954 mem_cgroup_sk_free(sk); 1955 security_sk_free(sk); 1956 if (slab != NULL) 1957 kmem_cache_free(slab, sk); 1958 else 1959 kfree(sk); 1960 module_put(owner); 1961 } 1962 1963 /** 1964 * sk_alloc - All socket objects are allocated here 1965 * @net: the applicable net namespace 1966 * @family: protocol family 1967 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1968 * @prot: struct proto associated with this new sock instance 1969 * @kern: is this to be a kernel socket? 1970 */ 1971 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1972 struct proto *prot, int kern) 1973 { 1974 struct sock *sk; 1975 1976 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1977 if (sk) { 1978 sk->sk_family = family; 1979 /* 1980 * See comment in struct sock definition to understand 1981 * why we need sk_prot_creator -acme 1982 */ 1983 sk->sk_prot = sk->sk_prot_creator = prot; 1984 sk->sk_kern_sock = kern; 1985 sock_lock_init(sk); 1986 sk->sk_net_refcnt = kern ? 0 : 1; 1987 if (likely(sk->sk_net_refcnt)) { 1988 get_net_track(net, &sk->ns_tracker, priority); 1989 sock_inuse_add(net, 1); 1990 } 1991 1992 sock_net_set(sk, net); 1993 refcount_set(&sk->sk_wmem_alloc, 1); 1994 1995 mem_cgroup_sk_alloc(sk); 1996 cgroup_sk_alloc(&sk->sk_cgrp_data); 1997 sock_update_classid(&sk->sk_cgrp_data); 1998 sock_update_netprioidx(&sk->sk_cgrp_data); 1999 sk_tx_queue_clear(sk); 2000 } 2001 2002 return sk; 2003 } 2004 EXPORT_SYMBOL(sk_alloc); 2005 2006 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2007 * grace period. This is the case for UDP sockets and TCP listeners. 
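 * (sk_destruct() checks this flag and defers the actual free through
 * call_rcu(); otherwise __sk_destruct() runs immediately.)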
2008 */ 2009 static void __sk_destruct(struct rcu_head *head) 2010 { 2011 struct sock *sk = container_of(head, struct sock, sk_rcu); 2012 struct sk_filter *filter; 2013 2014 if (sk->sk_destruct) 2015 sk->sk_destruct(sk); 2016 2017 filter = rcu_dereference_check(sk->sk_filter, 2018 refcount_read(&sk->sk_wmem_alloc) == 0); 2019 if (filter) { 2020 sk_filter_uncharge(sk, filter); 2021 RCU_INIT_POINTER(sk->sk_filter, NULL); 2022 } 2023 2024 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2025 2026 #ifdef CONFIG_BPF_SYSCALL 2027 bpf_sk_storage_free(sk); 2028 #endif 2029 2030 if (atomic_read(&sk->sk_omem_alloc)) 2031 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2032 __func__, atomic_read(&sk->sk_omem_alloc)); 2033 2034 if (sk->sk_frag.page) { 2035 put_page(sk->sk_frag.page); 2036 sk->sk_frag.page = NULL; 2037 } 2038 2039 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2040 put_cred(sk->sk_peer_cred); 2041 put_pid(sk->sk_peer_pid); 2042 2043 if (likely(sk->sk_net_refcnt)) 2044 put_net_track(sock_net(sk), &sk->ns_tracker); 2045 sk_prot_free(sk->sk_prot_creator, sk); 2046 } 2047 2048 void sk_destruct(struct sock *sk) 2049 { 2050 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2051 2052 WARN_ON_ONCE(!llist_empty(&sk->defer_list)); 2053 sk_defer_free_flush(sk); 2054 2055 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2056 reuseport_detach_sock(sk); 2057 use_call_rcu = true; 2058 } 2059 2060 if (use_call_rcu) 2061 call_rcu(&sk->sk_rcu, __sk_destruct); 2062 else 2063 __sk_destruct(&sk->sk_rcu); 2064 } 2065 2066 static void __sk_free(struct sock *sk) 2067 { 2068 if (likely(sk->sk_net_refcnt)) 2069 sock_inuse_add(sock_net(sk), -1); 2070 2071 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2072 sock_diag_broadcast_destroy(sk); 2073 else 2074 sk_destruct(sk); 2075 } 2076 2077 void sk_free(struct sock *sk) 2078 { 2079 /* 2080 * We subtract one from sk_wmem_alloc and can know if 2081 * some packets are still in some tx queue. 
2082 * If not null, sock_wfree() will call __sk_free(sk) later 2083 */ 2084 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2085 __sk_free(sk); 2086 } 2087 EXPORT_SYMBOL(sk_free); 2088 2089 static void sk_init_common(struct sock *sk) 2090 { 2091 skb_queue_head_init(&sk->sk_receive_queue); 2092 skb_queue_head_init(&sk->sk_write_queue); 2093 skb_queue_head_init(&sk->sk_error_queue); 2094 2095 rwlock_init(&sk->sk_callback_lock); 2096 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2097 af_rlock_keys + sk->sk_family, 2098 af_family_rlock_key_strings[sk->sk_family]); 2099 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2100 af_wlock_keys + sk->sk_family, 2101 af_family_wlock_key_strings[sk->sk_family]); 2102 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2103 af_elock_keys + sk->sk_family, 2104 af_family_elock_key_strings[sk->sk_family]); 2105 lockdep_set_class_and_name(&sk->sk_callback_lock, 2106 af_callback_keys + sk->sk_family, 2107 af_family_clock_key_strings[sk->sk_family]); 2108 } 2109 2110 /** 2111 * sk_clone_lock - clone a socket, and lock its clone 2112 * @sk: the socket to clone 2113 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2114 * 2115 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2116 */ 2117 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2118 { 2119 struct proto *prot = READ_ONCE(sk->sk_prot); 2120 struct sk_filter *filter; 2121 bool is_charged = true; 2122 struct sock *newsk; 2123 2124 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2125 if (!newsk) 2126 goto out; 2127 2128 sock_copy(newsk, sk); 2129 2130 newsk->sk_prot_creator = prot; 2131 2132 /* SANITY */ 2133 if (likely(newsk->sk_net_refcnt)) { 2134 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2135 sock_inuse_add(sock_net(newsk), 1); 2136 } 2137 sk_node_init(&newsk->sk_node); 2138 sock_lock_init(newsk); 2139 bh_lock_sock(newsk); 2140 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2141 newsk->sk_backlog.len = 0; 2142 2143 atomic_set(&newsk->sk_rmem_alloc, 0); 2144 2145 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2146 refcount_set(&newsk->sk_wmem_alloc, 1); 2147 2148 atomic_set(&newsk->sk_omem_alloc, 0); 2149 sk_init_common(newsk); 2150 2151 newsk->sk_dst_cache = NULL; 2152 newsk->sk_dst_pending_confirm = 0; 2153 newsk->sk_wmem_queued = 0; 2154 newsk->sk_forward_alloc = 0; 2155 newsk->sk_reserved_mem = 0; 2156 atomic_set(&newsk->sk_drops, 0); 2157 newsk->sk_send_head = NULL; 2158 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2159 atomic_set(&newsk->sk_zckey, 0); 2160 2161 sock_reset_flag(newsk, SOCK_DONE); 2162 2163 /* sk->sk_memcg will be populated at accept() time */ 2164 newsk->sk_memcg = NULL; 2165 2166 cgroup_sk_clone(&newsk->sk_cgrp_data); 2167 2168 rcu_read_lock(); 2169 filter = rcu_dereference(sk->sk_filter); 2170 if (filter != NULL) 2171 /* though it's an empty new sock, the charging may fail 2172 * if sysctl_optmem_max was changed between creation of 2173 * original socket and cloning 2174 */ 2175 is_charged = sk_filter_charge(newsk, filter); 2176 RCU_INIT_POINTER(newsk->sk_filter, filter); 2177 rcu_read_unlock(); 2178 2179 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2180 /* We need to make sure that we don't uncharge the new 2181 * socket if we couldn't charge it in the first place 2182 * as otherwise we uncharge the parent's filter. 
2183 */ 2184 if (!is_charged) 2185 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2186 sk_free_unlock_clone(newsk); 2187 newsk = NULL; 2188 goto out; 2189 } 2190 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2191 2192 if (bpf_sk_storage_clone(sk, newsk)) { 2193 sk_free_unlock_clone(newsk); 2194 newsk = NULL; 2195 goto out; 2196 } 2197 2198 /* Clear sk_user_data if parent had the pointer tagged 2199 * as not suitable for copying when cloning. 2200 */ 2201 if (sk_user_data_is_nocopy(newsk)) 2202 newsk->sk_user_data = NULL; 2203 2204 newsk->sk_err = 0; 2205 newsk->sk_err_soft = 0; 2206 newsk->sk_priority = 0; 2207 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2208 2209 /* Before updating sk_refcnt, we must commit prior changes to memory 2210 * (Documentation/RCU/rculist_nulls.rst for details) 2211 */ 2212 smp_wmb(); 2213 refcount_set(&newsk->sk_refcnt, 2); 2214 2215 /* Increment the counter in the same struct proto as the master 2216 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 2217 * is the same as sk->sk_prot->socks, as this field was copied 2218 * with memcpy). 2219 * 2220 * This _changes_ the previous behaviour, where 2221 * tcp_create_openreq_child always was incrementing the 2222 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 2223 * to be taken into account in all callers. -acme 2224 */ 2225 sk_refcnt_debug_inc(newsk); 2226 sk_set_socket(newsk, NULL); 2227 sk_tx_queue_clear(newsk); 2228 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2229 2230 if (newsk->sk_prot->sockets_allocated) 2231 sk_sockets_allocated_inc(newsk); 2232 2233 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2234 net_enable_timestamp(); 2235 out: 2236 return newsk; 2237 } 2238 EXPORT_SYMBOL_GPL(sk_clone_lock); 2239 2240 void sk_free_unlock_clone(struct sock *sk) 2241 { 2242 /* It is still raw copy of parent, so invalidate 2243 * destructor and make plain sk_free() */ 2244 sk->sk_destruct = NULL; 2245 bh_unlock_sock(sk); 2246 sk_free(sk); 2247 } 2248 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 2249 2250 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2251 { 2252 u32 max_segs = 1; 2253 2254 sk_dst_set(sk, dst); 2255 sk->sk_route_caps = dst->dev->features; 2256 if (sk_is_tcp(sk)) 2257 sk->sk_route_caps |= NETIF_F_GSO; 2258 if (sk->sk_route_caps & NETIF_F_GSO) 2259 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2260 if (unlikely(sk->sk_gso_disabled)) 2261 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2262 if (sk_can_gso(sk)) { 2263 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2264 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2265 } else { 2266 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2267 /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ 2268 sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); 2269 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2270 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); 2271 } 2272 } 2273 sk->sk_gso_max_segs = max_segs; 2274 } 2275 EXPORT_SYMBOL_GPL(sk_setup_caps); 2276 2277 /* 2278 * Simple resource managers for sockets. 2279 */ 2280 2281 2282 /* 2283 * Write buffer destructor automatically called from kfree_skb. 
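 * Each skb charged to a socket via skb_set_owner_w() added its truesize
 * to sk_wmem_alloc; here that memory is released, sk_write_space() is
 * invoked (unless the protocol manages its own write queue and sets
 * SOCK_USE_WRITE_QUEUE), and, if this was the last outstanding
 * reference, the deferred __sk_free() started by sk_free() is completed.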
2284 */ 2285 void sock_wfree(struct sk_buff *skb) 2286 { 2287 struct sock *sk = skb->sk; 2288 unsigned int len = skb->truesize; 2289 2290 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2291 /* 2292 * Keep a reference on sk_wmem_alloc, this will be released 2293 * after sk_write_space() call 2294 */ 2295 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2296 sk->sk_write_space(sk); 2297 len = 1; 2298 } 2299 /* 2300 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2301 * could not do because of in-flight packets 2302 */ 2303 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2304 __sk_free(sk); 2305 } 2306 EXPORT_SYMBOL(sock_wfree); 2307 2308 /* This variant of sock_wfree() is used by TCP, 2309 * since it sets SOCK_USE_WRITE_QUEUE. 2310 */ 2311 void __sock_wfree(struct sk_buff *skb) 2312 { 2313 struct sock *sk = skb->sk; 2314 2315 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2316 __sk_free(sk); 2317 } 2318 2319 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2320 { 2321 skb_orphan(skb); 2322 skb->sk = sk; 2323 #ifdef CONFIG_INET 2324 if (unlikely(!sk_fullsock(sk))) { 2325 skb->destructor = sock_edemux; 2326 sock_hold(sk); 2327 return; 2328 } 2329 #endif 2330 skb->destructor = sock_wfree; 2331 skb_set_hash_from_sk(skb, sk); 2332 /* 2333 * We used to take a refcount on sk, but following operation 2334 * is enough to guarantee sk_free() wont free this sock until 2335 * all in-flight packets are completed 2336 */ 2337 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2338 } 2339 EXPORT_SYMBOL(skb_set_owner_w); 2340 2341 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2342 { 2343 #ifdef CONFIG_TLS_DEVICE 2344 /* Drivers depend on in-order delivery for crypto offload, 2345 * partial orphan breaks out-of-order-OK logic. 2346 */ 2347 if (skb->decrypted) 2348 return false; 2349 #endif 2350 return (skb->destructor == sock_wfree || 2351 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2352 } 2353 2354 /* This helper is used by netem, as it can hold packets in its 2355 * delay queue. We want to allow the owner socket to send more 2356 * packets, as if they were already TX completed by a typical driver. 2357 * But we also want to keep skb->sk set because some packet schedulers 2358 * rely on it (sch_fq for example). 2359 */ 2360 void skb_orphan_partial(struct sk_buff *skb) 2361 { 2362 if (skb_is_tcp_pure_ack(skb)) 2363 return; 2364 2365 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2366 return; 2367 2368 skb_orphan(skb); 2369 } 2370 EXPORT_SYMBOL(skb_orphan_partial); 2371 2372 /* 2373 * Read buffer destructor automatically called from kfree_skb. 2374 */ 2375 void sock_rfree(struct sk_buff *skb) 2376 { 2377 struct sock *sk = skb->sk; 2378 unsigned int len = skb->truesize; 2379 2380 atomic_sub(len, &sk->sk_rmem_alloc); 2381 sk_mem_uncharge(sk, len); 2382 } 2383 EXPORT_SYMBOL(sock_rfree); 2384 2385 /* 2386 * Buffer destructor for skbs that are not used directly in read or write 2387 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2388 */ 2389 void sock_efree(struct sk_buff *skb) 2390 { 2391 sock_put(skb->sk); 2392 } 2393 EXPORT_SYMBOL(sock_efree); 2394 2395 /* Buffer destructor for prefetch/receive path where reference count may 2396 * not be held, e.g. for listen sockets. 
2397 */ 2398 #ifdef CONFIG_INET 2399 void sock_pfree(struct sk_buff *skb) 2400 { 2401 if (sk_is_refcounted(skb->sk)) 2402 sock_gen_put(skb->sk); 2403 } 2404 EXPORT_SYMBOL(sock_pfree); 2405 #endif /* CONFIG_INET */ 2406 2407 kuid_t sock_i_uid(struct sock *sk) 2408 { 2409 kuid_t uid; 2410 2411 read_lock_bh(&sk->sk_callback_lock); 2412 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2413 read_unlock_bh(&sk->sk_callback_lock); 2414 return uid; 2415 } 2416 EXPORT_SYMBOL(sock_i_uid); 2417 2418 unsigned long sock_i_ino(struct sock *sk) 2419 { 2420 unsigned long ino; 2421 2422 read_lock_bh(&sk->sk_callback_lock); 2423 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2424 read_unlock_bh(&sk->sk_callback_lock); 2425 return ino; 2426 } 2427 EXPORT_SYMBOL(sock_i_ino); 2428 2429 /* 2430 * Allocate a skb from the socket's send buffer. 2431 */ 2432 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2433 gfp_t priority) 2434 { 2435 if (force || 2436 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2437 struct sk_buff *skb = alloc_skb(size, priority); 2438 2439 if (skb) { 2440 skb_set_owner_w(skb, sk); 2441 return skb; 2442 } 2443 } 2444 return NULL; 2445 } 2446 EXPORT_SYMBOL(sock_wmalloc); 2447 2448 static void sock_ofree(struct sk_buff *skb) 2449 { 2450 struct sock *sk = skb->sk; 2451 2452 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2453 } 2454 2455 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2456 gfp_t priority) 2457 { 2458 struct sk_buff *skb; 2459 2460 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2461 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2462 sysctl_optmem_max) 2463 return NULL; 2464 2465 skb = alloc_skb(size, priority); 2466 if (!skb) 2467 return NULL; 2468 2469 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2470 skb->sk = sk; 2471 skb->destructor = sock_ofree; 2472 return skb; 2473 } 2474 2475 /* 2476 * Allocate a memory block from the socket's option memory buffer. 2477 */ 2478 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2479 { 2480 if ((unsigned int)size <= sysctl_optmem_max && 2481 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2482 void *mem; 2483 /* First do the add, to avoid the race if kmalloc 2484 * might sleep. 2485 */ 2486 atomic_add(size, &sk->sk_omem_alloc); 2487 mem = kmalloc(size, priority); 2488 if (mem) 2489 return mem; 2490 atomic_sub(size, &sk->sk_omem_alloc); 2491 } 2492 return NULL; 2493 } 2494 EXPORT_SYMBOL(sock_kmalloc); 2495 2496 /* Free an option memory block. Note, we actually want the inline 2497 * here as this allows gcc to detect the nullify and fold away the 2498 * condition entirely. 2499 */ 2500 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2501 const bool nullify) 2502 { 2503 if (WARN_ON_ONCE(!mem)) 2504 return; 2505 if (nullify) 2506 kfree_sensitive(mem); 2507 else 2508 kfree(mem); 2509 atomic_sub(size, &sk->sk_omem_alloc); 2510 } 2511 2512 void sock_kfree_s(struct sock *sk, void *mem, int size) 2513 { 2514 __sock_kfree_s(sk, mem, size, false); 2515 } 2516 EXPORT_SYMBOL(sock_kfree_s); 2517 2518 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2519 { 2520 __sock_kfree_s(sk, mem, size, true); 2521 } 2522 EXPORT_SYMBOL(sock_kzfree_s); 2523 2524 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2525 I think, these locks should be removed for datagram sockets. 
2526 */ 2527 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2528 { 2529 DEFINE_WAIT(wait); 2530 2531 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2532 for (;;) { 2533 if (!timeo) 2534 break; 2535 if (signal_pending(current)) 2536 break; 2537 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2538 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2539 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2540 break; 2541 if (sk->sk_shutdown & SEND_SHUTDOWN) 2542 break; 2543 if (sk->sk_err) 2544 break; 2545 timeo = schedule_timeout(timeo); 2546 } 2547 finish_wait(sk_sleep(sk), &wait); 2548 return timeo; 2549 } 2550 2551 2552 /* 2553 * Generic send/receive buffer handlers 2554 */ 2555 2556 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2557 unsigned long data_len, int noblock, 2558 int *errcode, int max_page_order) 2559 { 2560 struct sk_buff *skb; 2561 long timeo; 2562 int err; 2563 2564 timeo = sock_sndtimeo(sk, noblock); 2565 for (;;) { 2566 err = sock_error(sk); 2567 if (err != 0) 2568 goto failure; 2569 2570 err = -EPIPE; 2571 if (sk->sk_shutdown & SEND_SHUTDOWN) 2572 goto failure; 2573 2574 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2575 break; 2576 2577 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2578 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2579 err = -EAGAIN; 2580 if (!timeo) 2581 goto failure; 2582 if (signal_pending(current)) 2583 goto interrupted; 2584 timeo = sock_wait_for_wmem(sk, timeo); 2585 } 2586 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2587 errcode, sk->sk_allocation); 2588 if (skb) 2589 skb_set_owner_w(skb, sk); 2590 return skb; 2591 2592 interrupted: 2593 err = sock_intr_errno(timeo); 2594 failure: 2595 *errcode = err; 2596 return NULL; 2597 } 2598 EXPORT_SYMBOL(sock_alloc_send_pskb); 2599 2600 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2601 int noblock, int *errcode) 2602 { 2603 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2604 } 2605 EXPORT_SYMBOL(sock_alloc_send_skb); 2606 2607 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2608 struct sockcm_cookie *sockc) 2609 { 2610 u32 tsflags; 2611 2612 switch (cmsg->cmsg_type) { 2613 case SO_MARK: 2614 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2615 return -EPERM; 2616 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2617 return -EINVAL; 2618 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2619 break; 2620 case SO_TIMESTAMPING_OLD: 2621 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2622 return -EINVAL; 2623 2624 tsflags = *(u32 *)CMSG_DATA(cmsg); 2625 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2626 return -EINVAL; 2627 2628 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2629 sockc->tsflags |= tsflags; 2630 break; 2631 case SCM_TXTIME: 2632 if (!sock_flag(sk, SOCK_TXTIME)) 2633 return -EINVAL; 2634 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2635 return -EINVAL; 2636 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2637 break; 2638 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ 2639 case SCM_RIGHTS: 2640 case SCM_CREDENTIALS: 2641 break; 2642 default: 2643 return -EINVAL; 2644 } 2645 return 0; 2646 } 2647 EXPORT_SYMBOL(__sock_cmsg_send); 2648 2649 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2650 struct sockcm_cookie *sockc) 2651 { 2652 struct cmsghdr *cmsg; 2653 int ret; 2654 2655 for_each_cmsghdr(cmsg, msg) { 2656 if (!CMSG_OK(msg, cmsg)) 2657 return -EINVAL; 2658 if (cmsg->cmsg_level != SOL_SOCKET) 2659 continue; 2660 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2661 if (ret) 2662 return ret; 2663 } 2664 return 0; 2665 } 2666 EXPORT_SYMBOL(sock_cmsg_send); 2667 2668 static void sk_enter_memory_pressure(struct sock *sk) 2669 { 2670 if (!sk->sk_prot->enter_memory_pressure) 2671 return; 2672 2673 sk->sk_prot->enter_memory_pressure(sk); 2674 } 2675 2676 static void sk_leave_memory_pressure(struct sock *sk) 2677 { 2678 if (sk->sk_prot->leave_memory_pressure) { 2679 sk->sk_prot->leave_memory_pressure(sk); 2680 } else { 2681 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2682 2683 if (memory_pressure && READ_ONCE(*memory_pressure)) 2684 WRITE_ONCE(*memory_pressure, 0); 2685 } 2686 } 2687 2688 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2689 2690 /** 2691 * skb_page_frag_refill - check that a page_frag contains enough room 2692 * @sz: minimum size of the fragment we want to get 2693 * @pfrag: pointer to page_frag 2694 * @gfp: priority for memory allocation 2695 * 2696 * Note: While this allocator tries to use high order pages, there is 2697 * no guarantee that allocations succeed. Therefore, @sz MUST be 2698 * less or equal than PAGE_SIZE. 2699 */ 2700 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2701 { 2702 if (pfrag->page) { 2703 if (page_ref_count(pfrag->page) == 1) { 2704 pfrag->offset = 0; 2705 return true; 2706 } 2707 if (pfrag->offset + sz <= pfrag->size) 2708 return true; 2709 put_page(pfrag->page); 2710 } 2711 2712 pfrag->offset = 0; 2713 if (SKB_FRAG_PAGE_ORDER && 2714 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2715 /* Avoid direct reclaim but allow kswapd to wake */ 2716 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2717 __GFP_COMP | __GFP_NOWARN | 2718 __GFP_NORETRY, 2719 SKB_FRAG_PAGE_ORDER); 2720 if (likely(pfrag->page)) { 2721 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2722 return true; 2723 } 2724 } 2725 pfrag->page = alloc_page(gfp); 2726 if (likely(pfrag->page)) { 2727 pfrag->size = PAGE_SIZE; 2728 return true; 2729 } 2730 return false; 2731 } 2732 EXPORT_SYMBOL(skb_page_frag_refill); 2733 2734 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2735 { 2736 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2737 return true; 2738 2739 sk_enter_memory_pressure(sk); 2740 sk_stream_moderate_sndbuf(sk); 2741 return false; 2742 } 2743 EXPORT_SYMBOL(sk_page_frag_refill); 2744 2745 void __lock_sock(struct sock *sk) 2746 __releases(&sk->sk_lock.slock) 2747 __acquires(&sk->sk_lock.slock) 2748 { 2749 DEFINE_WAIT(wait); 2750 2751 for (;;) { 2752 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2753 TASK_UNINTERRUPTIBLE); 2754 spin_unlock_bh(&sk->sk_lock.slock); 2755 schedule(); 2756 spin_lock_bh(&sk->sk_lock.slock); 2757 if (!sock_owned_by_user(sk)) 2758 break; 2759 } 2760 finish_wait(&sk->sk_lock.wq, &wait); 2761 } 2762 2763 void __release_sock(struct sock *sk) 2764 __releases(&sk->sk_lock.slock) 2765 __acquires(&sk->sk_lock.slock) 2766 { 2767 struct sk_buff *skb, *next; 2768 2769 while ((skb = sk->sk_backlog.head) 
!= NULL) { 2770 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2771 2772 spin_unlock_bh(&sk->sk_lock.slock); 2773 2774 do { 2775 next = skb->next; 2776 prefetch(next); 2777 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2778 skb_mark_not_on_list(skb); 2779 sk_backlog_rcv(sk, skb); 2780 2781 cond_resched(); 2782 2783 skb = next; 2784 } while (skb != NULL); 2785 2786 spin_lock_bh(&sk->sk_lock.slock); 2787 } 2788 2789 /* 2790 * Doing the zeroing here guarantee we can not loop forever 2791 * while a wild producer attempts to flood us. 2792 */ 2793 sk->sk_backlog.len = 0; 2794 } 2795 2796 void __sk_flush_backlog(struct sock *sk) 2797 { 2798 spin_lock_bh(&sk->sk_lock.slock); 2799 __release_sock(sk); 2800 spin_unlock_bh(&sk->sk_lock.slock); 2801 } 2802 2803 /** 2804 * sk_wait_data - wait for data to arrive at sk_receive_queue 2805 * @sk: sock to wait on 2806 * @timeo: for how long 2807 * @skb: last skb seen on sk_receive_queue 2808 * 2809 * Now socket state including sk->sk_err is changed only under lock, 2810 * hence we may omit checks after joining wait queue. 2811 * We check receive queue before schedule() only as optimization; 2812 * it is very likely that release_sock() added new data. 2813 */ 2814 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2815 { 2816 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2817 int rc; 2818 2819 add_wait_queue(sk_sleep(sk), &wait); 2820 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2821 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2822 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2823 remove_wait_queue(sk_sleep(sk), &wait); 2824 return rc; 2825 } 2826 EXPORT_SYMBOL(sk_wait_data); 2827 2828 /** 2829 * __sk_mem_raise_allocated - increase memory_allocated 2830 * @sk: socket 2831 * @size: memory size to allocate 2832 * @amt: pages to allocate 2833 * @kind: allocation type 2834 * 2835 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2836 */ 2837 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2838 { 2839 struct proto *prot = sk->sk_prot; 2840 long allocated = sk_memory_allocated_add(sk, amt); 2841 bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; 2842 bool charged = true; 2843 2844 if (memcg_charge && 2845 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, 2846 gfp_memcg_charge()))) 2847 goto suppress_allocation; 2848 2849 /* Under limit. */ 2850 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2851 sk_leave_memory_pressure(sk); 2852 return 1; 2853 } 2854 2855 /* Under pressure. */ 2856 if (allocated > sk_prot_mem_limits(sk, 1)) 2857 sk_enter_memory_pressure(sk); 2858 2859 /* Over hard limit. 
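 * Beyond sk_prot_mem_limits(sk, 2) the charge is normally refused;
 * see suppress_allocation below, where only a stream sender that has
 * already filled its (moderated) sndbuf is still allowed to complete
 * the charge.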
*/ 2860 if (allocated > sk_prot_mem_limits(sk, 2)) 2861 goto suppress_allocation; 2862 2863 /* guarantee minimum buffer size under pressure */ 2864 if (kind == SK_MEM_RECV) { 2865 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2866 return 1; 2867 2868 } else { /* SK_MEM_SEND */ 2869 int wmem0 = sk_get_wmem0(sk, prot); 2870 2871 if (sk->sk_type == SOCK_STREAM) { 2872 if (sk->sk_wmem_queued < wmem0) 2873 return 1; 2874 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2875 return 1; 2876 } 2877 } 2878 2879 if (sk_has_memory_pressure(sk)) { 2880 u64 alloc; 2881 2882 if (!sk_under_memory_pressure(sk)) 2883 return 1; 2884 alloc = sk_sockets_allocated_read_positive(sk); 2885 if (sk_prot_mem_limits(sk, 2) > alloc * 2886 sk_mem_pages(sk->sk_wmem_queued + 2887 atomic_read(&sk->sk_rmem_alloc) + 2888 sk->sk_forward_alloc)) 2889 return 1; 2890 } 2891 2892 suppress_allocation: 2893 2894 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2895 sk_stream_moderate_sndbuf(sk); 2896 2897 /* Fail only if socket is _under_ its sndbuf. 2898 * In this case we cannot block, so that we have to fail. 2899 */ 2900 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 2901 /* Force charge with __GFP_NOFAIL */ 2902 if (memcg_charge && !charged) { 2903 mem_cgroup_charge_skmem(sk->sk_memcg, amt, 2904 gfp_memcg_charge() | __GFP_NOFAIL); 2905 } 2906 return 1; 2907 } 2908 } 2909 2910 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2911 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2912 2913 sk_memory_allocated_sub(sk, amt); 2914 2915 if (memcg_charge && charged) 2916 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2917 2918 return 0; 2919 } 2920 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2921 2922 /** 2923 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2924 * @sk: socket 2925 * @size: memory size to allocate 2926 * @kind: allocation type 2927 * 2928 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2929 * rmem allocation. This function assumes that protocols which have 2930 * memory_pressure use sk_wmem_queued as write buffer accounting. 
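 *
 * Callers normally go through the sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers; a minimal, purely illustrative send-side pattern is:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;
 *	sk_mem_charge(sk, skb->truesize);
 *
 * so that sk_forward_alloc is consumed first and only page-sized
 * top-ups end up calling into this function.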
2931 */ 2932 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2933 { 2934 int ret, amt = sk_mem_pages(size); 2935 2936 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2937 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2938 if (!ret) 2939 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2940 return ret; 2941 } 2942 EXPORT_SYMBOL(__sk_mem_schedule); 2943 2944 /** 2945 * __sk_mem_reduce_allocated - reclaim memory_allocated 2946 * @sk: socket 2947 * @amount: number of quanta 2948 * 2949 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2950 */ 2951 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2952 { 2953 sk_memory_allocated_sub(sk, amount); 2954 2955 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2956 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2957 2958 if (sk_under_memory_pressure(sk) && 2959 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2960 sk_leave_memory_pressure(sk); 2961 } 2962 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2963 2964 /** 2965 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2966 * @sk: socket 2967 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2968 */ 2969 void __sk_mem_reclaim(struct sock *sk, int amount) 2970 { 2971 amount >>= SK_MEM_QUANTUM_SHIFT; 2972 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2973 __sk_mem_reduce_allocated(sk, amount); 2974 } 2975 EXPORT_SYMBOL(__sk_mem_reclaim); 2976 2977 int sk_set_peek_off(struct sock *sk, int val) 2978 { 2979 sk->sk_peek_off = val; 2980 return 0; 2981 } 2982 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2983 2984 /* 2985 * Set of default routines for initialising struct proto_ops when 2986 * the protocol does not support a particular function. In certain 2987 * cases where it makes no sense for a protocol to have a "do nothing" 2988 * function, some default processing is provided. 
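 *
 * A protocol that, for instance, has no notion of connections could
 * (illustratively) wire its struct proto_ops up as
 *
 *	.listen     = sock_no_listen,
 *	.accept     = sock_no_accept,
 *	.socketpair = sock_no_socketpair,
 *	.mmap       = sock_no_mmap,
 *
 * so that each unsupported call cleanly returns -EOPNOTSUPP (or -ENODEV
 * for mmap, mirroring the error for a missing file ->mmap method).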
2989 */ 2990 2991 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2992 { 2993 return -EOPNOTSUPP; 2994 } 2995 EXPORT_SYMBOL(sock_no_bind); 2996 2997 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2998 int len, int flags) 2999 { 3000 return -EOPNOTSUPP; 3001 } 3002 EXPORT_SYMBOL(sock_no_connect); 3003 3004 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3005 { 3006 return -EOPNOTSUPP; 3007 } 3008 EXPORT_SYMBOL(sock_no_socketpair); 3009 3010 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 3011 bool kern) 3012 { 3013 return -EOPNOTSUPP; 3014 } 3015 EXPORT_SYMBOL(sock_no_accept); 3016 3017 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3018 int peer) 3019 { 3020 return -EOPNOTSUPP; 3021 } 3022 EXPORT_SYMBOL(sock_no_getname); 3023 3024 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3025 { 3026 return -EOPNOTSUPP; 3027 } 3028 EXPORT_SYMBOL(sock_no_ioctl); 3029 3030 int sock_no_listen(struct socket *sock, int backlog) 3031 { 3032 return -EOPNOTSUPP; 3033 } 3034 EXPORT_SYMBOL(sock_no_listen); 3035 3036 int sock_no_shutdown(struct socket *sock, int how) 3037 { 3038 return -EOPNOTSUPP; 3039 } 3040 EXPORT_SYMBOL(sock_no_shutdown); 3041 3042 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3043 { 3044 return -EOPNOTSUPP; 3045 } 3046 EXPORT_SYMBOL(sock_no_sendmsg); 3047 3048 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3049 { 3050 return -EOPNOTSUPP; 3051 } 3052 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3053 3054 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3055 int flags) 3056 { 3057 return -EOPNOTSUPP; 3058 } 3059 EXPORT_SYMBOL(sock_no_recvmsg); 3060 3061 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3062 { 3063 /* Mirror missing mmap method error code */ 3064 return -ENODEV; 3065 } 3066 EXPORT_SYMBOL(sock_no_mmap); 3067 3068 /* 3069 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3070 * various sock-based usage counts. 
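 * At present that amounts to refreshing the cgroup-derived netprio and
 * classid state of the underlying socket.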
3071 */ 3072 void __receive_sock(struct file *file) 3073 { 3074 struct socket *sock; 3075 3076 sock = sock_from_file(file); 3077 if (sock) { 3078 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3079 sock_update_classid(&sock->sk->sk_cgrp_data); 3080 } 3081 } 3082 3083 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 3084 { 3085 ssize_t res; 3086 struct msghdr msg = {.msg_flags = flags}; 3087 struct kvec iov; 3088 char *kaddr = kmap(page); 3089 iov.iov_base = kaddr + offset; 3090 iov.iov_len = size; 3091 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 3092 kunmap(page); 3093 return res; 3094 } 3095 EXPORT_SYMBOL(sock_no_sendpage); 3096 3097 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 3098 int offset, size_t size, int flags) 3099 { 3100 ssize_t res; 3101 struct msghdr msg = {.msg_flags = flags}; 3102 struct kvec iov; 3103 char *kaddr = kmap(page); 3104 3105 iov.iov_base = kaddr + offset; 3106 iov.iov_len = size; 3107 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 3108 kunmap(page); 3109 return res; 3110 } 3111 EXPORT_SYMBOL(sock_no_sendpage_locked); 3112 3113 /* 3114 * Default Socket Callbacks 3115 */ 3116 3117 static void sock_def_wakeup(struct sock *sk) 3118 { 3119 struct socket_wq *wq; 3120 3121 rcu_read_lock(); 3122 wq = rcu_dereference(sk->sk_wq); 3123 if (skwq_has_sleeper(wq)) 3124 wake_up_interruptible_all(&wq->wait); 3125 rcu_read_unlock(); 3126 } 3127 3128 static void sock_def_error_report(struct sock *sk) 3129 { 3130 struct socket_wq *wq; 3131 3132 rcu_read_lock(); 3133 wq = rcu_dereference(sk->sk_wq); 3134 if (skwq_has_sleeper(wq)) 3135 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3136 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 3137 rcu_read_unlock(); 3138 } 3139 3140 void sock_def_readable(struct sock *sk) 3141 { 3142 struct socket_wq *wq; 3143 3144 rcu_read_lock(); 3145 wq = rcu_dereference(sk->sk_wq); 3146 if (skwq_has_sleeper(wq)) 3147 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3148 EPOLLRDNORM | EPOLLRDBAND); 3149 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 3150 rcu_read_unlock(); 3151 } 3152 3153 static void sock_def_write_space(struct sock *sk) 3154 { 3155 struct socket_wq *wq; 3156 3157 rcu_read_lock(); 3158 3159 /* Do not wake up a writer until he can make "significant" 3160 * progress. 
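 * (Here: until no more than half of the send buffer is in use, which
 * is essentially the sock_writeable() test below.)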
--DaveM 3161 */ 3162 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 3163 wq = rcu_dereference(sk->sk_wq); 3164 if (skwq_has_sleeper(wq)) 3165 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3166 EPOLLWRNORM | EPOLLWRBAND); 3167 3168 /* Should agree with poll, otherwise some programs break */ 3169 if (sock_writeable(sk)) 3170 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3171 } 3172 3173 rcu_read_unlock(); 3174 } 3175 3176 static void sock_def_destruct(struct sock *sk) 3177 { 3178 } 3179 3180 void sk_send_sigurg(struct sock *sk) 3181 { 3182 if (sk->sk_socket && sk->sk_socket->file) 3183 if (send_sigurg(&sk->sk_socket->file->f_owner)) 3184 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3185 } 3186 EXPORT_SYMBOL(sk_send_sigurg); 3187 3188 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3189 unsigned long expires) 3190 { 3191 if (!mod_timer(timer, expires)) 3192 sock_hold(sk); 3193 } 3194 EXPORT_SYMBOL(sk_reset_timer); 3195 3196 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3197 { 3198 if (del_timer(timer)) 3199 __sock_put(sk); 3200 } 3201 EXPORT_SYMBOL(sk_stop_timer); 3202 3203 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3204 { 3205 if (del_timer_sync(timer)) 3206 __sock_put(sk); 3207 } 3208 EXPORT_SYMBOL(sk_stop_timer_sync); 3209 3210 void sock_init_data(struct socket *sock, struct sock *sk) 3211 { 3212 sk_init_common(sk); 3213 sk->sk_send_head = NULL; 3214 3215 timer_setup(&sk->sk_timer, NULL, 0); 3216 3217 sk->sk_allocation = GFP_KERNEL; 3218 sk->sk_rcvbuf = sysctl_rmem_default; 3219 sk->sk_sndbuf = sysctl_wmem_default; 3220 sk->sk_state = TCP_CLOSE; 3221 sk_set_socket(sk, sock); 3222 3223 sock_set_flag(sk, SOCK_ZAPPED); 3224 3225 if (sock) { 3226 sk->sk_type = sock->type; 3227 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3228 sock->sk = sk; 3229 sk->sk_uid = SOCK_INODE(sock)->i_uid; 3230 } else { 3231 RCU_INIT_POINTER(sk->sk_wq, NULL); 3232 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 3233 } 3234 3235 rwlock_init(&sk->sk_callback_lock); 3236 if (sk->sk_kern_sock) 3237 lockdep_set_class_and_name( 3238 &sk->sk_callback_lock, 3239 af_kern_callback_keys + sk->sk_family, 3240 af_family_kern_clock_key_strings[sk->sk_family]); 3241 else 3242 lockdep_set_class_and_name( 3243 &sk->sk_callback_lock, 3244 af_callback_keys + sk->sk_family, 3245 af_family_clock_key_strings[sk->sk_family]); 3246 3247 sk->sk_state_change = sock_def_wakeup; 3248 sk->sk_data_ready = sock_def_readable; 3249 sk->sk_write_space = sock_def_write_space; 3250 sk->sk_error_report = sock_def_error_report; 3251 sk->sk_destruct = sock_def_destruct; 3252 3253 sk->sk_frag.page = NULL; 3254 sk->sk_frag.offset = 0; 3255 sk->sk_peek_off = -1; 3256 3257 sk->sk_peer_pid = NULL; 3258 sk->sk_peer_cred = NULL; 3259 spin_lock_init(&sk->sk_peer_lock); 3260 3261 sk->sk_write_pending = 0; 3262 sk->sk_rcvlowat = 1; 3263 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3264 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3265 3266 sk->sk_stamp = SK_DEFAULT_STAMP; 3267 #if BITS_PER_LONG==32 3268 seqlock_init(&sk->sk_stamp_seq); 3269 #endif 3270 atomic_set(&sk->sk_zckey, 0); 3271 3272 #ifdef CONFIG_NET_RX_BUSY_POLL 3273 sk->sk_napi_id = 0; 3274 sk->sk_ll_usec = sysctl_net_busy_read; 3275 #endif 3276 3277 sk->sk_max_pacing_rate = ~0UL; 3278 sk->sk_pacing_rate = ~0UL; 3279 WRITE_ONCE(sk->sk_pacing_shift, 10); 3280 sk->sk_incoming_cpu = -1; 3281 3282 sk_rx_queue_clear(sk); 3283 /* 3284 * Before updating sk_refcnt, we must commit prior changes to memory 3285 * 
(Documentation/RCU/rculist_nulls.rst for details) 3286 */ 3287 smp_wmb(); 3288 refcount_set(&sk->sk_refcnt, 1); 3289 atomic_set(&sk->sk_drops, 0); 3290 } 3291 EXPORT_SYMBOL(sock_init_data); 3292 3293 void lock_sock_nested(struct sock *sk, int subclass) 3294 { 3295 /* The sk_lock has mutex_lock() semantics here. */ 3296 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3297 3298 might_sleep(); 3299 spin_lock_bh(&sk->sk_lock.slock); 3300 if (sock_owned_by_user_nocheck(sk)) 3301 __lock_sock(sk); 3302 sk->sk_lock.owned = 1; 3303 spin_unlock_bh(&sk->sk_lock.slock); 3304 } 3305 EXPORT_SYMBOL(lock_sock_nested); 3306 3307 void release_sock(struct sock *sk) 3308 { 3309 spin_lock_bh(&sk->sk_lock.slock); 3310 if (sk->sk_backlog.tail) 3311 __release_sock(sk); 3312 3313 /* Warning : release_cb() might need to release sk ownership, 3314 * ie call sock_release_ownership(sk) before us. 3315 */ 3316 if (sk->sk_prot->release_cb) 3317 sk->sk_prot->release_cb(sk); 3318 3319 sock_release_ownership(sk); 3320 if (waitqueue_active(&sk->sk_lock.wq)) 3321 wake_up(&sk->sk_lock.wq); 3322 spin_unlock_bh(&sk->sk_lock.slock); 3323 } 3324 EXPORT_SYMBOL(release_sock); 3325 3326 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3327 { 3328 might_sleep(); 3329 spin_lock_bh(&sk->sk_lock.slock); 3330 3331 if (!sock_owned_by_user_nocheck(sk)) { 3332 /* 3333 * Fast path return with bottom halves disabled and 3334 * sock::sk_lock.slock held. 3335 * 3336 * The 'mutex' is not contended and holding 3337 * sock::sk_lock.slock prevents all other lockers to 3338 * proceed so the corresponding unlock_sock_fast() can 3339 * avoid the slow path of release_sock() completely and 3340 * just release slock. 3341 * 3342 * From a semantical POV this is equivalent to 'acquiring' 3343 * the 'mutex', hence the corresponding lockdep 3344 * mutex_release() has to happen in the fast path of 3345 * unlock_sock_fast(). 
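 *
 * A typical caller therefore looks like (sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *	...
 *	unlock_sock_fast(sk, slow);
 *
 * where 'slow' records whether the slow path, and therefore full
 * release_sock() semantics, had to be used.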
3346 */ 3347 return false; 3348 } 3349 3350 __lock_sock(sk); 3351 sk->sk_lock.owned = 1; 3352 __acquire(&sk->sk_lock.slock); 3353 spin_unlock_bh(&sk->sk_lock.slock); 3354 return true; 3355 } 3356 EXPORT_SYMBOL(__lock_sock_fast); 3357 3358 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3359 bool timeval, bool time32) 3360 { 3361 struct sock *sk = sock->sk; 3362 struct timespec64 ts; 3363 3364 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3365 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3366 if (ts.tv_sec == -1) 3367 return -ENOENT; 3368 if (ts.tv_sec == 0) { 3369 ktime_t kt = ktime_get_real(); 3370 sock_write_timestamp(sk, kt); 3371 ts = ktime_to_timespec64(kt); 3372 } 3373 3374 if (timeval) 3375 ts.tv_nsec /= 1000; 3376 3377 #ifdef CONFIG_COMPAT_32BIT_TIME 3378 if (time32) 3379 return put_old_timespec32(&ts, userstamp); 3380 #endif 3381 #ifdef CONFIG_SPARC64 3382 /* beware of padding in sparc64 timeval */ 3383 if (timeval && !in_compat_syscall()) { 3384 struct __kernel_old_timeval __user tv = { 3385 .tv_sec = ts.tv_sec, 3386 .tv_usec = ts.tv_nsec, 3387 }; 3388 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3389 return -EFAULT; 3390 return 0; 3391 } 3392 #endif 3393 return put_timespec64(&ts, userstamp); 3394 } 3395 EXPORT_SYMBOL(sock_gettstamp); 3396 3397 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3398 { 3399 if (!sock_flag(sk, flag)) { 3400 unsigned long previous_flags = sk->sk_flags; 3401 3402 sock_set_flag(sk, flag); 3403 /* 3404 * we just set one of the two flags which require net 3405 * time stamping, but time stamping might have been on 3406 * already because of the other one 3407 */ 3408 if (sock_needs_netstamp(sk) && 3409 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3410 net_enable_timestamp(); 3411 } 3412 } 3413 3414 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3415 int level, int type) 3416 { 3417 struct sock_exterr_skb *serr; 3418 struct sk_buff *skb; 3419 int copied, err; 3420 3421 err = -EAGAIN; 3422 skb = sock_dequeue_err_skb(sk); 3423 if (skb == NULL) 3424 goto out; 3425 3426 copied = skb->len; 3427 if (copied > len) { 3428 msg->msg_flags |= MSG_TRUNC; 3429 copied = len; 3430 } 3431 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3432 if (err) 3433 goto out_free_skb; 3434 3435 sock_recv_timestamp(msg, sk, skb); 3436 3437 serr = SKB_EXT_ERR(skb); 3438 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3439 3440 msg->msg_flags |= MSG_ERRQUEUE; 3441 err = copied; 3442 3443 out_free_skb: 3444 kfree_skb(skb); 3445 out: 3446 return err; 3447 } 3448 EXPORT_SYMBOL(sock_recv_errqueue); 3449 3450 /* 3451 * Get a socket option on an socket. 3452 * 3453 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3454 * asynchronous errors should be reported by getsockopt. We assume 3455 * this means if you specify SO_ERROR (otherwise whats the point of it). 
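 * In practice that is what happens: SO_ERROR reports (and clears) the
 * pending sk_err, while richer asynchronous errors are delivered via
 * the error queue (see sock_recv_errqueue() above).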
3456 */
3457 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3458 char __user *optval, int __user *optlen)
3459 {
3460 struct sock *sk = sock->sk;
3461
3462 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3463 }
3464 EXPORT_SYMBOL(sock_common_getsockopt);
3465
3466 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3467 int flags)
3468 {
3469 struct sock *sk = sock->sk;
3470 int addr_len = 0;
3471 int err;
3472
3473 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3474 flags & ~MSG_DONTWAIT, &addr_len);
3475 if (err >= 0)
3476 msg->msg_namelen = addr_len;
3477 return err;
3478 }
3479 EXPORT_SYMBOL(sock_common_recvmsg);
3480
3481 /*
3482 * Set socket options on an inet socket.
3483 */
3484 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3485 sockptr_t optval, unsigned int optlen)
3486 {
3487 struct sock *sk = sock->sk;
3488
3489 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3490 }
3491 EXPORT_SYMBOL(sock_common_setsockopt);
3492
3493 void sk_common_release(struct sock *sk)
3494 {
3495 if (sk->sk_prot->destroy)
3496 sk->sk_prot->destroy(sk);
3497
3498 /*
3499 * Observation: when sk_common_release() is called, processes have
3500 * no access to the socket, but the network stack still does.
3501 * Step one, detach it from networking:
3502 *
3503 * A. Remove from hash tables.
3504 */
3505
3506 sk->sk_prot->unhash(sk);
3507
3508 /*
3509 * At this point the socket cannot receive new packets, but it is
3510 * possible that some packets are still in flight because some CPU ran
3511 * the receive path and did its hash table lookup before we unhashed
3512 * the socket. They will reach the receive queue and be purged by the socket destructor.
3513 *
3514 * Also we still have packets pending on the receive queue and, probably,
3515 * our own packets waiting in device queues. sock_destroy() will drain the
3516 * receive queue, but transmitted packets will delay socket destruction
3517 * until the last reference is released.
3518 */
3519
3520 sock_orphan(sk);
3521
3522 xfrm_sk_free_policy(sk);
3523
3524 sk_refcnt_debug_release(sk);
3525
3526 sock_put(sk);
3527 }
3528 EXPORT_SYMBOL(sk_common_release);
3529
3530 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3531 {
3532 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3533
3534 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3535 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3536 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3537 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3538 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3539 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3540 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3541 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3542 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3543 }
3544
3545 #ifdef CONFIG_PROC_FS
3546 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3547
3548 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3549 {
3550 int cpu, idx = prot->inuse_idx;
3551 int res = 0;
3552
3553 for_each_possible_cpu(cpu)
3554 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3555
3556 return res >= 0 ?
res : 0; 3557 } 3558 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3559 3560 int sock_inuse_get(struct net *net) 3561 { 3562 int cpu, res = 0; 3563 3564 for_each_possible_cpu(cpu) 3565 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 3566 3567 return res; 3568 } 3569 3570 EXPORT_SYMBOL_GPL(sock_inuse_get); 3571 3572 static int __net_init sock_inuse_init_net(struct net *net) 3573 { 3574 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3575 if (net->core.prot_inuse == NULL) 3576 return -ENOMEM; 3577 return 0; 3578 } 3579 3580 static void __net_exit sock_inuse_exit_net(struct net *net) 3581 { 3582 free_percpu(net->core.prot_inuse); 3583 } 3584 3585 static struct pernet_operations net_inuse_ops = { 3586 .init = sock_inuse_init_net, 3587 .exit = sock_inuse_exit_net, 3588 }; 3589 3590 static __init int net_inuse_init(void) 3591 { 3592 if (register_pernet_subsys(&net_inuse_ops)) 3593 panic("Cannot initialize net inuse counters"); 3594 3595 return 0; 3596 } 3597 3598 core_initcall(net_inuse_init); 3599 3600 static int assign_proto_idx(struct proto *prot) 3601 { 3602 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3603 3604 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3605 pr_err("PROTO_INUSE_NR exhausted\n"); 3606 return -ENOSPC; 3607 } 3608 3609 set_bit(prot->inuse_idx, proto_inuse_idx); 3610 return 0; 3611 } 3612 3613 static void release_proto_idx(struct proto *prot) 3614 { 3615 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3616 clear_bit(prot->inuse_idx, proto_inuse_idx); 3617 } 3618 #else 3619 static inline int assign_proto_idx(struct proto *prot) 3620 { 3621 return 0; 3622 } 3623 3624 static inline void release_proto_idx(struct proto *prot) 3625 { 3626 } 3627 3628 #endif 3629 3630 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3631 { 3632 if (!twsk_prot) 3633 return; 3634 kfree(twsk_prot->twsk_slab_name); 3635 twsk_prot->twsk_slab_name = NULL; 3636 kmem_cache_destroy(twsk_prot->twsk_slab); 3637 twsk_prot->twsk_slab = NULL; 3638 } 3639 3640 static int tw_prot_init(const struct proto *prot) 3641 { 3642 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3643 3644 if (!twsk_prot) 3645 return 0; 3646 3647 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3648 prot->name); 3649 if (!twsk_prot->twsk_slab_name) 3650 return -ENOMEM; 3651 3652 twsk_prot->twsk_slab = 3653 kmem_cache_create(twsk_prot->twsk_slab_name, 3654 twsk_prot->twsk_obj_size, 0, 3655 SLAB_ACCOUNT | prot->slab_flags, 3656 NULL); 3657 if (!twsk_prot->twsk_slab) { 3658 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3659 prot->name); 3660 return -ENOMEM; 3661 } 3662 3663 return 0; 3664 } 3665 3666 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3667 { 3668 if (!rsk_prot) 3669 return; 3670 kfree(rsk_prot->slab_name); 3671 rsk_prot->slab_name = NULL; 3672 kmem_cache_destroy(rsk_prot->slab); 3673 rsk_prot->slab = NULL; 3674 } 3675 3676 static int req_prot_init(const struct proto *prot) 3677 { 3678 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3679 3680 if (!rsk_prot) 3681 return 0; 3682 3683 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3684 prot->name); 3685 if (!rsk_prot->slab_name) 3686 return -ENOMEM; 3687 3688 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3689 rsk_prot->obj_size, 0, 3690 SLAB_ACCOUNT | prot->slab_flags, 3691 NULL); 3692 3693 if (!rsk_prot->slab) { 3694 pr_crit("%s: Can't create request sock SLAB cache!\n", 3695 prot->name); 3696 return -ENOMEM; 3697 } 3698 return 0; 3699 } 3700 3701 int 
proto_register(struct proto *prot, int alloc_slab) 3702 { 3703 int ret = -ENOBUFS; 3704 3705 if (alloc_slab) { 3706 prot->slab = kmem_cache_create_usercopy(prot->name, 3707 prot->obj_size, 0, 3708 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3709 prot->slab_flags, 3710 prot->useroffset, prot->usersize, 3711 NULL); 3712 3713 if (prot->slab == NULL) { 3714 pr_crit("%s: Can't create sock SLAB cache!\n", 3715 prot->name); 3716 goto out; 3717 } 3718 3719 if (req_prot_init(prot)) 3720 goto out_free_request_sock_slab; 3721 3722 if (tw_prot_init(prot)) 3723 goto out_free_timewait_sock_slab; 3724 } 3725 3726 mutex_lock(&proto_list_mutex); 3727 ret = assign_proto_idx(prot); 3728 if (ret) { 3729 mutex_unlock(&proto_list_mutex); 3730 goto out_free_timewait_sock_slab; 3731 } 3732 list_add(&prot->node, &proto_list); 3733 mutex_unlock(&proto_list_mutex); 3734 return ret; 3735 3736 out_free_timewait_sock_slab: 3737 if (alloc_slab) 3738 tw_prot_cleanup(prot->twsk_prot); 3739 out_free_request_sock_slab: 3740 if (alloc_slab) { 3741 req_prot_cleanup(prot->rsk_prot); 3742 3743 kmem_cache_destroy(prot->slab); 3744 prot->slab = NULL; 3745 } 3746 out: 3747 return ret; 3748 } 3749 EXPORT_SYMBOL(proto_register); 3750 3751 void proto_unregister(struct proto *prot) 3752 { 3753 mutex_lock(&proto_list_mutex); 3754 release_proto_idx(prot); 3755 list_del(&prot->node); 3756 mutex_unlock(&proto_list_mutex); 3757 3758 kmem_cache_destroy(prot->slab); 3759 prot->slab = NULL; 3760 3761 req_prot_cleanup(prot->rsk_prot); 3762 tw_prot_cleanup(prot->twsk_prot); 3763 } 3764 EXPORT_SYMBOL(proto_unregister); 3765 3766 int sock_load_diag_module(int family, int protocol) 3767 { 3768 if (!protocol) { 3769 if (!sock_is_registered(family)) 3770 return -ENOENT; 3771 3772 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3773 NETLINK_SOCK_DIAG, family); 3774 } 3775 3776 #ifdef CONFIG_INET 3777 if (family == AF_INET && 3778 protocol != IPPROTO_RAW && 3779 protocol < MAX_INET_PROTOS && 3780 !rcu_access_pointer(inet_protos[protocol])) 3781 return -ENOENT; 3782 #endif 3783 3784 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3785 NETLINK_SOCK_DIAG, family, protocol); 3786 } 3787 EXPORT_SYMBOL(sock_load_diag_module); 3788 3789 #ifdef CONFIG_PROC_FS 3790 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3791 __acquires(proto_list_mutex) 3792 { 3793 mutex_lock(&proto_list_mutex); 3794 return seq_list_start_head(&proto_list, *pos); 3795 } 3796 3797 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3798 { 3799 return seq_list_next(v, &proto_list, pos); 3800 } 3801 3802 static void proto_seq_stop(struct seq_file *seq, void *v) 3803 __releases(proto_list_mutex) 3804 { 3805 mutex_unlock(&proto_list_mutex); 3806 } 3807 3808 static char proto_method_implemented(const void *method) 3809 { 3810 return method == NULL ? 'n' : 'y'; 3811 } 3812 static long sock_prot_memory_allocated(struct proto *proto) 3813 { 3814 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3815 } 3816 3817 static const char *sock_prot_memory_pressure(struct proto *proto) 3818 { 3819 return proto->memory_pressure != NULL ? 3820 proto_memory_pressure(proto) ? 
"yes" : "no" : "NI"; 3821 } 3822 3823 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3824 { 3825 3826 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3827 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3828 proto->name, 3829 proto->obj_size, 3830 sock_prot_inuse_get(seq_file_net(seq), proto), 3831 sock_prot_memory_allocated(proto), 3832 sock_prot_memory_pressure(proto), 3833 proto->max_header, 3834 proto->slab == NULL ? "no" : "yes", 3835 module_name(proto->owner), 3836 proto_method_implemented(proto->close), 3837 proto_method_implemented(proto->connect), 3838 proto_method_implemented(proto->disconnect), 3839 proto_method_implemented(proto->accept), 3840 proto_method_implemented(proto->ioctl), 3841 proto_method_implemented(proto->init), 3842 proto_method_implemented(proto->destroy), 3843 proto_method_implemented(proto->shutdown), 3844 proto_method_implemented(proto->setsockopt), 3845 proto_method_implemented(proto->getsockopt), 3846 proto_method_implemented(proto->sendmsg), 3847 proto_method_implemented(proto->recvmsg), 3848 proto_method_implemented(proto->sendpage), 3849 proto_method_implemented(proto->bind), 3850 proto_method_implemented(proto->backlog_rcv), 3851 proto_method_implemented(proto->hash), 3852 proto_method_implemented(proto->unhash), 3853 proto_method_implemented(proto->get_port), 3854 proto_method_implemented(proto->enter_memory_pressure)); 3855 } 3856 3857 static int proto_seq_show(struct seq_file *seq, void *v) 3858 { 3859 if (v == &proto_list) 3860 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3861 "protocol", 3862 "size", 3863 "sockets", 3864 "memory", 3865 "press", 3866 "maxhdr", 3867 "slab", 3868 "module", 3869 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3870 else 3871 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3872 return 0; 3873 } 3874 3875 static const struct seq_operations proto_seq_ops = { 3876 .start = proto_seq_start, 3877 .next = proto_seq_next, 3878 .stop = proto_seq_stop, 3879 .show = proto_seq_show, 3880 }; 3881 3882 static __net_init int proto_init_net(struct net *net) 3883 { 3884 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3885 sizeof(struct seq_net_private))) 3886 return -ENOMEM; 3887 3888 return 0; 3889 } 3890 3891 static __net_exit void proto_exit_net(struct net *net) 3892 { 3893 remove_proc_entry("protocols", net->proc_net); 3894 } 3895 3896 3897 static __net_initdata struct pernet_operations proto_net_ops = { 3898 .init = proto_init_net, 3899 .exit = proto_exit_net, 3900 }; 3901 3902 static int __init proto_init(void) 3903 { 3904 return register_pernet_subsys(&proto_net_ops); 3905 } 3906 3907 subsys_initcall(proto_init); 3908 3909 #endif /* PROC_FS */ 3910 3911 #ifdef CONFIG_NET_RX_BUSY_POLL 3912 bool sk_busy_loop_end(void *p, unsigned long start_time) 3913 { 3914 struct sock *sk = p; 3915 3916 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3917 sk_busy_loop_timeout(sk, start_time); 3918 } 3919 EXPORT_SYMBOL(sk_busy_loop_end); 3920 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3921 3922 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3923 { 3924 if (!sk->sk_prot->bind_add) 3925 return -EOPNOTSUPP; 3926 return sk->sk_prot->bind_add(sk, addr, addr_len); 3927 } 3928 EXPORT_SYMBOL(sock_bind_add); 3929