// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in the
 * user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap in all
 * user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap over
 * the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
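
/*
 * Illustrative sketch (not part of this file): a protocol-level handler for a
 * hypothetical privileged option "foo" might gate the setting on the socket
 * opener's credentials like so. foo_set_privileged_opt() is a made-up name
 * used only for illustration.
 *
 *	static int foo_set_privileged_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 *
 * sk_net_capable() requires both that the opener of the socket had the
 * capability and that the current task has it in the socket's network
 * namespace, which is the usual check for namespaced privileged options.
 */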

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"  ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"     ,	x "AF_XDP"      , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
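
/*
 * Illustrative sketch (assumption: a transport that backs swap over the
 * network, in the style of nbd or NFS swap): such a user would typically mark
 * its transport socket once at setup time and clear the flag on teardown:
 *
 *	sk_set_memalloc(sock->sk);	allow dipping into memory reserves
 *	...
 *	sk_clear_memalloc(sock->sk);	reclaim and re-apply rmem limits
 *
 * Packets for SOCK_MEMALLOC sockets are funneled through __sk_backlog_rcv(),
 * which temporarily disables memory reclaim around sk_backlog_rcv().
 */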

static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}

static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from the RCU-protected region, make sure we don't leak
	 * a non-refcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
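
/*
 * Illustrative sketch (hypothetical protocol, not part of this file): a
 * simple datagram protocol's receive path could hand packets to the generic
 * queueing helper and drop on failure:
 *
 *	static int foo_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 *
 * sock_queue_rcv_skb() runs the socket filter first and then charges the skb
 * against sk_rcvbuf via __sock_queue_rcv_skb(), returning -ENOMEM or -ENOBUFS
 * when the receive budget is exhausted.
 */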

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							   u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	return sock_bindtoindex(sk, index, true);
out:
#endif

	return ret;
}

static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_lingertime = 0;
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);

void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	lock_sock(sk);
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		sk->sk_sndtimeo = secs * HZ;
	else
		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val)  {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
	}
}

void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);
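
/*
 * Illustrative sketch: the lock-taking helpers above exist so that in-kernel
 * socket users do not have to go through sock_setsockopt() with fake user
 * pointers. A hypothetical kernel consumer setting up an outgoing TCP socket
 * might do:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 *	sock_set_keepalive(sock->sk);
 *
 * Each helper takes and releases the socket lock internally, so none of them
 * may be called with the socket lock already held.
 */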

static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.   Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
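
/*
 * Illustrative sketch (userspace view, assuming a connected fd): because of
 * the doubling described above, the value read back is roughly twice the
 * value requested, and never below SOCK_MIN_RCVBUF:
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	got is expected to be about 2 * req
 *
 * Requests above net.core.rmem_max are clamped unless SO_RCVBUFFORCE is used
 * by a CAP_NET_ADMIN caller.
 */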

static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		sk->sk_mark = val;
		sk_dst_reset(sk);
	}
}

void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    sockptr_t optval, unsigned int optlen)
{
	struct sock_txtime sk_txtime;
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_setbindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (copy_from_sockptr(&val, optval, sizeof(val)))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_REUSEPORT:
		sk->sk_reuseport = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		sk_dst_reset(sk);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		/* Ensure val * 2 fits into an int, to prevent max_t()
		 * from treating it as a negative value.
		 */
		val = min_t(int, val, INT_MAX / 2);
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		WRITE_ONCE(sk->sk_sndbuf,
			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		if (val < 0)
			val = 0;
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints
		 */
		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		/* No negative values (to prevent underflow, as val will be
		 * multiplied by 2).
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}

		if (val & SOF_TIMESTAMPING_OPT_ID &&
		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
			if (sk->sk_protocol == IPPROTO_TCP &&
			    sk->sk_type == SOCK_STREAM) {
				if ((1 << sk->sk_state) &
				    (TCPF_CLOSE | TCPF_LISTEN)) {
					ret = -EINVAL;
					break;
				}
				sk->sk_tskey = tcp_sk(sk)->snd_una;
			} else {
				sk->sk_tskey = 0;
			}
		}

		if (val & SOF_TIMESTAMPING_OPT_STATS &&
		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
			ret = -EINVAL;
			break;
		}

		sk->sk_tsflags = val;
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
				       optlen, optname == SO_RCVTIMEO_OLD);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
				       optlen, optname == SO_SNDTIMEO_OLD);
		break;

	case SO_ATTACH_FILTER: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_attach_bpf(ufd, sk);
		}
		break;

	case SO_ATTACH_REUSEPORT_CBPF: {
		struct sock_fprog fprog;

		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
		if (!ret)
			ret = sk_reuseport_attach_filter(&fprog, sk);
		break;
	}
	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_REUSEPORT_BPF:
		ret = reuseport_detach_prog(sk);
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}

		__sock_set_mark(sk, val);
		break;

	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		break;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else {
			if (val < 0 || val > U16_MAX)
				ret = -EINVAL;
			else
				WRITE_ONCE(sk->sk_busy_poll_budget, val);
		}
		break;
#endif

	case SO_MAX_PACING_RATE:
		{
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			ret = -EFAULT;
			break;
		}
		if (ulval != ~0UL)
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = ulval;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
		break;
		}
	case SO_INCOMING_CPU:
		WRITE_ONCE(sk->sk_incoming_cpu, val);
		break;

	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		break;

	case SO_TXTIME:
		if (optlen != sizeof(struct sock_txtime)) {
			ret = -EINVAL;
			break;
		} else if (copy_from_sockptr(&sk_txtime, optval,
			   sizeof(struct sock_txtime))) {
			ret = -EFAULT;
			break;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
			break;
		}
		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
		 * scheduler has enough safeguards.
		 */
		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		sock_valbool_flag(sk, SOCK_TXTIME, true);
		sk->sk_clockid = sk_txtime.clockid;
		sk->sk_txtime_deadline_mode =
			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
		sk->sk_txtime_report_errors =
			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
		break;

	case SO_BINDTOIFINDEX:
		ret = sock_bindtoindex_locked(sk, val);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
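
/*
 * Illustrative sketch (hypothetical option, not in the switch above): optval
 * is a sockptr_t, so handlers copy structured arguments with
 * copy_from_sockptr(), which transparently handles both user and kernel
 * pointers. SO_FOO and struct foo_opt are placeholders:
 *
 *	case SO_FOO:
 *		if (optlen < sizeof(struct foo_opt)) {
 *			ret = -EINVAL;
 *			break;
 *		}
 *		if (copy_from_sockptr(&foo, optval, sizeof(foo))) {
 *			ret = -EFAULT;
 *			break;
 *		}
 *		break;
 *
 * This mirrors how SO_LINGER and SO_TXTIME are handled above for arguments
 * larger than an int.
 */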

static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *user_ns = current_user_ns();
	int i;

	for (i = 0; i < src->ngroups; i++)
		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
			return -EFAULT;

	return 0;
}

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct __kernel_sock_timeval stm;
		struct sock_txtime txtime;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
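
/*
 * Illustrative sketch (userspace view): options that return more than an int,
 * such as SO_MEMINFO above, are read into an appropriately sized buffer and
 * the kernel writes back the length it actually used:
 *
 *	unsigned int mem[SK_MEMINFO_VARS];
 *	socklen_t len = sizeof(mem);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, mem, &len) == 0)
 *		mem[SK_MEMINFO_RMEM_ALLOC] holds the current rmem allocation
 *
 * If the supplied length is smaller, the result is truncated to that many
 * bytes, matching the min_t() clamp in the SO_MEMINFO case.
 */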

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
		sk_tx_queue_clear(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
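
/*
 * Illustrative sketch (hypothetical protocol create path): protocols pair
 * sk_alloc() with sk_free(), releasing the sock again if later setup fails.
 * PF_FOO, foo_proto and foo_setup() are placeholders; sock_init_data() is the
 * generic initializer defined later in this file.
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	if (foo_setup(sk)) {
 *		sk_free(sk);
 *		return -ENOMEM;
 *	}
 */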

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_destruct(struct sock *sk)
{
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);

	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
		reuseport_detach_sock(sk);
		use_call_rcu = true;
	}

	if (use_call_rcu)
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}

static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt))
		get_net(sock_net(newsk));
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();
	if (likely(newsk->sk_net_refcnt))
		sock_inuse_add(sock_net(newsk), 1);

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* Increment the counter in the same struct proto as the master
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
	 * is the same as sk->sk_prot->socks, as this field was copied
	 * with memcpy).
	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always was incrementing the
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
	 * to be taken into account in all callers. -acme
	 */
	sk_refcnt_debug_inc(newsk);
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
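
/*
 * Illustrative sketch: callers of sk_clone_lock() receive the clone locked
 * with bh_lock_sock() and with sk_refcnt set to 2, and must unlock it
 * themselves; a partially initialized clone is discarded with
 * sk_free_unlock_clone(). foo_init_clone() is a placeholder protocol hook:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (!newsk)
 *		return NULL;
 *	if (foo_init_clone(newsk)) {
 *		sk_free_unlock_clone(newsk);
 *		return NULL;
 *	}
 *	...
 *	bh_unlock_sock(newsk);
 *
 * This is broadly the pattern used by the inet connection sock code when
 * creating child sockets from listeners.
 */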
2062 */ 2063 void sock_wfree(struct sk_buff *skb) 2064 { 2065 struct sock *sk = skb->sk; 2066 unsigned int len = skb->truesize; 2067 2068 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2069 /* 2070 * Keep a reference on sk_wmem_alloc, this will be released 2071 * after sk_write_space() call 2072 */ 2073 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2074 sk->sk_write_space(sk); 2075 len = 1; 2076 } 2077 /* 2078 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2079 * could not do because of in-flight packets 2080 */ 2081 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2082 __sk_free(sk); 2083 } 2084 EXPORT_SYMBOL(sock_wfree); 2085 2086 /* This variant of sock_wfree() is used by TCP, 2087 * since it sets SOCK_USE_WRITE_QUEUE. 2088 */ 2089 void __sock_wfree(struct sk_buff *skb) 2090 { 2091 struct sock *sk = skb->sk; 2092 2093 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2094 __sk_free(sk); 2095 } 2096 2097 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2098 { 2099 skb_orphan(skb); 2100 skb->sk = sk; 2101 #ifdef CONFIG_INET 2102 if (unlikely(!sk_fullsock(sk))) { 2103 skb->destructor = sock_edemux; 2104 sock_hold(sk); 2105 return; 2106 } 2107 #endif 2108 skb->destructor = sock_wfree; 2109 skb_set_hash_from_sk(skb, sk); 2110 /* 2111 * We used to take a refcount on sk, but following operation 2112 * is enough to guarantee sk_free() wont free this sock until 2113 * all in-flight packets are completed 2114 */ 2115 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2116 } 2117 EXPORT_SYMBOL(skb_set_owner_w); 2118 2119 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2120 { 2121 #ifdef CONFIG_TLS_DEVICE 2122 /* Drivers depend on in-order delivery for crypto offload, 2123 * partial orphan breaks out-of-order-OK logic. 2124 */ 2125 if (skb->decrypted) 2126 return false; 2127 #endif 2128 return (skb->destructor == sock_wfree || 2129 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2130 } 2131 2132 /* This helper is used by netem, as it can hold packets in its 2133 * delay queue. We want to allow the owner socket to send more 2134 * packets, as if they were already TX completed by a typical driver. 2135 * But we also want to keep skb->sk set because some packet schedulers 2136 * rely on it (sch_fq for example). 2137 */ 2138 void skb_orphan_partial(struct sk_buff *skb) 2139 { 2140 if (skb_is_tcp_pure_ack(skb)) 2141 return; 2142 2143 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2144 return; 2145 2146 skb_orphan(skb); 2147 } 2148 EXPORT_SYMBOL(skb_orphan_partial); 2149 2150 /* 2151 * Read buffer destructor automatically called from kfree_skb. 2152 */ 2153 void sock_rfree(struct sk_buff *skb) 2154 { 2155 struct sock *sk = skb->sk; 2156 unsigned int len = skb->truesize; 2157 2158 atomic_sub(len, &sk->sk_rmem_alloc); 2159 sk_mem_uncharge(sk, len); 2160 } 2161 EXPORT_SYMBOL(sock_rfree); 2162 2163 /* 2164 * Buffer destructor for skbs that are not used directly in read or write 2165 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2166 */ 2167 void sock_efree(struct sk_buff *skb) 2168 { 2169 sock_put(skb->sk); 2170 } 2171 EXPORT_SYMBOL(sock_efree); 2172 2173 /* Buffer destructor for prefetch/receive path where reference count may 2174 * not be held, e.g. for listen sockets. 
2175 */ 2176 #ifdef CONFIG_INET 2177 void sock_pfree(struct sk_buff *skb) 2178 { 2179 if (sk_is_refcounted(skb->sk)) 2180 sock_gen_put(skb->sk); 2181 } 2182 EXPORT_SYMBOL(sock_pfree); 2183 #endif /* CONFIG_INET */ 2184 2185 kuid_t sock_i_uid(struct sock *sk) 2186 { 2187 kuid_t uid; 2188 2189 read_lock_bh(&sk->sk_callback_lock); 2190 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2191 read_unlock_bh(&sk->sk_callback_lock); 2192 return uid; 2193 } 2194 EXPORT_SYMBOL(sock_i_uid); 2195 2196 unsigned long sock_i_ino(struct sock *sk) 2197 { 2198 unsigned long ino; 2199 2200 read_lock_bh(&sk->sk_callback_lock); 2201 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2202 read_unlock_bh(&sk->sk_callback_lock); 2203 return ino; 2204 } 2205 EXPORT_SYMBOL(sock_i_ino); 2206 2207 /* 2208 * Allocate a skb from the socket's send buffer. 2209 */ 2210 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2211 gfp_t priority) 2212 { 2213 if (force || 2214 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2215 struct sk_buff *skb = alloc_skb(size, priority); 2216 2217 if (skb) { 2218 skb_set_owner_w(skb, sk); 2219 return skb; 2220 } 2221 } 2222 return NULL; 2223 } 2224 EXPORT_SYMBOL(sock_wmalloc); 2225 2226 static void sock_ofree(struct sk_buff *skb) 2227 { 2228 struct sock *sk = skb->sk; 2229 2230 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2231 } 2232 2233 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2234 gfp_t priority) 2235 { 2236 struct sk_buff *skb; 2237 2238 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2239 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2240 sysctl_optmem_max) 2241 return NULL; 2242 2243 skb = alloc_skb(size, priority); 2244 if (!skb) 2245 return NULL; 2246 2247 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2248 skb->sk = sk; 2249 skb->destructor = sock_ofree; 2250 return skb; 2251 } 2252 2253 /* 2254 * Allocate a memory block from the socket's option memory buffer. 2255 */ 2256 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2257 { 2258 if ((unsigned int)size <= sysctl_optmem_max && 2259 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2260 void *mem; 2261 /* First do the add, to avoid the race if kmalloc 2262 * might sleep. 2263 */ 2264 atomic_add(size, &sk->sk_omem_alloc); 2265 mem = kmalloc(size, priority); 2266 if (mem) 2267 return mem; 2268 atomic_sub(size, &sk->sk_omem_alloc); 2269 } 2270 return NULL; 2271 } 2272 EXPORT_SYMBOL(sock_kmalloc); 2273 2274 /* Free an option memory block. Note, we actually want the inline 2275 * here as this allows gcc to detect the nullify and fold away the 2276 * condition entirely. 2277 */ 2278 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2279 const bool nullify) 2280 { 2281 if (WARN_ON_ONCE(!mem)) 2282 return; 2283 if (nullify) 2284 kfree_sensitive(mem); 2285 else 2286 kfree(mem); 2287 atomic_sub(size, &sk->sk_omem_alloc); 2288 } 2289 2290 void sock_kfree_s(struct sock *sk, void *mem, int size) 2291 { 2292 __sock_kfree_s(sk, mem, size, false); 2293 } 2294 EXPORT_SYMBOL(sock_kfree_s); 2295 2296 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2297 { 2298 __sock_kfree_s(sk, mem, size, true); 2299 } 2300 EXPORT_SYMBOL(sock_kzfree_s); 2301 2302 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2303 I think, these locks should be removed for datagram sockets. 
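 *
 * Editor's aside (illustrative sketch, not part of the original source):
 * the option-memory helpers above are the usual way to attach small
 * per-socket control data; the allocation is charged to sk_omem_alloc
 * and bounded by sysctl_optmem_max, e.g.:
 *
 *	struct foo_opts *opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *
 *	if (!opts)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opts, sizeof(*opts));	// sock_kzfree_s() if the
 *						// contents are sensitive
 *
 * (struct foo_opts is a placeholder.)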
2304 */ 2305 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2306 { 2307 DEFINE_WAIT(wait); 2308 2309 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2310 for (;;) { 2311 if (!timeo) 2312 break; 2313 if (signal_pending(current)) 2314 break; 2315 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2316 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2317 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2318 break; 2319 if (sk->sk_shutdown & SEND_SHUTDOWN) 2320 break; 2321 if (sk->sk_err) 2322 break; 2323 timeo = schedule_timeout(timeo); 2324 } 2325 finish_wait(sk_sleep(sk), &wait); 2326 return timeo; 2327 } 2328 2329 2330 /* 2331 * Generic send/receive buffer handlers 2332 */ 2333 2334 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2335 unsigned long data_len, int noblock, 2336 int *errcode, int max_page_order) 2337 { 2338 struct sk_buff *skb; 2339 long timeo; 2340 int err; 2341 2342 timeo = sock_sndtimeo(sk, noblock); 2343 for (;;) { 2344 err = sock_error(sk); 2345 if (err != 0) 2346 goto failure; 2347 2348 err = -EPIPE; 2349 if (sk->sk_shutdown & SEND_SHUTDOWN) 2350 goto failure; 2351 2352 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2353 break; 2354 2355 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2356 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2357 err = -EAGAIN; 2358 if (!timeo) 2359 goto failure; 2360 if (signal_pending(current)) 2361 goto interrupted; 2362 timeo = sock_wait_for_wmem(sk, timeo); 2363 } 2364 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2365 errcode, sk->sk_allocation); 2366 if (skb) 2367 skb_set_owner_w(skb, sk); 2368 return skb; 2369 2370 interrupted: 2371 err = sock_intr_errno(timeo); 2372 failure: 2373 *errcode = err; 2374 return NULL; 2375 } 2376 EXPORT_SYMBOL(sock_alloc_send_pskb); 2377 2378 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2379 int noblock, int *errcode) 2380 { 2381 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2382 } 2383 EXPORT_SYMBOL(sock_alloc_send_skb); 2384 2385 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2386 struct sockcm_cookie *sockc) 2387 { 2388 u32 tsflags; 2389 2390 switch (cmsg->cmsg_type) { 2391 case SO_MARK: 2392 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2393 return -EPERM; 2394 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2395 return -EINVAL; 2396 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2397 break; 2398 case SO_TIMESTAMPING_OLD: 2399 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2400 return -EINVAL; 2401 2402 tsflags = *(u32 *)CMSG_DATA(cmsg); 2403 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2404 return -EINVAL; 2405 2406 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2407 sockc->tsflags |= tsflags; 2408 break; 2409 case SCM_TXTIME: 2410 if (!sock_flag(sk, SOCK_TXTIME)) 2411 return -EINVAL; 2412 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2413 return -EINVAL; 2414 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2415 break; 2416 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ 2417 case SCM_RIGHTS: 2418 case SCM_CREDENTIALS: 2419 break; 2420 default: 2421 return -EINVAL; 2422 } 2423 return 0; 2424 } 2425 EXPORT_SYMBOL(__sock_cmsg_send); 2426 2427 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2428 struct sockcm_cookie *sockc) 2429 { 2430 struct cmsghdr *cmsg; 2431 int ret; 2432 2433 for_each_cmsghdr(cmsg, msg) { 2434 if (!CMSG_OK(msg, cmsg)) 2435 return -EINVAL; 2436 if (cmsg->cmsg_level != SOL_SOCKET) 2437 continue; 2438 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2439 if (ret) 2440 return ret; 2441 } 2442 return 0; 2443 } 2444 EXPORT_SYMBOL(sock_cmsg_send); 2445 2446 static void sk_enter_memory_pressure(struct sock *sk) 2447 { 2448 if (!sk->sk_prot->enter_memory_pressure) 2449 return; 2450 2451 sk->sk_prot->enter_memory_pressure(sk); 2452 } 2453 2454 static void sk_leave_memory_pressure(struct sock *sk) 2455 { 2456 if (sk->sk_prot->leave_memory_pressure) { 2457 sk->sk_prot->leave_memory_pressure(sk); 2458 } else { 2459 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2460 2461 if (memory_pressure && READ_ONCE(*memory_pressure)) 2462 WRITE_ONCE(*memory_pressure, 0); 2463 } 2464 } 2465 2466 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2467 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 2468 2469 /** 2470 * skb_page_frag_refill - check that a page_frag contains enough room 2471 * @sz: minimum size of the fragment we want to get 2472 * @pfrag: pointer to page_frag 2473 * @gfp: priority for memory allocation 2474 * 2475 * Note: While this allocator tries to use high order pages, there is 2476 * no guarantee that allocations succeed. Therefore, @sz MUST be 2477 * less or equal than PAGE_SIZE. 2478 */ 2479 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2480 { 2481 if (pfrag->page) { 2482 if (page_ref_count(pfrag->page) == 1) { 2483 pfrag->offset = 0; 2484 return true; 2485 } 2486 if (pfrag->offset + sz <= pfrag->size) 2487 return true; 2488 put_page(pfrag->page); 2489 } 2490 2491 pfrag->offset = 0; 2492 if (SKB_FRAG_PAGE_ORDER && 2493 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 2494 /* Avoid direct reclaim but allow kswapd to wake */ 2495 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2496 __GFP_COMP | __GFP_NOWARN | 2497 __GFP_NORETRY, 2498 SKB_FRAG_PAGE_ORDER); 2499 if (likely(pfrag->page)) { 2500 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2501 return true; 2502 } 2503 } 2504 pfrag->page = alloc_page(gfp); 2505 if (likely(pfrag->page)) { 2506 pfrag->size = PAGE_SIZE; 2507 return true; 2508 } 2509 return false; 2510 } 2511 EXPORT_SYMBOL(skb_page_frag_refill); 2512 2513 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2514 { 2515 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2516 return true; 2517 2518 sk_enter_memory_pressure(sk); 2519 sk_stream_moderate_sndbuf(sk); 2520 return false; 2521 } 2522 EXPORT_SYMBOL(sk_page_frag_refill); 2523 2524 void __lock_sock(struct sock *sk) 2525 __releases(&sk->sk_lock.slock) 2526 __acquires(&sk->sk_lock.slock) 2527 { 2528 DEFINE_WAIT(wait); 2529 2530 for (;;) { 2531 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2532 TASK_UNINTERRUPTIBLE); 2533 spin_unlock_bh(&sk->sk_lock.slock); 2534 schedule(); 2535 spin_lock_bh(&sk->sk_lock.slock); 2536 if (!sock_owned_by_user(sk)) 2537 break; 2538 } 2539 finish_wait(&sk->sk_lock.wq, &wait); 2540 } 2541 2542 void __release_sock(struct sock *sk) 2543 __releases(&sk->sk_lock.slock) 2544 __acquires(&sk->sk_lock.slock) 2545 { 2546 struct sk_buff *skb, 
*next; 2547 2548 while ((skb = sk->sk_backlog.head) != NULL) { 2549 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 2550 2551 spin_unlock_bh(&sk->sk_lock.slock); 2552 2553 do { 2554 next = skb->next; 2555 prefetch(next); 2556 WARN_ON_ONCE(skb_dst_is_noref(skb)); 2557 skb_mark_not_on_list(skb); 2558 sk_backlog_rcv(sk, skb); 2559 2560 cond_resched(); 2561 2562 skb = next; 2563 } while (skb != NULL); 2564 2565 spin_lock_bh(&sk->sk_lock.slock); 2566 } 2567 2568 /* 2569 * Doing the zeroing here guarantees we cannot loop forever 2570 * while a wild producer attempts to flood us. 2571 */ 2572 sk->sk_backlog.len = 0; 2573 } 2574 2575 void __sk_flush_backlog(struct sock *sk) 2576 { 2577 spin_lock_bh(&sk->sk_lock.slock); 2578 __release_sock(sk); 2579 spin_unlock_bh(&sk->sk_lock.slock); 2580 } 2581 2582 /** 2583 * sk_wait_data - wait for data to arrive at sk_receive_queue 2584 * @sk: sock to wait on 2585 * @timeo: for how long 2586 * @skb: last skb seen on sk_receive_queue 2587 * 2588 * Now socket state including sk->sk_err is changed only under lock, 2589 * hence we may omit checks after joining the wait queue. 2590 * We check the receive queue before schedule() only as an optimization; 2591 * it is very likely that release_sock() added new data. 2592 */ 2593 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2594 { 2595 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2596 int rc; 2597 2598 add_wait_queue(sk_sleep(sk), &wait); 2599 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2600 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 2601 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2602 remove_wait_queue(sk_sleep(sk), &wait); 2603 return rc; 2604 } 2605 EXPORT_SYMBOL(sk_wait_data); 2606 2607 /** 2608 * __sk_mem_raise_allocated - increase memory_allocated 2609 * @sk: socket 2610 * @size: memory size to allocate 2611 * @amt: pages to allocate 2612 * @kind: allocation type 2613 * 2614 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc 2615 */ 2616 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 2617 { 2618 struct proto *prot = sk->sk_prot; 2619 long allocated = sk_memory_allocated_add(sk, amt); 2620 bool charged = true; 2621 2622 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2623 !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) 2624 goto suppress_allocation; 2625 2626 /* Under limit. */ 2627 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2628 sk_leave_memory_pressure(sk); 2629 return 1; 2630 } 2631 2632 /* Under pressure. */ 2633 if (allocated > sk_prot_mem_limits(sk, 1)) 2634 sk_enter_memory_pressure(sk); 2635 2636 /* Over hard limit.
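 * Editor's note (illustrative, not part of the original source): the
 * indices 0, 1 and 2 used with sk_prot_mem_limits() in this function
 * select the protocol's three-element memory sysctl (for TCP, the
 * tcp_mem triplet: minimum, pressure threshold, hard limit, in pages),
 * so the test below fires once the protocol's total exceeds the last
 * element.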
*/ 2637 if (allocated > sk_prot_mem_limits(sk, 2)) 2638 goto suppress_allocation; 2639 2640 /* guarantee minimum buffer size under pressure */ 2641 if (kind == SK_MEM_RECV) { 2642 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2643 return 1; 2644 2645 } else { /* SK_MEM_SEND */ 2646 int wmem0 = sk_get_wmem0(sk, prot); 2647 2648 if (sk->sk_type == SOCK_STREAM) { 2649 if (sk->sk_wmem_queued < wmem0) 2650 return 1; 2651 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2652 return 1; 2653 } 2654 } 2655 2656 if (sk_has_memory_pressure(sk)) { 2657 u64 alloc; 2658 2659 if (!sk_under_memory_pressure(sk)) 2660 return 1; 2661 alloc = sk_sockets_allocated_read_positive(sk); 2662 if (sk_prot_mem_limits(sk, 2) > alloc * 2663 sk_mem_pages(sk->sk_wmem_queued + 2664 atomic_read(&sk->sk_rmem_alloc) + 2665 sk->sk_forward_alloc)) 2666 return 1; 2667 } 2668 2669 suppress_allocation: 2670 2671 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2672 sk_stream_moderate_sndbuf(sk); 2673 2674 /* Fail only if socket is _under_ its sndbuf. 2675 * In this case we cannot block, so that we have to fail. 2676 */ 2677 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2678 return 1; 2679 } 2680 2681 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2682 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2683 2684 sk_memory_allocated_sub(sk, amt); 2685 2686 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2687 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2688 2689 return 0; 2690 } 2691 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2692 2693 /** 2694 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2695 * @sk: socket 2696 * @size: memory size to allocate 2697 * @kind: allocation type 2698 * 2699 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2700 * rmem allocation. This function assumes that protocols which have 2701 * memory_pressure use sk_wmem_queued as write buffer accounting. 
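 *
 * Editor's note (illustrative sketch, not part of the original source):
 * protocol code usually goes through the inline wrappers from
 * include/net/sock.h rather than calling this directly, e.g. on the
 * transmit side roughly:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;		// or wait/moderate sndbuf
 *	sk_mem_charge(sk, skb->truesize);
 *
 * with the quanta given back through sk_mem_uncharge() and
 * sk_mem_reclaim() once the data has been consumed.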
2702 */ 2703 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2704 { 2705 int ret, amt = sk_mem_pages(size); 2706 2707 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2708 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2709 if (!ret) 2710 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2711 return ret; 2712 } 2713 EXPORT_SYMBOL(__sk_mem_schedule); 2714 2715 /** 2716 * __sk_mem_reduce_allocated - reclaim memory_allocated 2717 * @sk: socket 2718 * @amount: number of quanta 2719 * 2720 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2721 */ 2722 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2723 { 2724 sk_memory_allocated_sub(sk, amount); 2725 2726 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2727 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2728 2729 if (sk_under_memory_pressure(sk) && 2730 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2731 sk_leave_memory_pressure(sk); 2732 } 2733 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2734 2735 /** 2736 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2737 * @sk: socket 2738 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2739 */ 2740 void __sk_mem_reclaim(struct sock *sk, int amount) 2741 { 2742 amount >>= SK_MEM_QUANTUM_SHIFT; 2743 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2744 __sk_mem_reduce_allocated(sk, amount); 2745 } 2746 EXPORT_SYMBOL(__sk_mem_reclaim); 2747 2748 int sk_set_peek_off(struct sock *sk, int val) 2749 { 2750 sk->sk_peek_off = val; 2751 return 0; 2752 } 2753 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2754 2755 /* 2756 * Set of default routines for initialising struct proto_ops when 2757 * the protocol does not support a particular function. In certain 2758 * cases where it makes no sense for a protocol to have a "do nothing" 2759 * function, some default processing is provided. 
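 *
 * Editor's note (illustrative sketch, not part of the original source;
 * the "foo" names are placeholders): a protocol that supports only a
 * subset of the socket calls can plug these stubs straight into its
 * proto_ops, for example:
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.release	= foo_release,
 *		.bind		= foo_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= foo_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.sendmsg	= foo_sendmsg,
 *		.recvmsg	= sock_common_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};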
2760 */ 2761 2762 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2763 { 2764 return -EOPNOTSUPP; 2765 } 2766 EXPORT_SYMBOL(sock_no_bind); 2767 2768 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2769 int len, int flags) 2770 { 2771 return -EOPNOTSUPP; 2772 } 2773 EXPORT_SYMBOL(sock_no_connect); 2774 2775 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2776 { 2777 return -EOPNOTSUPP; 2778 } 2779 EXPORT_SYMBOL(sock_no_socketpair); 2780 2781 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2782 bool kern) 2783 { 2784 return -EOPNOTSUPP; 2785 } 2786 EXPORT_SYMBOL(sock_no_accept); 2787 2788 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2789 int peer) 2790 { 2791 return -EOPNOTSUPP; 2792 } 2793 EXPORT_SYMBOL(sock_no_getname); 2794 2795 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2796 { 2797 return -EOPNOTSUPP; 2798 } 2799 EXPORT_SYMBOL(sock_no_ioctl); 2800 2801 int sock_no_listen(struct socket *sock, int backlog) 2802 { 2803 return -EOPNOTSUPP; 2804 } 2805 EXPORT_SYMBOL(sock_no_listen); 2806 2807 int sock_no_shutdown(struct socket *sock, int how) 2808 { 2809 return -EOPNOTSUPP; 2810 } 2811 EXPORT_SYMBOL(sock_no_shutdown); 2812 2813 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2814 { 2815 return -EOPNOTSUPP; 2816 } 2817 EXPORT_SYMBOL(sock_no_sendmsg); 2818 2819 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2820 { 2821 return -EOPNOTSUPP; 2822 } 2823 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2824 2825 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2826 int flags) 2827 { 2828 return -EOPNOTSUPP; 2829 } 2830 EXPORT_SYMBOL(sock_no_recvmsg); 2831 2832 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2833 { 2834 /* Mirror missing mmap method error code */ 2835 return -ENODEV; 2836 } 2837 EXPORT_SYMBOL(sock_no_mmap); 2838 2839 /* 2840 * When a file is received (via SCM_RIGHTS, etc), we must bump the 2841 * various sock-based usage counts. 
2842 */ 2843 void __receive_sock(struct file *file) 2844 { 2845 struct socket *sock; 2846 2847 sock = sock_from_file(file); 2848 if (sock) { 2849 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 2850 sock_update_classid(&sock->sk->sk_cgrp_data); 2851 } 2852 } 2853 2854 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2855 { 2856 ssize_t res; 2857 struct msghdr msg = {.msg_flags = flags}; 2858 struct kvec iov; 2859 char *kaddr = kmap(page); 2860 iov.iov_base = kaddr + offset; 2861 iov.iov_len = size; 2862 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2863 kunmap(page); 2864 return res; 2865 } 2866 EXPORT_SYMBOL(sock_no_sendpage); 2867 2868 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2869 int offset, size_t size, int flags) 2870 { 2871 ssize_t res; 2872 struct msghdr msg = {.msg_flags = flags}; 2873 struct kvec iov; 2874 char *kaddr = kmap(page); 2875 2876 iov.iov_base = kaddr + offset; 2877 iov.iov_len = size; 2878 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2879 kunmap(page); 2880 return res; 2881 } 2882 EXPORT_SYMBOL(sock_no_sendpage_locked); 2883 2884 /* 2885 * Default Socket Callbacks 2886 */ 2887 2888 static void sock_def_wakeup(struct sock *sk) 2889 { 2890 struct socket_wq *wq; 2891 2892 rcu_read_lock(); 2893 wq = rcu_dereference(sk->sk_wq); 2894 if (skwq_has_sleeper(wq)) 2895 wake_up_interruptible_all(&wq->wait); 2896 rcu_read_unlock(); 2897 } 2898 2899 static void sock_def_error_report(struct sock *sk) 2900 { 2901 struct socket_wq *wq; 2902 2903 rcu_read_lock(); 2904 wq = rcu_dereference(sk->sk_wq); 2905 if (skwq_has_sleeper(wq)) 2906 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2907 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2908 rcu_read_unlock(); 2909 } 2910 2911 void sock_def_readable(struct sock *sk) 2912 { 2913 struct socket_wq *wq; 2914 2915 rcu_read_lock(); 2916 wq = rcu_dereference(sk->sk_wq); 2917 if (skwq_has_sleeper(wq)) 2918 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2919 EPOLLRDNORM | EPOLLRDBAND); 2920 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2921 rcu_read_unlock(); 2922 } 2923 2924 static void sock_def_write_space(struct sock *sk) 2925 { 2926 struct socket_wq *wq; 2927 2928 rcu_read_lock(); 2929 2930 /* Do not wake up a writer until he can make "significant" 2931 * progress. 
--DaveM 2932 */ 2933 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { 2934 wq = rcu_dereference(sk->sk_wq); 2935 if (skwq_has_sleeper(wq)) 2936 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2937 EPOLLWRNORM | EPOLLWRBAND); 2938 2939 /* Should agree with poll, otherwise some programs break */ 2940 if (sock_writeable(sk)) 2941 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2942 } 2943 2944 rcu_read_unlock(); 2945 } 2946 2947 static void sock_def_destruct(struct sock *sk) 2948 { 2949 } 2950 2951 void sk_send_sigurg(struct sock *sk) 2952 { 2953 if (sk->sk_socket && sk->sk_socket->file) 2954 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2955 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2956 } 2957 EXPORT_SYMBOL(sk_send_sigurg); 2958 2959 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2960 unsigned long expires) 2961 { 2962 if (!mod_timer(timer, expires)) 2963 sock_hold(sk); 2964 } 2965 EXPORT_SYMBOL(sk_reset_timer); 2966 2967 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2968 { 2969 if (del_timer(timer)) 2970 __sock_put(sk); 2971 } 2972 EXPORT_SYMBOL(sk_stop_timer); 2973 2974 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 2975 { 2976 if (del_timer_sync(timer)) 2977 __sock_put(sk); 2978 } 2979 EXPORT_SYMBOL(sk_stop_timer_sync); 2980 2981 void sock_init_data(struct socket *sock, struct sock *sk) 2982 { 2983 sk_init_common(sk); 2984 sk->sk_send_head = NULL; 2985 2986 timer_setup(&sk->sk_timer, NULL, 0); 2987 2988 sk->sk_allocation = GFP_KERNEL; 2989 sk->sk_rcvbuf = sysctl_rmem_default; 2990 sk->sk_sndbuf = sysctl_wmem_default; 2991 sk->sk_state = TCP_CLOSE; 2992 sk_set_socket(sk, sock); 2993 2994 sock_set_flag(sk, SOCK_ZAPPED); 2995 2996 if (sock) { 2997 sk->sk_type = sock->type; 2998 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 2999 sock->sk = sk; 3000 sk->sk_uid = SOCK_INODE(sock)->i_uid; 3001 } else { 3002 RCU_INIT_POINTER(sk->sk_wq, NULL); 3003 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 3004 } 3005 3006 rwlock_init(&sk->sk_callback_lock); 3007 if (sk->sk_kern_sock) 3008 lockdep_set_class_and_name( 3009 &sk->sk_callback_lock, 3010 af_kern_callback_keys + sk->sk_family, 3011 af_family_kern_clock_key_strings[sk->sk_family]); 3012 else 3013 lockdep_set_class_and_name( 3014 &sk->sk_callback_lock, 3015 af_callback_keys + sk->sk_family, 3016 af_family_clock_key_strings[sk->sk_family]); 3017 3018 sk->sk_state_change = sock_def_wakeup; 3019 sk->sk_data_ready = sock_def_readable; 3020 sk->sk_write_space = sock_def_write_space; 3021 sk->sk_error_report = sock_def_error_report; 3022 sk->sk_destruct = sock_def_destruct; 3023 3024 sk->sk_frag.page = NULL; 3025 sk->sk_frag.offset = 0; 3026 sk->sk_peek_off = -1; 3027 3028 sk->sk_peer_pid = NULL; 3029 sk->sk_peer_cred = NULL; 3030 sk->sk_write_pending = 0; 3031 sk->sk_rcvlowat = 1; 3032 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3033 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3034 3035 sk->sk_stamp = SK_DEFAULT_STAMP; 3036 #if BITS_PER_LONG==32 3037 seqlock_init(&sk->sk_stamp_seq); 3038 #endif 3039 atomic_set(&sk->sk_zckey, 0); 3040 3041 #ifdef CONFIG_NET_RX_BUSY_POLL 3042 sk->sk_napi_id = 0; 3043 sk->sk_ll_usec = sysctl_net_busy_read; 3044 #endif 3045 3046 sk->sk_max_pacing_rate = ~0UL; 3047 sk->sk_pacing_rate = ~0UL; 3048 WRITE_ONCE(sk->sk_pacing_shift, 10); 3049 sk->sk_incoming_cpu = -1; 3050 3051 sk_rx_queue_clear(sk); 3052 /* 3053 * Before updating sk_refcnt, we must commit prior changes to memory 3054 * (Documentation/RCU/rculist_nulls.rst for details) 3055 */ 3056 
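	/* Editor's note (illustrative, not part of the original source):
	 * the barrier below pairs with lockless lookups that do, roughly,
	 *
	 *	if (refcount_inc_not_zero(&sk->sk_refcnt)) {
	 *		// re-check the lookup keys, drop the ref on mismatch
	 *	}
	 *
	 * so a socket must never be observable with a non-zero refcount
	 * before the fields initialised above are visible (see the
	 * rculist_nulls document cited above).
	 */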
smp_wmb(); 3057 refcount_set(&sk->sk_refcnt, 1); 3058 atomic_set(&sk->sk_drops, 0); 3059 } 3060 EXPORT_SYMBOL(sock_init_data); 3061 3062 void lock_sock_nested(struct sock *sk, int subclass) 3063 { 3064 might_sleep(); 3065 spin_lock_bh(&sk->sk_lock.slock); 3066 if (sk->sk_lock.owned) 3067 __lock_sock(sk); 3068 sk->sk_lock.owned = 1; 3069 spin_unlock(&sk->sk_lock.slock); 3070 /* 3071 * The sk_lock has mutex_lock() semantics here: 3072 */ 3073 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3074 local_bh_enable(); 3075 } 3076 EXPORT_SYMBOL(lock_sock_nested); 3077 3078 void release_sock(struct sock *sk) 3079 { 3080 spin_lock_bh(&sk->sk_lock.slock); 3081 if (sk->sk_backlog.tail) 3082 __release_sock(sk); 3083 3084 /* Warning : release_cb() might need to release sk ownership, 3085 * ie call sock_release_ownership(sk) before us. 3086 */ 3087 if (sk->sk_prot->release_cb) 3088 sk->sk_prot->release_cb(sk); 3089 3090 sock_release_ownership(sk); 3091 if (waitqueue_active(&sk->sk_lock.wq)) 3092 wake_up(&sk->sk_lock.wq); 3093 spin_unlock_bh(&sk->sk_lock.slock); 3094 } 3095 EXPORT_SYMBOL(release_sock); 3096 3097 /** 3098 * lock_sock_fast - fast version of lock_sock 3099 * @sk: socket 3100 * 3101 * This version should be used for very small section, where process wont block 3102 * return false if fast path is taken: 3103 * 3104 * sk_lock.slock locked, owned = 0, BH disabled 3105 * 3106 * return true if slow path is taken: 3107 * 3108 * sk_lock.slock unlocked, owned = 1, BH enabled 3109 */ 3110 bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3111 { 3112 might_sleep(); 3113 spin_lock_bh(&sk->sk_lock.slock); 3114 3115 if (!sk->sk_lock.owned) 3116 /* 3117 * Note : We must disable BH 3118 */ 3119 return false; 3120 3121 __lock_sock(sk); 3122 sk->sk_lock.owned = 1; 3123 spin_unlock(&sk->sk_lock.slock); 3124 /* 3125 * The sk_lock has mutex_lock() semantics here: 3126 */ 3127 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); 3128 __acquire(&sk->sk_lock.slock); 3129 local_bh_enable(); 3130 return true; 3131 } 3132 EXPORT_SYMBOL(lock_sock_fast); 3133 3134 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3135 bool timeval, bool time32) 3136 { 3137 struct sock *sk = sock->sk; 3138 struct timespec64 ts; 3139 3140 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3141 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3142 if (ts.tv_sec == -1) 3143 return -ENOENT; 3144 if (ts.tv_sec == 0) { 3145 ktime_t kt = ktime_get_real(); 3146 sock_write_timestamp(sk, kt); 3147 ts = ktime_to_timespec64(kt); 3148 } 3149 3150 if (timeval) 3151 ts.tv_nsec /= 1000; 3152 3153 #ifdef CONFIG_COMPAT_32BIT_TIME 3154 if (time32) 3155 return put_old_timespec32(&ts, userstamp); 3156 #endif 3157 #ifdef CONFIG_SPARC64 3158 /* beware of padding in sparc64 timeval */ 3159 if (timeval && !in_compat_syscall()) { 3160 struct __kernel_old_timeval __user tv = { 3161 .tv_sec = ts.tv_sec, 3162 .tv_usec = ts.tv_nsec, 3163 }; 3164 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3165 return -EFAULT; 3166 return 0; 3167 } 3168 #endif 3169 return put_timespec64(&ts, userstamp); 3170 } 3171 EXPORT_SYMBOL(sock_gettstamp); 3172 3173 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3174 { 3175 if (!sock_flag(sk, flag)) { 3176 unsigned long previous_flags = sk->sk_flags; 3177 3178 sock_set_flag(sk, flag); 3179 /* 3180 * we just set one of the two flags which require net 3181 * time stamping, but time stamping might have been on 3182 * already because of the other one 3183 */ 3184 if (sock_needs_netstamp(sk) 
&& 3185 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3186 net_enable_timestamp(); 3187 } 3188 } 3189 3190 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3191 int level, int type) 3192 { 3193 struct sock_exterr_skb *serr; 3194 struct sk_buff *skb; 3195 int copied, err; 3196 3197 err = -EAGAIN; 3198 skb = sock_dequeue_err_skb(sk); 3199 if (skb == NULL) 3200 goto out; 3201 3202 copied = skb->len; 3203 if (copied > len) { 3204 msg->msg_flags |= MSG_TRUNC; 3205 copied = len; 3206 } 3207 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3208 if (err) 3209 goto out_free_skb; 3210 3211 sock_recv_timestamp(msg, sk, skb); 3212 3213 serr = SKB_EXT_ERR(skb); 3214 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3215 3216 msg->msg_flags |= MSG_ERRQUEUE; 3217 err = copied; 3218 3219 out_free_skb: 3220 kfree_skb(skb); 3221 out: 3222 return err; 3223 } 3224 EXPORT_SYMBOL(sock_recv_errqueue); 3225 3226 /* 3227 * Get a socket option on a socket. 3228 * 3229 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3230 * asynchronous errors should be reported by getsockopt. We assume 3231 * this means if you specify SO_ERROR (otherwise what's the point of it). 3232 */ 3233 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3234 char __user *optval, int __user *optlen) 3235 { 3236 struct sock *sk = sock->sk; 3237 3238 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); 3239 } 3240 EXPORT_SYMBOL(sock_common_getsockopt); 3241 3242 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3243 int flags) 3244 { 3245 struct sock *sk = sock->sk; 3246 int addr_len = 0; 3247 int err; 3248 3249 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, 3250 flags & ~MSG_DONTWAIT, &addr_len); 3251 if (err >= 0) 3252 msg->msg_namelen = addr_len; 3253 return err; 3254 } 3255 EXPORT_SYMBOL(sock_common_recvmsg); 3256 3257 /* 3258 * Set socket options on a socket. 3259 */ 3260 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3261 sockptr_t optval, unsigned int optlen) 3262 { 3263 struct sock *sk = sock->sk; 3264 3265 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 3266 } 3267 EXPORT_SYMBOL(sock_common_setsockopt); 3268 3269 void sk_common_release(struct sock *sk) 3270 { 3271 if (sk->sk_prot->destroy) 3272 sk->sk_prot->destroy(sk); 3273 3274 /* 3275 * Observation: when sk_common_release() is called, processes no 3276 * longer have access to the socket, but the network stack still does. 3277 * Step one, detach it from networking: 3278 * 3279 * A. Remove it from the hash tables. 3280 */ 3281 3282 sk->sk_prot->unhash(sk); 3283 3284 /* 3285 * At this point the socket cannot receive new packets, but some may 3286 * still be in flight because another CPU ran the receiver and did its 3287 * hash table lookup before we unhashed the socket. Those packets will 3288 * reach the receive queue and be purged by the socket destructor. 3289 * 3290 * We also still have packets pending on the receive queue, and probably 3291 * our own packets waiting in device queues. sock_destroy will drain the 3292 * receive queue, but transmitted packets will delay socket destruction 3293 * until the last reference is released.
3294 */ 3295 3296 sock_orphan(sk); 3297 3298 xfrm_sk_free_policy(sk); 3299 3300 sk_refcnt_debug_release(sk); 3301 3302 sock_put(sk); 3303 } 3304 EXPORT_SYMBOL(sk_common_release); 3305 3306 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3307 { 3308 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3309 3310 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3311 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3312 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3313 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3314 mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; 3315 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3316 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3317 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3318 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3319 } 3320 3321 #ifdef CONFIG_PROC_FS 3322 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 3323 struct prot_inuse { 3324 int val[PROTO_INUSE_NR]; 3325 }; 3326 3327 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 3328 3329 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 3330 { 3331 __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); 3332 } 3333 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 3334 3335 int sock_prot_inuse_get(struct net *net, struct proto *prot) 3336 { 3337 int cpu, idx = prot->inuse_idx; 3338 int res = 0; 3339 3340 for_each_possible_cpu(cpu) 3341 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 3342 3343 return res >= 0 ? res : 0; 3344 } 3345 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3346 3347 static void sock_inuse_add(struct net *net, int val) 3348 { 3349 this_cpu_add(*net->core.sock_inuse, val); 3350 } 3351 3352 int sock_inuse_get(struct net *net) 3353 { 3354 int cpu, res = 0; 3355 3356 for_each_possible_cpu(cpu) 3357 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3358 3359 return res; 3360 } 3361 3362 EXPORT_SYMBOL_GPL(sock_inuse_get); 3363 3364 static int __net_init sock_inuse_init_net(struct net *net) 3365 { 3366 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3367 if (net->core.prot_inuse == NULL) 3368 return -ENOMEM; 3369 3370 net->core.sock_inuse = alloc_percpu(int); 3371 if (net->core.sock_inuse == NULL) 3372 goto out; 3373 3374 return 0; 3375 3376 out: 3377 free_percpu(net->core.prot_inuse); 3378 return -ENOMEM; 3379 } 3380 3381 static void __net_exit sock_inuse_exit_net(struct net *net) 3382 { 3383 free_percpu(net->core.prot_inuse); 3384 free_percpu(net->core.sock_inuse); 3385 } 3386 3387 static struct pernet_operations net_inuse_ops = { 3388 .init = sock_inuse_init_net, 3389 .exit = sock_inuse_exit_net, 3390 }; 3391 3392 static __init int net_inuse_init(void) 3393 { 3394 if (register_pernet_subsys(&net_inuse_ops)) 3395 panic("Cannot initialize net inuse counters"); 3396 3397 return 0; 3398 } 3399 3400 core_initcall(net_inuse_init); 3401 3402 static int assign_proto_idx(struct proto *prot) 3403 { 3404 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3405 3406 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3407 pr_err("PROTO_INUSE_NR exhausted\n"); 3408 return -ENOSPC; 3409 } 3410 3411 set_bit(prot->inuse_idx, proto_inuse_idx); 3412 return 0; 3413 } 3414 3415 static void release_proto_idx(struct proto *prot) 3416 { 3417 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3418 clear_bit(prot->inuse_idx, proto_inuse_idx); 3419 } 3420 #else 3421 static inline int assign_proto_idx(struct proto *prot) 3422 { 3423 return 0; 3424 } 3425 3426 static inline 
void release_proto_idx(struct proto *prot) 3427 { 3428 } 3429 3430 static void sock_inuse_add(struct net *net, int val) 3431 { 3432 } 3433 #endif 3434 3435 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 3436 { 3437 if (!twsk_prot) 3438 return; 3439 kfree(twsk_prot->twsk_slab_name); 3440 twsk_prot->twsk_slab_name = NULL; 3441 kmem_cache_destroy(twsk_prot->twsk_slab); 3442 twsk_prot->twsk_slab = NULL; 3443 } 3444 3445 static int tw_prot_init(const struct proto *prot) 3446 { 3447 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 3448 3449 if (!twsk_prot) 3450 return 0; 3451 3452 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 3453 prot->name); 3454 if (!twsk_prot->twsk_slab_name) 3455 return -ENOMEM; 3456 3457 twsk_prot->twsk_slab = 3458 kmem_cache_create(twsk_prot->twsk_slab_name, 3459 twsk_prot->twsk_obj_size, 0, 3460 SLAB_ACCOUNT | prot->slab_flags, 3461 NULL); 3462 if (!twsk_prot->twsk_slab) { 3463 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 3464 prot->name); 3465 return -ENOMEM; 3466 } 3467 3468 return 0; 3469 } 3470 3471 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3472 { 3473 if (!rsk_prot) 3474 return; 3475 kfree(rsk_prot->slab_name); 3476 rsk_prot->slab_name = NULL; 3477 kmem_cache_destroy(rsk_prot->slab); 3478 rsk_prot->slab = NULL; 3479 } 3480 3481 static int req_prot_init(const struct proto *prot) 3482 { 3483 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3484 3485 if (!rsk_prot) 3486 return 0; 3487 3488 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3489 prot->name); 3490 if (!rsk_prot->slab_name) 3491 return -ENOMEM; 3492 3493 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3494 rsk_prot->obj_size, 0, 3495 SLAB_ACCOUNT | prot->slab_flags, 3496 NULL); 3497 3498 if (!rsk_prot->slab) { 3499 pr_crit("%s: Can't create request sock SLAB cache!\n", 3500 prot->name); 3501 return -ENOMEM; 3502 } 3503 return 0; 3504 } 3505 3506 int proto_register(struct proto *prot, int alloc_slab) 3507 { 3508 int ret = -ENOBUFS; 3509 3510 if (alloc_slab) { 3511 prot->slab = kmem_cache_create_usercopy(prot->name, 3512 prot->obj_size, 0, 3513 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3514 prot->slab_flags, 3515 prot->useroffset, prot->usersize, 3516 NULL); 3517 3518 if (prot->slab == NULL) { 3519 pr_crit("%s: Can't create sock SLAB cache!\n", 3520 prot->name); 3521 goto out; 3522 } 3523 3524 if (req_prot_init(prot)) 3525 goto out_free_request_sock_slab; 3526 3527 if (tw_prot_init(prot)) 3528 goto out_free_timewait_sock_slab; 3529 } 3530 3531 mutex_lock(&proto_list_mutex); 3532 ret = assign_proto_idx(prot); 3533 if (ret) { 3534 mutex_unlock(&proto_list_mutex); 3535 goto out_free_timewait_sock_slab; 3536 } 3537 list_add(&prot->node, &proto_list); 3538 mutex_unlock(&proto_list_mutex); 3539 return ret; 3540 3541 out_free_timewait_sock_slab: 3542 if (alloc_slab) 3543 tw_prot_cleanup(prot->twsk_prot); 3544 out_free_request_sock_slab: 3545 if (alloc_slab) { 3546 req_prot_cleanup(prot->rsk_prot); 3547 3548 kmem_cache_destroy(prot->slab); 3549 prot->slab = NULL; 3550 } 3551 out: 3552 return ret; 3553 } 3554 EXPORT_SYMBOL(proto_register); 3555 3556 void proto_unregister(struct proto *prot) 3557 { 3558 mutex_lock(&proto_list_mutex); 3559 release_proto_idx(prot); 3560 list_del(&prot->node); 3561 mutex_unlock(&proto_list_mutex); 3562 3563 kmem_cache_destroy(prot->slab); 3564 prot->slab = NULL; 3565 3566 req_prot_cleanup(prot->rsk_prot); 3567 tw_prot_cleanup(prot->twsk_prot); 3568 } 3569 EXPORT_SYMBOL(proto_unregister); 3570 
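/* Editor's note (illustrative sketch, not part of the original source;
 * "foo" is a placeholder): a protocol module pairs proto_register() and
 * proto_unregister() in its init/exit paths, e.g.:
 *
 *	static struct proto foo_proto = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_proto, 1);	// 1: allocate slab
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_proto);
 *	}
 */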
3571 int sock_load_diag_module(int family, int protocol) 3572 { 3573 if (!protocol) { 3574 if (!sock_is_registered(family)) 3575 return -ENOENT; 3576 3577 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3578 NETLINK_SOCK_DIAG, family); 3579 } 3580 3581 #ifdef CONFIG_INET 3582 if (family == AF_INET && 3583 protocol != IPPROTO_RAW && 3584 protocol < MAX_INET_PROTOS && 3585 !rcu_access_pointer(inet_protos[protocol])) 3586 return -ENOENT; 3587 #endif 3588 3589 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3590 NETLINK_SOCK_DIAG, family, protocol); 3591 } 3592 EXPORT_SYMBOL(sock_load_diag_module); 3593 3594 #ifdef CONFIG_PROC_FS 3595 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3596 __acquires(proto_list_mutex) 3597 { 3598 mutex_lock(&proto_list_mutex); 3599 return seq_list_start_head(&proto_list, *pos); 3600 } 3601 3602 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3603 { 3604 return seq_list_next(v, &proto_list, pos); 3605 } 3606 3607 static void proto_seq_stop(struct seq_file *seq, void *v) 3608 __releases(proto_list_mutex) 3609 { 3610 mutex_unlock(&proto_list_mutex); 3611 } 3612 3613 static char proto_method_implemented(const void *method) 3614 { 3615 return method == NULL ? 'n' : 'y'; 3616 } 3617 static long sock_prot_memory_allocated(struct proto *proto) 3618 { 3619 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3620 } 3621 3622 static const char *sock_prot_memory_pressure(struct proto *proto) 3623 { 3624 return proto->memory_pressure != NULL ? 3625 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3626 } 3627 3628 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3629 { 3630 3631 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3632 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3633 proto->name, 3634 proto->obj_size, 3635 sock_prot_inuse_get(seq_file_net(seq), proto), 3636 sock_prot_memory_allocated(proto), 3637 sock_prot_memory_pressure(proto), 3638 proto->max_header, 3639 proto->slab == NULL ? 
"no" : "yes", 3640 module_name(proto->owner), 3641 proto_method_implemented(proto->close), 3642 proto_method_implemented(proto->connect), 3643 proto_method_implemented(proto->disconnect), 3644 proto_method_implemented(proto->accept), 3645 proto_method_implemented(proto->ioctl), 3646 proto_method_implemented(proto->init), 3647 proto_method_implemented(proto->destroy), 3648 proto_method_implemented(proto->shutdown), 3649 proto_method_implemented(proto->setsockopt), 3650 proto_method_implemented(proto->getsockopt), 3651 proto_method_implemented(proto->sendmsg), 3652 proto_method_implemented(proto->recvmsg), 3653 proto_method_implemented(proto->sendpage), 3654 proto_method_implemented(proto->bind), 3655 proto_method_implemented(proto->backlog_rcv), 3656 proto_method_implemented(proto->hash), 3657 proto_method_implemented(proto->unhash), 3658 proto_method_implemented(proto->get_port), 3659 proto_method_implemented(proto->enter_memory_pressure)); 3660 } 3661 3662 static int proto_seq_show(struct seq_file *seq, void *v) 3663 { 3664 if (v == &proto_list) 3665 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3666 "protocol", 3667 "size", 3668 "sockets", 3669 "memory", 3670 "press", 3671 "maxhdr", 3672 "slab", 3673 "module", 3674 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3675 else 3676 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3677 return 0; 3678 } 3679 3680 static const struct seq_operations proto_seq_ops = { 3681 .start = proto_seq_start, 3682 .next = proto_seq_next, 3683 .stop = proto_seq_stop, 3684 .show = proto_seq_show, 3685 }; 3686 3687 static __net_init int proto_init_net(struct net *net) 3688 { 3689 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3690 sizeof(struct seq_net_private))) 3691 return -ENOMEM; 3692 3693 return 0; 3694 } 3695 3696 static __net_exit void proto_exit_net(struct net *net) 3697 { 3698 remove_proc_entry("protocols", net->proc_net); 3699 } 3700 3701 3702 static __net_initdata struct pernet_operations proto_net_ops = { 3703 .init = proto_init_net, 3704 .exit = proto_exit_net, 3705 }; 3706 3707 static int __init proto_init(void) 3708 { 3709 return register_pernet_subsys(&proto_net_ops); 3710 } 3711 3712 subsys_initcall(proto_init); 3713 3714 #endif /* PROC_FS */ 3715 3716 #ifdef CONFIG_NET_RX_BUSY_POLL 3717 bool sk_busy_loop_end(void *p, unsigned long start_time) 3718 { 3719 struct sock *sk = p; 3720 3721 return !skb_queue_empty_lockless(&sk->sk_receive_queue) || 3722 sk_busy_loop_timeout(sk, start_time); 3723 } 3724 EXPORT_SYMBOL(sk_busy_loop_end); 3725 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3726 3727 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 3728 { 3729 if (!sk->sk_prot->bind_add) 3730 return -EOPNOTSUPP; 3731 return sk->sk_prot->bind_add(sk, addr, addr_len); 3732 } 3733 EXPORT_SYMBOL(sock_bind_add); 3734